summaryrefslogtreecommitdiffstats
path: root/src/crimson
diff options
context:
space:
mode:
Diffstat (limited to 'src/crimson')
-rw-r--r--src/crimson/admin/osd_admin.cc1
-rw-r--r--src/crimson/common/fixed_kv_node_layout.h9
-rw-r--r--src/crimson/common/logclient.cc1
-rw-r--r--src/crimson/common/shared_lru.h23
-rw-r--r--src/crimson/common/tmap_helpers.cc2
-rw-r--r--src/crimson/mon/MonClient.cc1
-rw-r--r--src/crimson/net/Socket.cc1
-rw-r--r--src/crimson/net/io_handler.cc4
-rw-r--r--src/crimson/net/io_handler.h7
-rw-r--r--src/crimson/os/alienstore/alien_store.cc57
-rw-r--r--src/crimson/os/alienstore/alien_store.h36
-rw-r--r--src/crimson/os/alienstore/thread_pool.cc3
-rw-r--r--src/crimson/os/cyanstore/cyan_store.cc34
-rw-r--r--src/crimson/os/cyanstore/cyan_store.h29
-rw-r--r--src/crimson/os/futurized_store.h28
-rw-r--r--src/crimson/os/seastore/CMakeLists.txt3
-rw-r--r--src/crimson/os/seastore/async_cleaner.cc47
-rw-r--r--src/crimson/os/seastore/async_cleaner.h15
-rw-r--r--src/crimson/os/seastore/backref/btree_backref_manager.cc14
-rw-r--r--src/crimson/os/seastore/backref/btree_backref_manager.h30
-rw-r--r--src/crimson/os/seastore/backref_entry.h127
-rw-r--r--src/crimson/os/seastore/backref_manager.h1
-rw-r--r--src/crimson/os/seastore/backref_mapping.h27
-rw-r--r--src/crimson/os/seastore/btree/btree_range_pin.cc54
-rw-r--r--src/crimson/os/seastore/btree/btree_range_pin.h29
-rw-r--r--src/crimson/os/seastore/btree/fixed_kv_btree.h10
-rw-r--r--src/crimson/os/seastore/btree/fixed_kv_node.h46
-rw-r--r--src/crimson/os/seastore/cache.cc372
-rw-r--r--src/crimson/os/seastore/cache.h619
-rw-r--r--src/crimson/os/seastore/cached_extent.cc210
-rw-r--r--src/crimson/os/seastore/cached_extent.h368
-rw-r--r--src/crimson/os/seastore/collection_manager/collection_flat_node.h2
-rw-r--r--src/crimson/os/seastore/collection_manager/flat_collection_manager.cc7
-rw-r--r--src/crimson/os/seastore/device.cc2
-rw-r--r--src/crimson/os/seastore/extent_placement_manager.cc4
-rw-r--r--src/crimson/os/seastore/extent_placement_manager.h24
-rw-r--r--src/crimson/os/seastore/extentmap_manager.cc33
-rw-r--r--src/crimson/os/seastore/journal.h15
-rw-r--r--src/crimson/os/seastore/journal/circular_bounded_journal.cc53
-rw-r--r--src/crimson/os/seastore/journal/circular_bounded_journal.h13
-rw-r--r--src/crimson/os/seastore/journal/record_submitter.cc4
-rw-r--r--src/crimson/os/seastore/journal/segment_allocator.cc4
-rw-r--r--src/crimson/os/seastore/journal/segmented_journal.cc37
-rw-r--r--src/crimson/os/seastore/journal/segmented_journal.h15
-rw-r--r--src/crimson/os/seastore/lba_manager.h1
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc53
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h16
-rw-r--r--src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h2
-rw-r--r--src/crimson/os/seastore/lba_mapping.cc44
-rw-r--r--src/crimson/os/seastore/lba_mapping.h73
-rw-r--r--src/crimson/os/seastore/object_data_handler.cc357
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h3
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc25
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h40
-rw-r--r--src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h18
-rw-r--r--src/crimson/os/seastore/onode.cc2
-rw-r--r--src/crimson/os/seastore/onode.h4
-rw-r--r--src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h10
-rw-r--r--src/crimson/os/seastore/random_block_manager/block_rb_manager.cc6
-rw-r--r--src/crimson/os/seastore/record_scanner.cc16
-rw-r--r--src/crimson/os/seastore/root_block.h2
-rw-r--r--src/crimson/os/seastore/root_meta.h76
-rw-r--r--src/crimson/os/seastore/seastore.cc60
-rw-r--r--src/crimson/os/seastore/seastore.h40
-rw-r--r--src/crimson/os/seastore/seastore_types.cc36
-rw-r--r--src/crimson/os/seastore/seastore_types.h146
-rw-r--r--src/crimson/os/seastore/segment_manager.cc12
-rw-r--r--src/crimson/os/seastore/segment_manager/block.cc58
-rw-r--r--src/crimson/os/seastore/segment_manager/ephemeral.cc15
-rw-r--r--src/crimson/os/seastore/segment_manager/zbd.cc43
-rw-r--r--src/crimson/os/seastore/segment_manager_group.cc10
-rw-r--r--src/crimson/os/seastore/transaction.h91
-rw-r--r--src/crimson/os/seastore/transaction_interruptor.cc15
-rw-r--r--src/crimson/os/seastore/transaction_interruptor.h77
-rw-r--r--src/crimson/os/seastore/transaction_manager.cc215
-rw-r--r--src/crimson/os/seastore/transaction_manager.h306
-rw-r--r--src/crimson/osd/backfill_facades.h11
-rw-r--r--src/crimson/osd/backfill_state.cc292
-rw-r--r--src/crimson/osd/backfill_state.h95
-rw-r--r--src/crimson/osd/ec_backend.cc1
-rw-r--r--src/crimson/osd/ec_backend.h1
-rw-r--r--src/crimson/osd/heartbeat.cc1
-rw-r--r--src/crimson/osd/main.cc3
-rw-r--r--src/crimson/osd/main_config_bootstrap_helpers.cc3
-rw-r--r--src/crimson/osd/object_context.h187
-rw-r--r--src/crimson/osd/object_context_loader.cc322
-rw-r--r--src/crimson/osd/object_context_loader.h255
-rw-r--r--src/crimson/osd/ops_executer.cc262
-rw-r--r--src/crimson/osd/ops_executer.h117
-rw-r--r--src/crimson/osd/osd.cc34
-rw-r--r--src/crimson/osd/osd.h1
-rw-r--r--src/crimson/osd/osd_operation.h55
-rw-r--r--src/crimson/osd/osd_operation_external_tracking.h185
-rw-r--r--src/crimson/osd/osd_operations/client_request.cc223
-rw-r--r--src/crimson/osd/osd_operations/client_request.h39
-rw-r--r--src/crimson/osd/osd_operations/internal_client_request.cc97
-rw-r--r--src/crimson/osd/osd_operations/internal_client_request.h12
-rw-r--r--src/crimson/osd/osd_operations/logmissing_request.cc6
-rw-r--r--src/crimson/osd/osd_operations/logmissing_request.h7
-rw-r--r--src/crimson/osd/osd_operations/logmissing_request_reply.cc5
-rw-r--r--src/crimson/osd/osd_operations/logmissing_request_reply.h5
-rw-r--r--src/crimson/osd/osd_operations/osdop_params.h2
-rw-r--r--src/crimson/osd/osd_operations/peering_event.cc3
-rw-r--r--src/crimson/osd/osd_operations/peering_event.h8
-rw-r--r--src/crimson/osd/osd_operations/pg_advance_map.h4
-rw-r--r--src/crimson/osd/osd_operations/recovery_subrequest.h3
-rw-r--r--src/crimson/osd/osd_operations/replicated_request.cc59
-rw-r--r--src/crimson/osd/osd_operations/replicated_request.h12
-rw-r--r--src/crimson/osd/osd_operations/scrub_events.h12
-rw-r--r--src/crimson/osd/osd_operations/snaptrim_event.cc76
-rw-r--r--src/crimson/osd/osd_operations/snaptrim_event.h11
-rw-r--r--src/crimson/osd/pg.cc323
-rw-r--r--src/crimson/osd/pg.h62
-rw-r--r--src/crimson/osd/pg_backend.cc60
-rw-r--r--src/crimson/osd/pg_backend.h29
-rw-r--r--src/crimson/osd/pg_recovery.cc19
-rw-r--r--src/crimson/osd/pg_recovery.h10
-rw-r--r--src/crimson/osd/pg_shard_manager.h46
-rw-r--r--src/crimson/osd/replicated_backend.cc134
-rw-r--r--src/crimson/osd/replicated_backend.h12
-rw-r--r--src/crimson/osd/replicated_recovery_backend.cc41
-rw-r--r--src/crimson/osd/shard_services.cc27
-rw-r--r--src/crimson/osd/shard_services.h4
-rw-r--r--src/crimson/tools/perf_crimson_msgr.cc1
-rw-r--r--src/crimson/tools/store_nbd/tm_driver.cc11
125 files changed, 4652 insertions, 2829 deletions
diff --git a/src/crimson/admin/osd_admin.cc b/src/crimson/admin/osd_admin.cc
index de9626a2f2d..41da72c9fde 100644
--- a/src/crimson/admin/osd_admin.cc
+++ b/src/crimson/admin/osd_admin.cc
@@ -14,6 +14,7 @@
#include "common/config.h"
#include "crimson/admin/admin_socket.h"
#include "crimson/common/log.h"
+#include "crimson/common/perf_counters_collection.h"
#include "crimson/osd/exceptions.h"
#include "crimson/osd/osd.h"
#include "crimson/osd/pg.h"
diff --git a/src/crimson/common/fixed_kv_node_layout.h b/src/crimson/common/fixed_kv_node_layout.h
index 2a91ac39540..db62a2df32d 100644
--- a/src/crimson/common/fixed_kv_node_layout.h
+++ b/src/crimson/common/fixed_kv_node_layout.h
@@ -360,11 +360,16 @@ public:
}
- FixedKVNodeLayout(char *buf) :
- buf(buf) {}
+ FixedKVNodeLayout() : buf(nullptr) {}
virtual ~FixedKVNodeLayout() = default;
+ void set_layout_buf(char *_buf) {
+ assert(buf == nullptr);
+ assert(_buf != nullptr);
+ buf = _buf;
+ }
+
const_iterator begin() const {
return const_iterator(
this,
diff --git a/src/crimson/common/logclient.cc b/src/crimson/common/logclient.cc
index d402ecd1901..a3c30227bc7 100644
--- a/src/crimson/common/logclient.cc
+++ b/src/crimson/common/logclient.cc
@@ -7,6 +7,7 @@
#include "crimson/net/Messenger.h"
#include "crimson/mon/MonClient.h"
#include "mon/MonMap.h"
+#include "common/Clock.h" // for ceph_clock_now()
#include "common/Graylog.h"
using std::map;
diff --git a/src/crimson/common/shared_lru.h b/src/crimson/common/shared_lru.h
index 92d99d332c4..0d73658e709 100644
--- a/src/crimson/common/shared_lru.h
+++ b/src/crimson/common/shared_lru.h
@@ -25,12 +25,17 @@ class SharedLRU {
SimpleLRU<K, shared_ptr_t, false> cache;
std::map<K, std::pair<weak_ptr_t, V*>> weak_refs;
+ // Once all of the shared pointers are destroyed,
+ // erase the tracked object from the weak_ref map
+ // before actually destroying it
struct Deleter {
- SharedLRU<K,V>* cache;
+ SharedLRU<K,V>* shared_lru_ptr;
const K key;
- void operator()(V* ptr) {
- cache->_erase_weak(key);
- delete ptr;
+ void operator()(V* value_ptr) {
+ if (shared_lru_ptr) {
+ shared_lru_ptr->_erase_weak(key);
+ }
+ delete value_ptr;
}
};
void _erase_weak(const K& key) {
@@ -42,9 +47,19 @@ public:
{}
~SharedLRU() {
cache.clear();
+
// initially, we were assuming that no pointer obtained from SharedLRU
// can outlive the lru itself. However, since going with the interruption
// concept for handling shutdowns, this is no longer valid.
+ // Moreover, before clearing weak_refs, invalidate each deleter's
+ // shared_lru_ptr, as this SharedLRU is being destroyed.
+ for (const auto& [key, value] : weak_refs) {
+ shared_ptr_t val;
+ val = value.first.lock();
+ auto this_deleter = get_deleter<Deleter>(val);
+ this_deleter->shared_lru_ptr = nullptr;
+ }
+
weak_refs.clear();
}
/**
diff --git a/src/crimson/common/tmap_helpers.cc b/src/crimson/common/tmap_helpers.cc
index 9c14ebc450e..58c4fc7e218 100644
--- a/src/crimson/common/tmap_helpers.cc
+++ b/src/crimson/common/tmap_helpers.cc
@@ -7,6 +7,8 @@
#include "include/encoding.h"
#include "include/rados.h"
+#include <map>
+
namespace detail {
#define decode_or_return(v, bp) \
diff --git a/src/crimson/mon/MonClient.cc b/src/crimson/mon/MonClient.cc
index 4919f0bf21f..4c076cf43c6 100644
--- a/src/crimson/mon/MonClient.cc
+++ b/src/crimson/mon/MonClient.cc
@@ -13,6 +13,7 @@
#include "auth/AuthClientHandler.h"
#include "auth/RotatingKeyRing.h"
+#include "common/Clock.h" // for ceph_clock_now()
#include "common/hostname.h"
#include "include/utime_fmt.h"
diff --git a/src/crimson/net/Socket.cc b/src/crimson/net/Socket.cc
index 2c729f4e8c2..3a7aeaf9651 100644
--- a/src/crimson/net/Socket.cc
+++ b/src/crimson/net/Socket.cc
@@ -8,6 +8,7 @@
#include <seastar/net/packet.hh>
#include "crimson/common/log.h"
+#include "include/random.h" // for ceph::util::generate_random_number()
#include "Errors.h"
using crimson::common::local_conf;
diff --git a/src/crimson/net/io_handler.cc b/src/crimson/net/io_handler.cc
index b93124f3c12..bc5e9bf404c 100644
--- a/src/crimson/net/io_handler.cc
+++ b/src/crimson/net/io_handler.cc
@@ -347,7 +347,7 @@ void IOHandler::do_set_io_state(
{
ceph_assert_always(seastar::this_shard_id() == get_shard_id());
auto prv_state = get_io_state();
- logger().debug("{} got {}do_set_io_state(): prv_state={}, new_state={}, "
+ logger().debug("{} got {} do_set_io_state(): prv_state={}, new_state={}, "
"fa={}, set_notify_out={}, at {}",
conn,
cc_seq.has_value() ? fmt::format("{} ", *cc_seq) : "",
@@ -984,7 +984,7 @@ void IOHandler::notify_out_dispatch()
});
});
}
- if (shard_states->try_enter_out_dispatching()) {
+ if (shard_states->try_enter_out_dispatching(conn)) {
shard_states->dispatch_in_background(
"do_out_dispatch", conn, [this] {
return do_out_dispatch(*shard_states);
diff --git a/src/crimson/net/io_handler.h b/src/crimson/net/io_handler.h
index 5986fcb16ac..41c76ab925b 100644
--- a/src/crimson/net/io_handler.h
+++ b/src/crimson/net/io_handler.h
@@ -309,7 +309,7 @@ public:
in_exit_dispatching = std::nullopt;
}
- bool try_enter_out_dispatching() {
+ bool try_enter_out_dispatching(SocketConnection &conn) {
assert(seastar::this_shard_id() == sid);
if (out_dispatching) {
// already dispatching out
@@ -327,6 +327,9 @@ public:
// do not dispatch out
return false;
default:
+ crimson::get_logger(ceph_subsys_ms).error(
+ "{} try_enter_out_dispatching() got wrong io_state {}",
+ conn, io_state);
ceph_abort("impossible");
}
}
@@ -574,6 +577,8 @@ struct fmt::formatter<crimson::net::IOHandler::io_state_t>
case switched:
name = "switched";
break;
+ default:
+ name = "undefined";
}
return formatter<string_view>::format(name, ctx);
}
diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc
index 3fd2bb1fd15..db6decd84f9 100644
--- a/src/crimson/os/alienstore/alien_store.cc
+++ b/src/crimson/os/alienstore/alien_store.cc
@@ -141,7 +141,8 @@ seastar::future<> AlienStore::stop()
AlienStore::base_errorator::future<bool>
AlienStore::exists(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
return op_gates.simple_dispatch("exists", [=, this] {
return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this] {
@@ -212,7 +213,8 @@ seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
AlienStore::list_objects(CollectionRef ch,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const
+ uint64_t limit,
+ uint32_t op_flags) const
{
logger().debug("{}", __func__);
assert(tp);
@@ -348,7 +350,8 @@ AlienStore::readv(CollectionRef ch,
AlienStore::get_attr_errorator::future<ceph::bufferlist>
AlienStore::get_attr(CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
logger().debug("{}", __func__);
assert(tp);
@@ -376,7 +379,8 @@ AlienStore::get_attr(CollectionRef ch,
AlienStore::get_attrs_ertr::future<AlienStore::attrs_t>
AlienStore::get_attrs(CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
logger().debug("{}", __func__);
assert(tp);
@@ -397,7 +401,8 @@ AlienStore::get_attrs(CollectionRef ch,
auto AlienStore::omap_get_values(CollectionRef ch,
const ghobject_t& oid,
- const set<string>& keys)
+ const set<string>& keys,
+ uint32_t op_flags)
-> read_errorator::future<omap_values_t>
{
logger().debug("{}", __func__);
@@ -421,7 +426,8 @@ auto AlienStore::omap_get_values(CollectionRef ch,
auto AlienStore::omap_get_values(CollectionRef ch,
const ghobject_t &oid,
- const std::optional<string> &start)
+ const std::optional<string> &start,
+ uint32_t op_flags)
-> read_errorator::future<std::tuple<bool, omap_values_t>>
{
logger().debug("{} with_start", __func__);
@@ -429,8 +435,21 @@ auto AlienStore::omap_get_values(CollectionRef ch,
return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) {
return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] {
auto c = static_cast<AlienCollection*>(ch.get());
- return store->omap_get_values(c->collection, oid, start,
- reinterpret_cast<map<string, bufferlist>*>(&values));
+ return store->omap_iterate(
+ c->collection, oid,
+ ObjectStore::omap_iter_seek_t{
+ .seek_position = start.value_or(std::string{}),
+ // FIXME: classical OSDs begin iteration from LOWER_BOUND
+ // (or UPPER_BOUND if filter_prefix > start). However, these
+ // bits are not implemented yet
+ .seek_type = ObjectStore::omap_iter_seek_t::UPPER_BOUND
+ },
+ [&values]
+ (std::string_view key, std::string_view value) mutable {
+ values[std::string{key}].append(value);
+ // FIXME: there is no limit on the number of entries yet
+ return ObjectStore::omap_iter_ret_t::NEXT;
+ });
}).then([&values] (int r)
-> read_errorator::future<std::tuple<bool, omap_values_t>> {
if (r == -ENOENT) {
@@ -578,7 +597,8 @@ unsigned AlienStore::get_max_attr_name_length() const
seastar::future<struct stat> AlienStore::stat(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
assert(tp);
return do_with_op_gate((struct stat){}, [this, ch, oid](auto& st) {
@@ -590,8 +610,22 @@ seastar::future<struct stat> AlienStore::stat(
});
}
+seastar::future<std::string> AlienStore::get_default_device_class()
+{
+ logger().debug("{}", __func__);
+ assert(tp);
+ return op_gates.simple_dispatch("get_default_device_class", [=, this] {
+ return tp->submit([=, this] {
+ return store->get_default_device_class();
+ }).then([] (std::string device_class) {
+ return seastar::make_ready_future<std::string>(device_class);
+ });
+ });
+}
+
auto AlienStore::omap_get_header(CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
-> get_attr_errorator::future<ceph::bufferlist>
{
assert(tp);
@@ -617,7 +651,8 @@ AlienStore::read_errorator::future<std::map<uint64_t, uint64_t>> AlienStore::fie
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
assert(tp);
return do_with_op_gate(std::map<uint64_t, uint64_t>(), [=, this](auto& destmap) {
diff --git a/src/crimson/os/alienstore/alien_store.h b/src/crimson/os/alienstore/alien_store.h
index d36f449afd8..1d39411450e 100644
--- a/src/crimson/os/alienstore/alien_store.h
+++ b/src/crimson/os/alienstore/alien_store.h
@@ -36,7 +36,8 @@ public:
base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
mkfs_ertr::future<> mkfs(uuid_d new_osd_fsid) final;
read_errorator::future<ceph::bufferlist> read(CollectionRef c,
const ghobject_t& oid,
@@ -49,29 +50,36 @@ public:
uint32_t op_flags = 0) final;
- get_attr_errorator::future<ceph::bufferlist> get_attr(CollectionRef c,
- const ghobject_t& oid,
- std::string_view name) const final;
- get_attrs_ertr::future<attrs_t> get_attrs(CollectionRef c,
- const ghobject_t& oid) final;
+ get_attr_errorator::future<ceph::bufferlist> get_attr(
+ CollectionRef c,
+ const ghobject_t& oid,
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
+ get_attrs_ertr::future<attrs_t> get_attrs(
+ CollectionRef c,
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
/// Retrieves paged set of values > start (if present)
read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final; ///< @return <done, values> values.empty() iff done
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
@@ -97,15 +105,19 @@ public:
unsigned get_max_attr_name_length() const final;
seastar::future<struct stat> stat(
CollectionRef,
- const ghobject_t&) final;
+ const ghobject_t&,
+ uint32_t op_flags = 0) final;
+ seastar::future<std::string> get_default_device_class() final;
get_attr_errorator::future<ceph::bufferlist> omap_get_header(
CollectionRef,
- const ghobject_t&) final;
+ const ghobject_t&,
+ uint32_t) final;
read_errorator::future<std::map<uint64_t, uint64_t>> fiemap(
CollectionRef,
const ghobject_t&,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags) final;
FuturizedStore::Shard& get_sharded_store() final {
return *this;
diff --git a/src/crimson/os/alienstore/thread_pool.cc b/src/crimson/os/alienstore/thread_pool.cc
index 5cf9590e61e..2d208548b32 100644
--- a/src/crimson/os/alienstore/thread_pool.cc
+++ b/src/crimson/os/alienstore/thread_pool.cc
@@ -7,6 +7,7 @@
#include <pthread.h>
#include "include/ceph_assert.h"
+#include "include/intarith.h" // for round_up_to()
#include "crimson/common/config_proxy.h"
using crimson::common::local_conf;
@@ -27,7 +28,7 @@ ThreadPool::ThreadPool(size_t n_threads,
pin(*cpus);
}
block_sighup();
- (void) pthread_setname_np(pthread_self(), "alien-store-tp");
+ (void) ceph_pthread_setname("alien-store-tp");
loop(queue_max_wait, i);
});
}
diff --git a/src/crimson/os/cyanstore/cyan_store.cc b/src/crimson/os/cyanstore/cyan_store.cc
index 7b945e5aa15..41819fb5eb6 100644
--- a/src/crimson/os/cyanstore/cyan_store.cc
+++ b/src/crimson/os/cyanstore/cyan_store.cc
@@ -12,6 +12,7 @@
#include "crimson/common/buffer_io.h"
#include "crimson/common/config_proxy.h"
+#include "crimson/common/perf_counters_collection.h"
#include "cyan_collection.h"
#include "cyan_object.h"
@@ -143,6 +144,12 @@ CyanStore::list_collections()
});
}
+seastar::future<std::string>
+CyanStore::get_default_device_class()
+{
+ return seastar::make_ready_future<std::string>("");
+}
+
CyanStore::mount_ertr::future<> CyanStore::Shard::mount()
{
static const char read_file_errmsg[]{"read_file"};
@@ -201,7 +208,8 @@ CyanStore::Shard::list_objects(
CollectionRef ch,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const
+ uint64_t limit,
+ uint32_t op_flags) const
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {} {} {}",
@@ -250,7 +258,8 @@ CyanStore::Shard::list_collections()
CyanStore::Shard::base_errorator::future<bool>
CyanStore::Shard::exists(
CollectionRef ch,
- const ghobject_t &oid)
+ const ghobject_t &oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
if (!c->exists) {
@@ -326,7 +335,8 @@ CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist>
CyanStore::Shard::get_attr(
CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {}",
@@ -345,7 +355,8 @@ CyanStore::Shard::get_attr(
CyanStore::Shard::get_attrs_ertr::future<CyanStore::Shard::attrs_t>
CyanStore::Shard::get_attrs(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
logger().debug("{} {} {}",
@@ -360,7 +371,8 @@ CyanStore::Shard::get_attrs(
auto CyanStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t& oid,
- const omap_keys_t& keys)
+ const omap_keys_t& keys,
+ uint32_t op_flags)
-> read_errorator::future<omap_values_t>
{
auto c = static_cast<Collection*>(ch.get());
@@ -381,7 +393,8 @@ auto CyanStore::Shard::omap_get_values(
auto CyanStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const std::optional<string> &start)
+ const std::optional<string> &start,
+ uint32_t op_flags)
-> CyanStore::Shard::read_errorator::future<std::tuple<bool, omap_values_t>>
{
auto c = static_cast<Collection*>(ch.get());
@@ -402,7 +415,8 @@ auto CyanStore::Shard::omap_get_values(
auto CyanStore::Shard::omap_get_header(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
-> CyanStore::Shard::get_attr_errorator::future<ceph::bufferlist>
{
auto c = static_cast<Collection*>(ch.get());
@@ -970,7 +984,8 @@ CyanStore::Shard::fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
@@ -985,7 +1000,8 @@ CyanStore::Shard::fiemap(
seastar::future<struct stat>
CyanStore::Shard::stat(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
auto c = static_cast<Collection*>(ch.get());
auto o = c->get_object(oid);
diff --git a/src/crimson/os/cyanstore/cyan_store.h b/src/crimson/os/cyanstore/cyan_store.h
index 99583d07d36..1d481ef5829 100644
--- a/src/crimson/os/cyanstore/cyan_store.h
+++ b/src/crimson/os/cyanstore/cyan_store.h
@@ -34,11 +34,13 @@ public:
seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
base_errorator::future<bool> exists(
CollectionRef ch,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<ceph::bufferlist> read(
CollectionRef c,
@@ -56,33 +58,39 @@ public:
get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const final;
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
read_errorator::future<std::tuple<bool, omap_values_t>> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final;
get_attr_errorator::future<ceph::bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
@@ -101,7 +109,8 @@ public:
CollectionRef c,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags) final;
unsigned get_max_attr_name_length() const final;
@@ -221,6 +230,8 @@ public:
seastar::future<std::vector<coll_core_t>> list_collections() final;
+ seastar::future<std::string> get_default_device_class() final;
+
private:
seastar::sharded<CyanStore::Shard> shard_stores;
const std::string path;
diff --git a/src/crimson/os/futurized_store.h b/src/crimson/os/futurized_store.h
index 0dca695ba3a..e7d4c8546de 100644
--- a/src/crimson/os/futurized_store.h
+++ b/src/crimson/os/futurized_store.h
@@ -54,7 +54,8 @@ public:
virtual base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
using get_attr_errorator = crimson::errorator<
crimson::ct_error::enoent,
@@ -62,42 +63,49 @@ public:
virtual get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const = 0;
+ std::string_view name,
+ uint32_t op_flags = 0) const = 0;
using get_attrs_ertr = crimson::errorator<
crimson::ct_error::enoent>;
using attrs_t = std::map<std::string, ceph::bufferlist, std::less<>>;
virtual get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
virtual seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
using omap_values_t = attrs_t;
using omap_keys_t = std::set<std::string>;
virtual read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) = 0;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) = 0;
using omap_values_paged_t = std::tuple<bool, omap_values_t>;
virtual read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) = 0; ///< @return <done, values> values.empty() only if done
virtual get_attr_errorator::future<bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) = 0;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) = 0;
virtual seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>> list_objects(
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const = 0;
+ uint64_t limit,
+ uint32_t op_flags = 0) const = 0;
virtual seastar::future<CollectionRef> create_new_collection(const coll_t& cid) = 0;
@@ -153,7 +161,8 @@ public:
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) = 0;
+ uint64_t len,
+ uint32_t op_flags = 0) = 0;
virtual unsigned get_max_attr_name_length() const = 0;
};
@@ -203,6 +212,7 @@ public:
using coll_core_t = std::pair<coll_t, core_id_t>;
virtual seastar::future<std::vector<coll_core_t>> list_collections() = 0;
+ virtual seastar::future<std::string> get_default_device_class() = 0;
protected:
const core_id_t primary_core;
};
diff --git a/src/crimson/os/seastore/CMakeLists.txt b/src/crimson/os/seastore/CMakeLists.txt
index 4bdbab8c4e5..3da5e65ceec 100644
--- a/src/crimson/os/seastore/CMakeLists.txt
+++ b/src/crimson/os/seastore/CMakeLists.txt
@@ -1,9 +1,11 @@
set(crimson_seastore_srcs
cached_extent.cc
+ lba_mapping.cc
seastore_types.cc
segment_manager.cc
segment_manager/ephemeral.cc
segment_manager/block.cc
+ transaction_interruptor.cc
transaction_manager.cc
transaction.cc
cache.cc
@@ -18,7 +20,6 @@ set(crimson_seastore_srcs
omap_manager.cc
omap_manager/btree/btree_omap_manager.cc
omap_manager/btree/omap_btree_node_impl.cc
- btree/btree_range_pin.cc
btree/fixed_kv_node.cc
onode.cc
onode_manager/staged-fltree/node.cc
diff --git a/src/crimson/os/seastore/async_cleaner.cc b/src/crimson/os/seastore/async_cleaner.cc
index 5046980eae5..64e6749562e 100644
--- a/src/crimson/os/seastore/async_cleaner.cc
+++ b/src/crimson/os/seastore/async_cleaner.cc
@@ -131,7 +131,7 @@ void segments_info_t::add_segment_manager(
auto ssize = segment_manager.get_segment_size();
auto nsegments = segment_manager.get_num_segments();
auto sm_size = segment_manager.get_available_size();
- INFO("adding segment manager {}, size={}, ssize={}, segments={}",
+ INFO("adding segment manager {}, size=0x{:x}, segment size=0x{:x}, segments={}",
device_id_printer_t{d_id}, sm_size, ssize, nsegments);
ceph_assert(ssize > 0);
ceph_assert(nsegments > 0);
@@ -329,9 +329,9 @@ std::ostream &operator<<(std::ostream &os, const segments_info_t &infos)
<< ", closed=" << infos.get_num_closed()
<< ", type_journal=" << infos.get_num_type_journal()
<< ", type_ool=" << infos.get_num_type_ool()
- << ", total=" << infos.get_total_bytes() << "B"
- << ", available=" << infos.get_available_bytes() << "B"
- << ", unavailable=" << infos.get_unavailable_bytes() << "B"
+ << ", total=0x" << std::hex << infos.get_total_bytes() << "B"
+ << ", available=0x" << infos.get_available_bytes() << "B"
+ << ", unavailable=0x" << infos.get_unavailable_bytes() << "B" << std::dec
<< ", available_ratio=" << infos.get_available_ratio()
<< ", submitted_head=" << infos.get_submitted_journal_head()
<< ", time_bound=" << sea_time_point_printer_t{infos.get_time_bound()}
@@ -609,6 +609,7 @@ JournalTrimmerImpl::trim_alloc()
return extent_callback->with_transaction_intr(
Transaction::src_t::TRIM_ALLOC,
"trim_alloc",
+ CACHE_HINT_NOCACHE,
[this, FNAME](auto &t)
{
auto target = get_alloc_tail_target();
@@ -653,6 +654,7 @@ JournalTrimmerImpl::trim_dirty()
return extent_callback->with_transaction_intr(
Transaction::src_t::TRIM_DIRTY,
"trim_dirty",
+ CACHE_HINT_NOCACHE,
[this, FNAME](auto &t)
{
auto target = get_dirty_tail_target();
@@ -765,10 +767,10 @@ int64_t SpaceTrackerDetailed::SegmentMap::allocate(
for (auto i = b; i < e; ++i) {
if (bitmap[i]) {
if (!error) {
- ERROR("found allocated in {}, {} ~ {}", segment, offset, len);
+ ERROR("found allocated in {}, 0x{:x}~0x{:x}", segment, offset, len);
error = true;
}
- DEBUG("block {} allocated", i * block_size);
+ DEBUG("block 0x{:x}B allocated", i * block_size);
}
bitmap[i] = true;
}
@@ -792,10 +794,10 @@ int64_t SpaceTrackerDetailed::SegmentMap::release(
for (auto i = b; i < e; ++i) {
if (!bitmap[i]) {
if (!error) {
- ERROR("found unallocated in {}, {} ~ {}", segment, offset, len);
+ ERROR("found unallocated in {}, 0x{:x}~0x{:x}", segment, offset, len);
error = true;
}
- DEBUG("block {} unallocated", i * block_size);
+ DEBUG("block 0x{:x}B unallocated", i * block_size);
}
bitmap[i] = false;
}
@@ -831,7 +833,7 @@ void SpaceTrackerDetailed::SegmentMap::dump_usage(extent_len_t block_size) const
INFO("dump start");
for (unsigned i = 0; i < bitmap.size(); ++i) {
if (bitmap[i]) {
- LOCAL_LOGGER.info(" {} still live", i * block_size);
+ LOCAL_LOGGER.info(" 0x{:x}B still live", i * block_size);
}
}
}
@@ -847,7 +849,7 @@ void SpaceTrackerDetailed::dump_usage(segment_id_t id) const
void SpaceTrackerSimple::dump_usage(segment_id_t id) const
{
LOG_PREFIX(SpaceTrackerSimple::dump_usage);
- INFO("id: {}, live_bytes: {}",
+ INFO("id: {}, live_bytes: 0x{:x}",
id, live_bytes_by_segment[id].live_bytes);
}
@@ -1125,6 +1127,7 @@ SegmentCleaner::do_reclaim_space(
return extent_callback->with_transaction_intr(
src,
"clean_reclaim_space",
+ CACHE_HINT_NOCACHE,
[this, &backref_extents, &pin_list, &reclaimed](auto &t)
{
return seastar::do_with(
@@ -1142,8 +1145,7 @@ SegmentCleaner::do_reclaim_space(
pin->get_key(),
pin->get_val(),
pin->get_length(),
- pin->get_type(),
- JOURNAL_SEQ_NULL);
+ pin->get_type());
}
for (auto &cached_backref : cached_backref_entries) {
if (cached_backref.laddr == L_ADDR_NULL) {
@@ -1165,7 +1167,7 @@ SegmentCleaner::do_reclaim_space(
[this, &extents, &t](auto &ent)
{
LOG_PREFIX(SegmentCleaner::do_reclaim_space);
- TRACET("getting extent of type {} at {}~{}",
+ TRACET("getting extent of type {} at {}~0x{:x}",
t,
ent.type,
ent.paddr,
@@ -1241,6 +1243,7 @@ SegmentCleaner::clean_space_ret SegmentCleaner::clean_space()
return extent_callback->with_transaction_intr(
Transaction::src_t::READ,
"retrieve_from_backref_tree",
+ CACHE_HINT_NOCACHE,
[this, &weak_read_ret](auto &t) {
return backref_manager.get_mappings(
t,
@@ -1507,6 +1510,7 @@ bool SegmentCleaner::check_usage()
SpaceTrackerIRef tracker(space_tracker->make_empty());
extent_callback->with_transaction_weak(
"check_usage",
+ CACHE_HINT_NOCACHE,
[this, &tracker](auto &t) {
return backref_manager.scan_mapped_space(
t,
@@ -1568,7 +1572,7 @@ void SegmentCleaner::mark_space_used(
background_callback->maybe_wake_background();
assert(ret > 0);
- DEBUG("segment {} new len: {}~{}, live_bytes: {}",
+ DEBUG("segment {} new len: {}~0x{:x}, live_bytes: 0x{:x}",
seg_addr.get_segment_id(),
addr,
len,
@@ -1591,7 +1595,7 @@ void SegmentCleaner::mark_space_free(
stats.used_bytes -= len;
auto& seg_addr = addr.as_seg_paddr();
- DEBUG("segment {} free len: {}~{}",
+ DEBUG("segment {} free len: {}~0x{:x}",
seg_addr.get_segment_id(), addr, len);
auto old_usage = calc_utilization(seg_addr.get_segment_id());
[[maybe_unused]] auto ret = space_tracker->release(
@@ -1602,7 +1606,7 @@ void SegmentCleaner::mark_space_free(
adjust_segment_util(old_usage, new_usage);
background_callback->maybe_wake_blocked_io();
assert(ret >= 0);
- DEBUG("segment {} free len: {}~{}, live_bytes: {}",
+ DEBUG("segment {} free len: {}~0x{:x}, live_bytes: 0x{:x}",
seg_addr.get_segment_id(),
addr,
len,
@@ -1687,11 +1691,11 @@ void SegmentCleaner::print(std::ostream &os, bool is_detailed) const
<< ", reclaim_ratio=" << get_reclaim_ratio()
<< ", alive_ratio=" << get_alive_ratio();
if (is_detailed) {
- os << ", unavailable_unreclaimable="
+ os << ", unavailable_unreclaimable=0x" << std::hex
<< get_unavailable_unreclaimable_bytes() << "B"
- << ", unavailable_reclaimble="
+ << ", unavailable_reclaimble=0x"
<< get_unavailable_reclaimable_bytes() << "B"
- << ", alive=" << stats.used_bytes << "B"
+ << ", alive=0x" << stats.used_bytes << "B" << std::dec
<< ", " << segments;
}
os << ")";
@@ -1722,7 +1726,7 @@ void RBMCleaner::mark_space_used(
for (auto rbm : rbms) {
if (addr.get_device_id() == rbm->get_device_id()) {
if (rbm->get_start() <= addr) {
- DEBUG("allocate addr: {} len: {}", addr, len);
+ DEBUG("allocate addr: {} len: 0x{:x}", addr, len);
stats.used_bytes += len;
rbm->mark_space_used(addr, len);
}
@@ -1741,7 +1745,7 @@ void RBMCleaner::mark_space_free(
for (auto rbm : rbms) {
if (addr.get_device_id() == rbm->get_device_id()) {
if (rbm->get_start() <= addr) {
- DEBUG("free addr: {} len: {}", addr, len);
+ DEBUG("free addr: {} len: 0x{:x}", addr, len);
ceph_assert(stats.used_bytes >= len);
stats.used_bytes -= len;
rbm->mark_space_free(addr, len);
@@ -1813,6 +1817,7 @@ bool RBMCleaner::check_usage()
RBMSpaceTracker tracker(rbms);
extent_callback->with_transaction_weak(
"check_usage",
+ CACHE_HINT_NOCACHE,
[this, &tracker, &rbms](auto &t) {
return backref_manager.scan_mapped_space(
t,
diff --git a/src/crimson/os/seastore/async_cleaner.h b/src/crimson/os/seastore/async_cleaner.h
index 424247c5bdc..1cef771aeb8 100644
--- a/src/crimson/os/seastore/async_cleaner.h
+++ b/src/crimson/os/seastore/async_cleaner.h
@@ -17,6 +17,7 @@
#include "crimson/os/seastore/randomblock_manager_group.h"
#include "crimson/os/seastore/transaction.h"
#include "crimson/os/seastore/segment_seq_allocator.h"
+#include "crimson/os/seastore/backref_mapping.h"
namespace crimson::os::seastore {
@@ -299,24 +300,29 @@ public:
/// Creates empty transaction
/// weak transaction should be type READ
virtual TransactionRef create_transaction(
- Transaction::src_t, const char *name, bool is_weak=false) = 0;
+ Transaction::src_t,
+ const char *name,
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH,
+ bool is_weak=false) = 0;
/// Creates empty transaction with interruptible context
template <typename Func>
auto with_transaction_intr(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return do_with_transaction_intr<Func, false>(
- src, name, std::forward<Func>(f));
+ src, name, cache_hint, std::forward<Func>(f));
}
template <typename Func>
auto with_transaction_weak(
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return do_with_transaction_intr<Func, true>(
- Transaction::src_t::READ, name, std::forward<Func>(f)
+ Transaction::src_t::READ, name, cache_hint, std::forward<Func>(f)
).handle_error(
crimson::ct_error::eagain::assert_failure{"unexpected eagain"},
crimson::ct_error::pass_further_all{}
@@ -385,9 +391,10 @@ private:
auto do_with_transaction_intr(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
Func &&f) {
return seastar::do_with(
- create_transaction(src, name, IsWeak),
+ create_transaction(src, name, cache_hint, IsWeak),
[f=std::forward<Func>(f)](auto &ref_t) mutable {
return with_trans_intr(
*ref_t,
diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.cc b/src/crimson/os/seastore/backref/btree_backref_manager.cc
index f89698d602a..9cbf65f4033 100644
--- a/src/crimson/os/seastore/backref/btree_backref_manager.cc
+++ b/src/crimson/os/seastore/backref/btree_backref_manager.cc
@@ -28,28 +28,22 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node<
ceph_assert(backref_root->is_initial_pending()
== root_block->is_pending());
return {true,
- trans_intr::make_interruptible(
- c.cache.get_extent_viewable_by_trans(c.trans, backref_root))};
+ c.cache.get_extent_viewable_by_trans(c.trans, backref_root)};
} else if (root_block->is_pending()) {
auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance());
backref_root = prior.backref_root_node;
if (backref_root) {
return {true,
- trans_intr::make_interruptible(
- c.cache.get_extent_viewable_by_trans(c.trans, backref_root))};
+ c.cache.get_extent_viewable_by_trans(c.trans, backref_root)};
} else {
c.cache.account_absent_access(c.trans.get_src());
return {false,
- trans_intr::make_interruptible(
- Cache::get_extent_ertr::make_ready_future<
- CachedExtentRef>())};
+ Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()};
}
} else {
c.cache.account_absent_access(c.trans.get_src());
return {false,
- trans_intr::make_interruptible(
- Cache::get_extent_ertr::make_ready_future<
- CachedExtentRef>())};
+ Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()};
}
}
diff --git a/src/crimson/os/seastore/backref/btree_backref_manager.h b/src/crimson/os/seastore/backref/btree_backref_manager.h
index 38084bb00e6..24897dd55da 100644
--- a/src/crimson/os/seastore/backref/btree_backref_manager.h
+++ b/src/crimson/os/seastore/backref/btree_backref_manager.h
@@ -9,44 +9,28 @@
namespace crimson::os::seastore::backref {
-constexpr size_t BACKREF_BLOCK_SIZE = 4096;
-
-class BtreeBackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> {
- extent_types_t type;
+class BtreeBackrefMapping : public BackrefMapping {
public:
BtreeBackrefMapping(op_context_t<paddr_t> ctx)
- : BtreeNodeMapping(ctx) {}
+ : BackrefMapping(ctx) {}
BtreeBackrefMapping(
op_context_t<paddr_t> ctx,
CachedExtentRef parent,
uint16_t pos,
backref_map_val_t &val,
backref_node_meta_t &&meta)
- : BtreeNodeMapping(
+ : BackrefMapping(
+ val.type,
ctx,
parent,
pos,
val.laddr,
val.len,
- std::forward<backref_node_meta_t>(meta)),
- type(val.type)
- {}
- extent_types_t get_type() const final {
- return type;
- }
-
- bool is_clone() const final {
- return false;
- }
-
-protected:
- std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>> _duplicate(
- op_context_t<paddr_t> ctx) const final {
- return std::unique_ptr<BtreeNodeMapping<paddr_t, laddr_t>>(
- new BtreeBackrefMapping(ctx));
- }
+ std::forward<backref_node_meta_t>(meta)) {}
};
+constexpr size_t BACKREF_BLOCK_SIZE = 4096;
+
using BackrefBtree = FixedKVBtree<
paddr_t, backref_map_val_t, BackrefInternalNode,
BackrefLeafNode, BtreeBackrefMapping, BACKREF_BLOCK_SIZE, false>;
diff --git a/src/crimson/os/seastore/backref_entry.h b/src/crimson/os/seastore/backref_entry.h
new file mode 100644
index 00000000000..5f9becc9565
--- /dev/null
+++ b/src/crimson/os/seastore/backref_entry.h
@@ -0,0 +1,127 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <memory>
+#include <iostream>
+
+#if FMT_VERSION >= 90000
+#include <fmt/ostream.h>
+#endif
+
+#include <boost/intrusive/set.hpp>
+
+#include "crimson/os/seastore/seastore_types.h"
+
+namespace crimson::os::seastore {
+// One backref record: the physical range paddr~len plus the laddr and extent type stored with it.
+struct backref_entry_t {
+ using ref_t = std::unique_ptr<backref_entry_t>;
+ // Member-wise constructor; len must be non-zero (checked by assert).
+ backref_entry_t(
+ const paddr_t& paddr,
+ const laddr_t& laddr,
+ extent_len_t len,
+ extent_types_t type)
+ : paddr(paddr),
+ laddr(laddr),
+ len(len),
+ type(type) {
+ assert(len > 0);
+ }
+ paddr_t paddr = P_ADDR_NULL;
+ laddr_t laddr = L_ADDR_NULL; // create_retire() passes L_ADDR_NULL here
+ extent_len_t len = 0;
+ extent_types_t type = extent_types_t::NONE;
+ friend bool operator< ( // the three relational operators below compare paddr only
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr < r.paddr;
+ }
+ friend bool operator> (
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr > r.paddr;
+ }
+ friend bool operator== (
+ const backref_entry_t &l,
+ const backref_entry_t &r) {
+ return l.paddr == r.paddr;
+ }
+ // Member hook and container aliases for holding entries in a boost::intrusive multiset.
+ using set_hook_t =
+ boost::intrusive::set_member_hook<
+ boost::intrusive::link_mode<
+ boost::intrusive::auto_unlink>>;
+ set_hook_t backref_set_hook;
+ using backref_set_member_options = boost::intrusive::member_hook<
+ backref_entry_t,
+ set_hook_t,
+ &backref_entry_t::backref_set_hook>;
+ using multiset_t = boost::intrusive::multiset<
+ backref_entry_t,
+ backref_set_member_options,
+ boost::intrusive::constant_time_size<false>>;
+ // Comparator on paddr; the mixed overloads accept a bare paddr_t on either side.
+ struct cmp_t {
+ using is_transparent = paddr_t;
+ bool operator()(
+ const backref_entry_t &l,
+ const backref_entry_t &r) const {
+ return l.paddr < r.paddr;
+ }
+ bool operator()(const paddr_t l, const backref_entry_t &r) const {
+ return l < r.paddr;
+ }
+ bool operator()(const backref_entry_t &l, const paddr_t r) const {
+ return l.paddr < r;
+ }
+ };
+ // Builds an entry for an allocation; asserts a backref-mapped type and a non-null laddr.
+ static ref_t create_alloc(
+ const paddr_t& paddr,
+ const laddr_t& laddr,
+ extent_len_t len,
+ extent_types_t type) {
+ assert(is_backref_mapped_type(type));
+ assert(laddr != L_ADDR_NULL);
+ return std::make_unique<backref_entry_t>(
+ paddr, laddr, len, type);
+ }
+ // Builds an entry for a retirement; laddr is set to L_ADDR_NULL.
+ static ref_t create_retire(
+ const paddr_t& paddr,
+ extent_len_t len,
+ extent_types_t type) {
+ assert(is_backref_mapped_type(type) ||
+ is_retired_placeholder_type(type));
+ return std::make_unique<backref_entry_t>(
+ paddr, L_ADDR_NULL, len, type);
+ }
+ // Builds an entry from the paddr/laddr/len/type fields of an alloc_blk_t.
+ static ref_t create(const alloc_blk_t& delta) {
+ return std::make_unique<backref_entry_t>(
+ delta.paddr, delta.laddr, delta.len, delta.type);
+ }
+};
+// Streams "backref_entry_t{<paddr>~0x<len in hex>, laddr: <laddr>, type: <type>}"; the stream is set back to decimal right after len.
+inline std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) {
+ return out << "backref_entry_t{"
+ << ent.paddr << "~0x" << std::hex << ent.len << std::dec << ", "
+ << "laddr: " << ent.laddr << ", "
+ << "type: " << ent.type
+ << "}";
+}
+
+using backref_entry_ref = backref_entry_t::ref_t;
+using backref_entry_mset_t = backref_entry_t::multiset_t;
+using backref_entry_refs_t = std::vector<backref_entry_ref>;
+using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>;
+using backref_entry_query_set_t = std::set<backref_entry_t, backref_entry_t::cmp_t>;
+
+} // namespace crimson::os::seastore
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::backref_entry_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/backref_manager.h b/src/crimson/os/seastore/backref_manager.h
index 3feedb997b4..8c746b571b2 100644
--- a/src/crimson/os/seastore/backref_manager.h
+++ b/src/crimson/os/seastore/backref_manager.h
@@ -6,6 +6,7 @@
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/transaction.h"
+#include "crimson/os/seastore/backref_mapping.h"
namespace crimson::os::seastore {
diff --git a/src/crimson/os/seastore/backref_mapping.h b/src/crimson/os/seastore/backref_mapping.h
new file mode 100644
index 00000000000..d0a6a0ea6ff
--- /dev/null
+++ b/src/crimson/os/seastore/backref_mapping.h
@@ -0,0 +1,27 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+
+namespace crimson::os::seastore {
+// BtreeNodeMapping<paddr_t, laddr_t> that additionally carries the extent type of the mapped range.
+class BackrefMapping : public BtreeNodeMapping<paddr_t, laddr_t> {
+ extent_types_t type = extent_types_t::NONE; // defaulted so the ctx-only constructor never leaves it indeterminate
+public:
+ BackrefMapping(op_context_t<paddr_t> ctx)
+ : BtreeNodeMapping(ctx) {}
+ template <typename... T> // arguments after `type` are forwarded to BtreeNodeMapping unchanged
+ BackrefMapping(extent_types_t type, T&&... t)
+ : BtreeNodeMapping(std::forward<T>(t)...),
+ type(type) {}
+ extent_types_t get_type() const {
+ return type;
+ }
+};
+
+using BackrefMappingRef = std::unique_ptr<BackrefMapping>;
+using backref_pin_list_t = std::list<BackrefMappingRef>;
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/btree/btree_range_pin.cc b/src/crimson/os/seastore/btree/btree_range_pin.cc
deleted file mode 100644
index f0d507a24c4..00000000000
--- a/src/crimson/os/seastore/btree/btree_range_pin.cc
+++ /dev/null
@@ -1,54 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "crimson/os/seastore/btree/btree_range_pin.h"
-#include "crimson/os/seastore/btree/fixed_kv_node.h"
-
-namespace crimson::os::seastore {
-
-template <typename key_t, typename val_t>
-get_child_ret_t<LogicalCachedExtent>
-BtreeNodeMapping<key_t, val_t>::get_logical_extent(
- Transaction &t)
-{
- ceph_assert(is_parent_viewable());
- assert(pos != std::numeric_limits<uint16_t>::max());
- ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id());
- auto &p = (FixedKVNode<key_t>&)*parent;
- auto k = this->is_indirect()
- ? this->get_intermediate_base()
- : get_key();
- auto v = p.template get_child<LogicalCachedExtent>(ctx, pos, k);
- if (!v.has_child()) {
- this->child_pos = v.get_child_pos();
- }
- return v;
-}
-
-template <typename key_t, typename val_t>
-bool BtreeNodeMapping<key_t, val_t>::is_stable() const
-{
- assert(!this->parent_modified());
- assert(pos != std::numeric_limits<uint16_t>::max());
- auto &p = (FixedKVNode<key_t>&)*parent;
- auto k = this->is_indirect()
- ? this->get_intermediate_base()
- : get_key();
- return p.is_child_stable(ctx, pos, k);
-}
-
-template <typename key_t, typename val_t>
-bool BtreeNodeMapping<key_t, val_t>::is_data_stable() const
-{
- assert(!this->parent_modified());
- assert(pos != std::numeric_limits<uint16_t>::max());
- auto &p = (FixedKVNode<key_t>&)*parent;
- auto k = this->is_indirect()
- ? this->get_intermediate_base()
- : get_key();
- return p.is_child_data_stable(ctx, pos, k);
-}
-
-template class BtreeNodeMapping<laddr_t, paddr_t>;
-template class BtreeNodeMapping<paddr_t, laddr_t>;
-} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/btree/btree_range_pin.h b/src/crimson/os/seastore/btree/btree_range_pin.h
index 91751801e5d..bfd350a8bed 100644
--- a/src/crimson/os/seastore/btree/btree_range_pin.h
+++ b/src/crimson/os/seastore/btree/btree_range_pin.h
@@ -7,11 +7,12 @@
#include "crimson/common/log.h"
-#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction.h"
namespace crimson::os::seastore {
+class Cache;
template <typename node_key_t>
struct op_context_t {
@@ -116,8 +117,6 @@ protected:
extent_len_t len = 0;
fixed_kv_node_meta_t<key_t> range;
uint16_t pos = std::numeric_limits<uint16_t>::max();
-
- virtual std::unique_ptr<BtreeNodeMapping> _duplicate(op_context_t<key_t>) const = 0;
fixed_kv_node_meta_t<key_t> _get_pin_range() const {
return range;
}
@@ -139,11 +138,7 @@ public:
len(len),
range(meta),
pos(pos)
- {
- if (!parent->is_pending()) {
- this->child_pos = {parent, pos};
- }
- }
+ {}
CachedExtentRef get_parent() const final {
return parent;
@@ -162,11 +157,6 @@ public:
return len;
}
- extent_types_t get_type() const override {
- ceph_abort("should never happen");
- return extent_types_t::ROOT;
- }
-
val_t get_val() const final {
if constexpr (std::is_same_v<val_t, paddr_t>) {
return value.get_paddr();
@@ -180,16 +170,6 @@ public:
return range.begin;
}
- PhysicalNodeMappingRef<key_t, val_t> duplicate() const final {
- auto ret = _duplicate(ctx);
- ret->range = range;
- ret->value = value;
- ret->parent = parent;
- ret->len = len;
- ret->pos = pos;
- return ret;
- }
-
bool has_been_invalidated() const final {
return parent->has_been_invalidated();
}
@@ -215,9 +195,6 @@ public:
return unviewable;
}
- get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction&) final;
- bool is_stable() const final;
- bool is_data_stable() const final;
bool is_parent_viewable() const final {
ceph_assert(parent);
if (!parent->is_valid()) {
diff --git a/src/crimson/os/seastore/btree/fixed_kv_btree.h b/src/crimson/os/seastore/btree/fixed_kv_btree.h
index cb4fff32750..04ebcc7e2ca 100644
--- a/src/crimson/os/seastore/btree/fixed_kv_btree.h
+++ b/src/crimson/os/seastore/btree/fixed_kv_btree.h
@@ -32,10 +32,6 @@ inline ChildableCachedExtent* get_reserved_ptr() {
template <typename T>
phy_tree_root_t& get_phy_tree_root(root_t& r);
-using get_child_iertr =
- ::crimson::interruptible::interruptible_errorator<
- typename trans_intr::condition,
- get_child_ertr>;
using get_phy_tree_root_node_ret =
std::pair<bool, get_child_iertr::future<CachedExtentRef>>;
@@ -1501,7 +1497,7 @@ private:
// checking the lba child must be atomic with creating
// and linking the absent child
if (v.has_child()) {
- return trans_intr::make_interruptible(std::move(v.get_child_fut())
+ return std::move(v.get_child_fut()
).si_then([on_found=std::move(on_found), node_iter, c,
parent_entry](auto child) {
LOG_PREFIX(FixedKVBtree::lookup_internal_level);
@@ -1571,7 +1567,7 @@ private:
// checking the lba child must be atomic with creating
// and linking the absent child
if (v.has_child()) {
- return trans_intr::make_interruptible(std::move(v.get_child_fut())
+ return std::move(v.get_child_fut()
).si_then([on_found=std::move(on_found), node_iter, c,
parent_entry](auto child) {
LOG_PREFIX(FixedKVBtree::lookup_leaf);
@@ -2126,7 +2122,7 @@ private:
// checking the lba child must be atomic with creating
// and linking the absent child
if (v.has_child()) {
- return trans_intr::make_interruptible(std::move(v.get_child_fut())
+ return std::move(v.get_child_fut()
).si_then([do_merge=std::move(do_merge), &pos,
donor_iter, donor_is_left, c, parent_pos](auto child) {
LOG_PREFIX(FixedKVBtree::merge_level);
diff --git a/src/crimson/os/seastore/btree/fixed_kv_node.h b/src/crimson/os/seastore/btree/fixed_kv_node.h
index 09f54a4f2d0..63e2ca38c42 100644
--- a/src/crimson/os/seastore/btree/fixed_kv_node.h
+++ b/src/crimson/os/seastore/btree/fixed_kv_node.h
@@ -165,6 +165,11 @@ struct FixedKVNode : ChildableCachedExtent {
: ChildableCachedExtent(std::move(ptr)),
children(capacity, nullptr),
capacity(capacity) {}
+ // Must be identical with FixedKVNode(capacity, ptr) after on_fully_loaded()
+ explicit FixedKVNode(uint16_t capacity, extent_len_t length)
+ : ChildableCachedExtent(length),
+ children(capacity, nullptr),
+ capacity(capacity) {}
FixedKVNode(const FixedKVNode &rhs)
: ChildableCachedExtent(rhs),
range(rhs.range),
@@ -708,12 +713,17 @@ struct FixedKVInternalNode
node_size,
node_type_t>;
- FixedKVInternalNode(ceph::bufferptr &&ptr)
- : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)),
- node_layout_t(this->get_bptr().c_str()) {}
+ explicit FixedKVInternalNode(ceph::bufferptr &&ptr)
+ : FixedKVNode<NODE_KEY>(CAPACITY, std::move(ptr)) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+ // Must be identical with FixedKVInternalNode(ptr) after on_fully_loaded()
+ explicit FixedKVInternalNode(extent_len_t length)
+ : FixedKVNode<NODE_KEY>(CAPACITY, length) {}
FixedKVInternalNode(const FixedKVInternalNode &rhs)
- : FixedKVNode<NODE_KEY>(rhs),
- node_layout_t(this->get_bptr().c_str()) {}
+ : FixedKVNode<NODE_KEY>(rhs) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
bool have_children() const final {
return true;
@@ -985,6 +995,10 @@ struct FixedKVInternalNode
pivot);
}
+ void on_fully_loaded() final {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+
/**
* Internal relative addresses on read or in memory prior to commit
* are either record or block relative depending on whether this
@@ -994,8 +1008,7 @@ struct FixedKVInternalNode
* resolve_relative_addrs fixes up relative internal references
* based on base.
*/
- void resolve_relative_addrs(paddr_t base)
- {
+ void resolve_relative_addrs(paddr_t base) final {
LOG_PREFIX(FixedKVInternalNode::resolve_relative_addrs);
for (auto i: *this) {
if (i->get_val().is_relative()) {
@@ -1122,13 +1135,18 @@ struct FixedKVLeafNode
node_type_t,
has_children>;
using base_t = FixedKVNode<NODE_KEY>;
- FixedKVLeafNode(ceph::bufferptr &&ptr)
- : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)),
- node_layout_t(this->get_bptr().c_str()) {}
+ explicit FixedKVLeafNode(ceph::bufferptr &&ptr)
+ : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, std::move(ptr)) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+ // Must be identical with FixedKVLeafNode(ptr) after on_fully_loaded()
+ explicit FixedKVLeafNode(extent_len_t length)
+ : FixedKVNode<NODE_KEY>(has_children ? CAPACITY : 0, length) {}
FixedKVLeafNode(const FixedKVLeafNode &rhs)
: FixedKVNode<NODE_KEY>(rhs),
- node_layout_t(this->get_bptr().c_str()),
- modifications(rhs.modifications) {}
+ modifications(rhs.modifications) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
static constexpr bool do_has_children = has_children;
// for the stable extent, modifications is always 0;
@@ -1235,6 +1253,10 @@ struct FixedKVLeafNode
}
}
+ void on_fully_loaded() final {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+
void prepare_commit() final {
if constexpr (has_children) {
if (this->is_initial_pending()) {
diff --git a/src/crimson/os/seastore/cache.cc b/src/crimson/os/seastore/cache.cc
index 5dcb7514ee1..86f816e1648 100644
--- a/src/crimson/os/seastore/cache.cc
+++ b/src/crimson/os/seastore/cache.cc
@@ -28,15 +28,6 @@ SET_SUBSYS(seastore_cache);
namespace crimson::os::seastore {
-std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent) {
- return out << "backref_entry_t{"
- << ent.paddr << "~" << ent.len << ", "
- << "laddr: " << ent.laddr << ", "
- << "type: " << ent.type << ", "
- << "seq: " << ent.seq << ", "
- << "}";
-}
-
Cache::Cache(
ExtentPlacementManager &epm)
: epm(epm),
@@ -44,7 +35,7 @@ Cache::Cache(
"seastore_cache_lru_size"))
{
LOG_PREFIX(Cache::Cache);
- INFO("created, lru_capacity={}B", lru.get_capacity_bytes());
+ INFO("created, lru_capacity=0x{:x}B", lru.get_capacity_bytes());
register_metrics();
segment_providers_by_device_id.resize(DEVICE_ID_MAX, nullptr);
}
@@ -63,18 +54,18 @@ Cache::retire_extent_ret Cache::retire_extent_addr(
Transaction &t, paddr_t addr, extent_len_t length)
{
LOG_PREFIX(Cache::retire_extent_addr);
- TRACET("retire {}~{}", t, addr, length);
+ TRACET("retire {}~0x{:x}", t, addr, length);
assert(addr.is_real() && !addr.is_block_relative());
CachedExtentRef ext;
auto result = t.get_extent(addr, &ext);
if (result == Transaction::get_extent_ret::PRESENT) {
- DEBUGT("retire {}~{} on t -- {}", t, addr, length, *ext);
+ DEBUGT("retire {}~0x{:x} on t -- {}", t, addr, length, *ext);
t.add_to_retired_set(CachedExtentRef(&*ext));
return retire_extent_iertr::now();
} else if (result == Transaction::get_extent_ret::RETIRED) {
- ERRORT("retire {}~{} failed, already retired -- {}", t, addr, length, *ext);
+ ERRORT("retire {}~0x{:x} failed, already retired -- {}", t, addr, length, *ext);
ceph_abort();
}
@@ -85,7 +76,7 @@ Cache::retire_extent_ret Cache::retire_extent_addr(
// retiring is not included by the cache hit metrics
ext = query_cache(addr);
if (ext) {
- DEBUGT("retire {}~{} in cache -- {}", t, addr, length, *ext);
+ DEBUGT("retire {}~0x{:x} in cache -- {}", t, addr, length, *ext);
} else {
// add a new placeholder to Cache
ext = CachedExtent::make_cached_extent_ref<
@@ -95,7 +86,7 @@ Cache::retire_extent_ret Cache::retire_extent_addr(
PLACEMENT_HINT_NULL,
NULL_GENERATION,
TRANS_ID_NULL);
- DEBUGT("retire {}~{} as placeholder, add extent -- {}",
+ DEBUGT("retire {}~0x{:x} as placeholder, add extent -- {}",
t, addr, length, *ext);
add_extent(ext);
}
@@ -123,7 +114,7 @@ void Cache::retire_absent_extent_addr(
PLACEMENT_HINT_NULL,
NULL_GENERATION,
TRANS_ID_NULL);
- DEBUGT("retire {}~{} as placeholder, add extent -- {}",
+ DEBUGT("retire {}~0x{:x} as placeholder, add extent -- {}",
t, addr, length, *ext);
add_extent(ext);
t.add_to_read_set(ext);
@@ -172,6 +163,7 @@ void Cache::register_metrics()
{extent_types_t::LADDR_INTERNAL, sm::label_instance("ext", "LADDR_INTERNAL")},
{extent_types_t::LADDR_LEAF, sm::label_instance("ext", "LADDR_LEAF")},
{extent_types_t::DINK_LADDR_LEAF, sm::label_instance("ext", "DINK_LADDR_LEAF")},
+ {extent_types_t::ROOT_META, sm::label_instance("ext", "ROOT_META")},
{extent_types_t::OMAP_INNER, sm::label_instance("ext", "OMAP_INNER")},
{extent_types_t::OMAP_LEAF, sm::label_instance("ext", "OMAP_LEAF")},
{extent_types_t::ONODE_BLOCK_STAGED, sm::label_instance("ext", "ONODE_BLOCK_STAGED")},
@@ -1081,7 +1073,7 @@ CachedExtentRef Cache::alloc_new_extent_by_type(
)
{
LOG_PREFIX(Cache::alloc_new_extent_by_type);
- SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ SUBDEBUGT(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}",
t, type, length, hint, rewrite_gen_printer_t{gen});
ceph_assert(get_extent_category(type) == data_category_t::METADATA);
switch (type) {
@@ -1093,6 +1085,9 @@ CachedExtentRef Cache::alloc_new_extent_by_type(
case extent_types_t::LADDR_LEAF:
return alloc_new_non_data_extent<lba_manager::btree::LBALeafNode>(
t, length, hint, gen);
+ case extent_types_t::ROOT_META:
+ return alloc_new_non_data_extent<RootMetaBlock>(
+ t, length, hint, gen);
case extent_types_t::ONODE_BLOCK_STAGED:
return alloc_new_non_data_extent<onode::SeastoreNodeExtent>(
t, length, hint, gen);
@@ -1129,7 +1124,7 @@ std::vector<CachedExtentRef> Cache::alloc_new_data_extents_by_type(
)
{
LOG_PREFIX(Cache::alloc_new_data_extents_by_type);
- SUBDEBUGT(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ SUBDEBUGT(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}",
t, type, length, hint, rewrite_gen_printer_t{gen});
ceph_assert(get_extent_category(type) == data_category_t::DATA);
std::vector<CachedExtentRef> res;
@@ -1344,21 +1339,39 @@ record_t Cache::prepare_record(
io_stat_t retire_stat;
std::vector<alloc_delta_t> alloc_deltas;
alloc_delta_t rel_delta;
+ backref_entry_refs_t backref_entries;
rel_delta.op = alloc_delta_t::op_types_t::CLEAR;
for (auto &i: t.retired_set) {
auto &extent = i.extent;
get_by_ext(efforts.retire_by_ext,
extent->get_type()).increment(extent->get_length());
retire_stat.increment(extent->get_length());
- DEBUGT("retired and remove extent -- {}", t, *extent);
+ DEBUGT("retired and remove extent {}~0x{:x} -- {}",
+ t, extent->get_paddr(), extent->get_length(), *extent);
commit_retire_extent(t, extent);
- if (is_backref_mapped_extent_node(extent) ||
- is_retired_placeholder_type(extent->get_type())) {
+
+ // Note: commit extents and backref allocations in the same place
+ if (is_backref_mapped_type(extent->get_type()) ||
+ is_retired_placeholder_type(extent->get_type())) {
+ DEBUGT("backref_entry free {}~0x{:x}",
+ t,
+ extent->get_paddr(),
+ extent->get_length());
rel_delta.alloc_blk_ranges.emplace_back(
- extent->get_paddr(),
- L_ADDR_NULL,
- extent->get_length(),
- extent->get_type());
+ alloc_blk_t::create_retire(
+ extent->get_paddr(),
+ extent->get_length(),
+ extent->get_type()));
+ backref_entries.emplace_back(
+ backref_entry_t::create_retire(
+ extent->get_paddr(),
+ extent->get_length(),
+ extent->get_type()));
+ } else if (is_backref_node(extent->get_type())) {
+ remove_backref_extent(extent->get_paddr());
+ } else {
+ ERRORT("Got unexpected extent type: {}", t, *extent);
+ ceph_abort("imposible");
}
}
alloc_deltas.emplace_back(std::move(rel_delta));
@@ -1395,27 +1408,40 @@ record_t Cache::prepare_record(
if (modify_time == NULL_TIME) {
modify_time = commit_time;
}
+ laddr_t fresh_laddr;
+ if (i->is_logical()) {
+ fresh_laddr = i->cast<LogicalCachedExtent>()->get_laddr();
+ } else if (is_lba_node(i->get_type())) {
+ fresh_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin;
+ } else {
+ fresh_laddr = L_ADDR_NULL;
+ }
record.push_back(extent_t{
i->get_type(),
- i->is_logical()
- ? i->cast<LogicalCachedExtent>()->get_laddr()
- : (is_lba_node(i->get_type())
- ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
- : L_ADDR_NULL),
+ fresh_laddr,
std::move(bl)
},
modify_time);
- if (i->is_valid()
- && is_backref_mapped_extent_node(i)) {
+
+ if (!i->is_valid()) {
+ continue;
+ }
+ if (is_backref_mapped_type(i->get_type())) {
+ laddr_t alloc_laddr;
+ if (i->is_logical()) {
+ alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr();
+ } else if (is_lba_node(i->get_type())) {
+ alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin;
+ } else {
+ assert(i->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL);
+ alloc_laddr = L_ADDR_MIN;
+ }
alloc_delta.alloc_blk_ranges.emplace_back(
- i->get_paddr(),
- i->is_logical()
- ? i->cast<LogicalCachedExtent>()->get_laddr()
- : (is_lba_node(i->get_type())
- ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
- : L_ADDR_NULL),
- i->get_length(),
- i->get_type());
+ alloc_blk_t::create_alloc(
+ i->get_paddr(),
+ alloc_laddr,
+ i->get_length(),
+ i->get_type()));
}
}
@@ -1426,14 +1452,20 @@ record_t Cache::prepare_record(
get_by_ext(efforts.fresh_ool_by_ext,
i->get_type()).increment(i->get_length());
i->prepare_commit();
- if (is_backref_mapped_extent_node(i)) {
+ if (is_backref_mapped_type(i->get_type())) {
+ laddr_t alloc_laddr;
+ if (i->is_logical()) {
+ alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr();
+ } else {
+ assert(is_lba_node(i->get_type()));
+ alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin;
+ }
alloc_delta.alloc_blk_ranges.emplace_back(
- i->get_paddr(),
- i->is_logical()
- ? i->cast<LogicalCachedExtent>()->get_laddr()
- : i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin,
- i->get_length(),
- i->get_type());
+ alloc_blk_t::create_alloc(
+ i->get_paddr(),
+ alloc_laddr,
+ i->get_length(),
+ i->get_type()));
}
}
@@ -1451,19 +1483,57 @@ record_t Cache::prepare_record(
i->state = CachedExtent::extent_state_t::CLEAN;
assert(i->is_logical());
i->clear_modified_region();
- touch_extent(*i, &trans_src);
+ touch_extent(*i, &trans_src, t.get_cache_hint());
DEBUGT("inplace rewrite ool block is commmitted -- {}", t, *i);
}
+ auto existing_stats = t.get_existing_block_stats();
+ DEBUGT("total existing blocks num: {}, exist clean num: {}, "
+ "exist mutation pending num: {}",
+ t,
+ existing_stats.valid_num,
+ existing_stats.clean_num,
+ existing_stats.mutated_num);
for (auto &i: t.existing_block_list) {
- if (i->is_valid()) {
- alloc_delta.alloc_blk_ranges.emplace_back(
- i->get_paddr(),
+ assert(is_logical_type(i->get_type()));
+ if (!i->is_valid()) {
+ continue;
+ }
+
+ if (i->is_exist_clean()) {
+ i->state = CachedExtent::extent_state_t::CLEAN;
+ } else {
+ assert(i->is_exist_mutation_pending());
+ // i->state must become DIRTY in complete_commit()
+ }
+
+ // exist mutation pending extents must be in t.mutated_block_list
+ add_extent(i);
+ const auto t_src = t.get_src();
+ if (i->is_dirty()) {
+ add_to_dirty(i, &t_src);
+ } else {
+ touch_extent(*i, &t_src, t.get_cache_hint());
+ }
+
+ alloc_delta.alloc_blk_ranges.emplace_back(
+ alloc_blk_t::create_alloc(
+ i->get_paddr(),
i->cast<LogicalCachedExtent>()->get_laddr(),
i->get_length(),
- i->get_type());
- }
+ i->get_type()));
+
+ // Note: commit extents and backref allocations in the same place
+ // Note: remapping is split into 2 steps, retire and alloc, which must be
+ // committed atomically together
+ backref_entries.emplace_back(
+ backref_entry_t::create_alloc(
+ i->get_paddr(),
+ i->cast<LogicalCachedExtent>()->get_laddr(),
+ i->get_length(),
+ i->get_type()));
}
+
alloc_deltas.emplace_back(std::move(alloc_delta));
for (auto b : alloc_deltas) {
@@ -1517,6 +1587,9 @@ record_t Cache::prepare_record(
record.push_back(std::move(delta));
}
+ apply_backref_mset(backref_entries);
+ t.set_backref_entries(std::move(backref_entries));
+
ceph_assert(t.get_fresh_block_stats().num ==
t.inline_block_list.size() +
t.ool_block_list.size() +
@@ -1616,26 +1689,35 @@ record_t Cache::prepare_record(
return record;
}
-void Cache::backref_batch_update(
- std::vector<backref_entry_ref> &&list,
- const journal_seq_t &seq)
+void Cache::apply_backref_byseq(
+ backref_entry_refs_t&& backref_entries,
+ const journal_seq_t& seq)
{
- LOG_PREFIX(Cache::backref_batch_update);
- DEBUG("inserting {} entries at {}", list.size(), seq);
- ceph_assert(seq != JOURNAL_SEQ_NULL);
-
- for (auto &ent : list) {
- backref_entry_mset.insert(*ent);
+ LOG_PREFIX(Cache::apply_backref_byseq);
+ DEBUG("backref_entry apply {} entries at {}",
+ backref_entries.size(), seq);
+ assert(seq != JOURNAL_SEQ_NULL);
+ if (backref_entries.empty()) {
+ return;
}
-
- auto iter = backref_entryrefs_by_seq.find(seq);
- if (iter == backref_entryrefs_by_seq.end()) {
- backref_entryrefs_by_seq.emplace(seq, std::move(list));
+ if (backref_entryrefs_by_seq.empty()) {
+ backref_entryrefs_by_seq.insert(
+ backref_entryrefs_by_seq.end(),
+ {seq, std::move(backref_entries)});
+ return;
+ }
+ auto last = backref_entryrefs_by_seq.rbegin();
+ assert(last->first <= seq);
+ if (last->first == seq) {
+ last->second.insert(
+ last->second.end(),
+ std::make_move_iterator(backref_entries.begin()),
+ std::make_move_iterator(backref_entries.end()));
} else {
- iter->second.insert(
- iter->second.end(),
- std::make_move_iterator(list.begin()),
- std::make_move_iterator(list.end()));
+ assert(last->first < seq);
+ backref_entryrefs_by_seq.insert(
+ backref_entryrefs_by_seq.end(),
+ {seq, std::move(backref_entries)});
}
}
@@ -1648,7 +1730,7 @@ void Cache::complete_commit(
SUBTRACET(seastore_t, "final_block_start={}, start_seq={}",
t, final_block_start, start_seq);
- std::vector<backref_entry_ref> backref_list;
+ backref_entry_refs_t backref_entries;
t.for_each_finalized_fresh_block([&](const CachedExtentRef &i) {
if (!i->is_valid()) {
return;
@@ -1677,24 +1759,30 @@ void Cache::complete_commit(
add_extent(i);
assert(!i->is_dirty());
const auto t_src = t.get_src();
- touch_extent(*i, &t_src);
+ touch_extent(*i, &t_src, t.get_cache_hint());
epm.commit_space_used(i->get_paddr(), i->get_length());
- if (is_backref_mapped_extent_node(i)) {
- DEBUGT("backref_list new {} len {}",
+
+ // Note: commit extents and backref allocations in the same place
+ if (is_backref_mapped_type(i->get_type())) {
+ DEBUGT("backref_entry alloc {}~0x{:x}",
t,
i->get_paddr(),
i->get_length());
- backref_list.emplace_back(
- std::make_unique<backref_entry_t>(
+ laddr_t alloc_laddr;
+ if (i->is_logical()) {
+ alloc_laddr = i->cast<LogicalCachedExtent>()->get_laddr();
+ } else if (is_lba_node(i->get_type())) {
+ alloc_laddr = i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin;
+ } else {
+ assert(i->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL);
+ alloc_laddr = L_ADDR_MIN;
+ }
+ backref_entries.emplace_back(
+ backref_entry_t::create_alloc(
i->get_paddr(),
- i->is_logical()
- ? i->cast<LogicalCachedExtent>()->get_laddr()
- : (is_lba_node(i->get_type())
- ? i->cast<lba_manager::btree::LBANode>()->get_node_meta().begin
- : L_ADDR_NULL),
+ alloc_laddr,
i->get_length(),
- i->get_type(),
- start_seq));
+ i->get_type()));
} else if (is_backref_node(i->get_type())) {
add_backref_extent(
i->get_paddr(),
@@ -1731,9 +1819,10 @@ void Cache::complete_commit(
epm.mark_space_free(extent->get_paddr(), extent->get_length());
}
for (auto &i: t.existing_block_list) {
- if (i->is_valid()) {
- epm.mark_space_used(i->get_paddr(), i->get_length());
+ if (!i->is_valid()) {
+ continue;
}
+ epm.mark_space_used(i->get_paddr(), i->get_length());
}
for (auto &i: t.mutated_block_list) {
@@ -1747,64 +1836,10 @@ void Cache::complete_commit(
for (auto &i: t.retired_set) {
auto &extent = i.extent;
extent->dirty_from_or_retired_at = start_seq;
- if (is_backref_mapped_extent_node(extent) ||
- is_retired_placeholder_type(extent->get_type())) {
- DEBUGT("backref_list free {} len {}",
- t,
- extent->get_paddr(),
- extent->get_length());
- backref_list.emplace_back(
- std::make_unique<backref_entry_t>(
- extent->get_paddr(),
- L_ADDR_NULL,
- extent->get_length(),
- extent->get_type(),
- start_seq));
- } else if (is_backref_node(extent->get_type())) {
- remove_backref_extent(extent->get_paddr());
- } else {
- ERRORT("{}", t, *extent);
- ceph_abort("not possible");
- }
}
- auto existing_stats = t.get_existing_block_stats();
- DEBUGT("total existing blocks num: {}, exist clean num: {}, "
- "exist mutation pending num: {}",
- t,
- existing_stats.valid_num,
- existing_stats.clean_num,
- existing_stats.mutated_num);
- for (auto &i: t.existing_block_list) {
- if (i->is_valid()) {
- if (i->is_exist_clean()) {
- i->state = CachedExtent::extent_state_t::CLEAN;
- } else {
- assert(i->state == CachedExtent::extent_state_t::DIRTY);
- }
- DEBUGT("backref_list new existing {} len {}",
- t,
- i->get_paddr(),
- i->get_length());
- backref_list.emplace_back(
- std::make_unique<backref_entry_t>(
- i->get_paddr(),
- i->cast<LogicalCachedExtent>()->get_laddr(),
- i->get_length(),
- i->get_type(),
- start_seq));
- add_extent(i);
- const auto t_src = t.get_src();
- if (i->is_dirty()) {
- add_to_dirty(i, &t_src);
- } else {
- touch_extent(*i, &t_src);
- }
- }
- }
- if (!backref_list.empty()) {
- backref_batch_update(std::move(backref_list), start_seq);
- }
+ apply_backref_byseq(t.move_backref_entries(), start_seq);
+ commit_backref_entries(std::move(backref_entries), start_seq);
for (auto &i: t.pre_alloc_list) {
if (!i->is_valid()) {
@@ -1822,7 +1857,7 @@ void Cache::init()
remove_extent(root, nullptr);
root = nullptr;
}
- root = new RootBlock();
+ root = CachedExtent::make_cached_extent_ref<RootBlock>();
root->init(CachedExtent::extent_state_t::CLEAN,
P_ADDR_ROOT,
PLACEMENT_HINT_NULL,
@@ -1927,25 +1962,18 @@ Cache::replay_delta(
alloc_delta_t alloc_delta;
decode(alloc_delta, delta.bl);
- std::vector<backref_entry_ref> backref_list;
+ backref_entry_refs_t backref_entries;
for (auto &alloc_blk : alloc_delta.alloc_blk_ranges) {
if (alloc_blk.paddr.is_relative()) {
assert(alloc_blk.paddr.is_record_relative());
alloc_blk.paddr = record_base.add_relative(alloc_blk.paddr);
}
- DEBUG("replay alloc_blk {}~{} {}, journal_seq: {}",
+ DEBUG("replay alloc_blk {}~0x{:x} {}, journal_seq: {}",
alloc_blk.paddr, alloc_blk.len, alloc_blk.laddr, journal_seq);
- backref_list.emplace_back(
- std::make_unique<backref_entry_t>(
- alloc_blk.paddr,
- alloc_blk.laddr,
- alloc_blk.len,
- alloc_blk.type,
- journal_seq));
- }
- if (!backref_list.empty()) {
- backref_batch_update(std::move(backref_list), journal_seq);
+ backref_entries.emplace_back(
+ backref_entry_t::create(alloc_blk));
}
+ commit_backref_entries(std::move(backref_entries), journal_seq);
return replay_delta_ertr::make_ready_future<std::pair<bool, CachedExtentRef>>(
std::make_pair(true, nullptr));
}
@@ -1998,8 +2026,9 @@ Cache::replay_delta(
[](CachedExtent &) {},
[this](CachedExtent &ext) {
// replay is not included by the cache hit metrics
- touch_extent(ext, nullptr);
- }) :
+ touch_extent(ext, nullptr, CACHE_HINT_TOUCH);
+ },
+ nullptr) :
_get_extent_if_cached(
delta.paddr)
).handle_error(
@@ -2162,7 +2191,8 @@ Cache::do_get_caching_extent_by_type(
laddr_t laddr,
extent_len_t length,
extent_init_func_t &&extent_init_func,
- extent_init_func_t &&on_cache)
+ extent_init_func_t &&on_cache,
+ const Transaction::src_t* p_src)
{
return [=, this, extent_init_func=std::move(extent_init_func)]() mutable {
switch (type) {
@@ -2171,55 +2201,61 @@ Cache::do_get_caching_extent_by_type(
return get_extent_ertr::make_ready_future<CachedExtentRef>();
case extent_types_t::BACKREF_INTERNAL:
return do_get_caching_extent<backref::BackrefInternalNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::BACKREF_LEAF:
return do_get_caching_extent<backref::BackrefLeafNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::LADDR_INTERNAL:
return do_get_caching_extent<lba_manager::btree::LBAInternalNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::LADDR_LEAF:
return do_get_caching_extent<lba_manager::btree::LBALeafNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
+ case extent_types_t::ROOT_META:
+ return do_get_caching_extent<RootMetaBlock>(
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
+ ).safe_then([](auto extent) {
+ return CachedExtentRef(extent.detach(), false /* add_ref */);
+ });
case extent_types_t::OMAP_INNER:
return do_get_caching_extent<omap_manager::OMapInnerNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::OMAP_LEAF:
return do_get_caching_extent<omap_manager::OMapLeafNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::COLL_BLOCK:
return do_get_caching_extent<collection_manager::CollectionNode>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::ONODE_BLOCK_STAGED:
return do_get_caching_extent<onode::SeastoreNodeExtent>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::OBJECT_DATA_BLOCK:
return do_get_caching_extent<ObjectDataBlock>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
@@ -2228,13 +2264,13 @@ Cache::do_get_caching_extent_by_type(
return get_extent_ertr::make_ready_future<CachedExtentRef>();
case extent_types_t::TEST_BLOCK:
return do_get_caching_extent<TestBlock>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
case extent_types_t::TEST_BLOCK_PHYSICAL:
return do_get_caching_extent<TestBlockPhysical>(
- offset, length, std::move(extent_init_func), std::move(on_cache)
+ offset, length, std::move(extent_init_func), std::move(on_cache), p_src
).safe_then([](auto extent) {
return CachedExtentRef(extent.detach(), false /* add_ref */);
});
diff --git a/src/crimson/os/seastore/cache.h b/src/crimson/os/seastore/cache.h
index dba3610e95f..a239b861726 100644
--- a/src/crimson/os/seastore/cache.h
+++ b/src/crimson/os/seastore/cache.h
@@ -3,13 +3,13 @@
#pragma once
-#include <iostream>
-
#include "seastar/core/shared_future.hh"
#include "include/buffer.h"
#include "crimson/common/errorator.h"
+#include "crimson/common/errorator-loop.h"
+#include "crimson/os/seastore/backref_entry.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/extent_placement_manager.h"
#include "crimson/os/seastore/logging.h"
@@ -37,86 +37,6 @@ class FixedKVBtree;
class BackrefManager;
class SegmentProvider;
-struct backref_entry_t {
- backref_entry_t(
- const paddr_t paddr,
- const laddr_t laddr,
- const extent_len_t len,
- const extent_types_t type,
- const journal_seq_t seq)
- : paddr(paddr),
- laddr(laddr),
- len(len),
- type(type),
- seq(seq)
- {}
- backref_entry_t(alloc_blk_t alloc_blk)
- : paddr(alloc_blk.paddr),
- laddr(alloc_blk.laddr),
- len(alloc_blk.len),
- type(alloc_blk.type)
- {}
- paddr_t paddr = P_ADDR_NULL;
- laddr_t laddr = L_ADDR_NULL;
- extent_len_t len = 0;
- extent_types_t type =
- extent_types_t::ROOT;
- journal_seq_t seq;
- friend bool operator< (
- const backref_entry_t &l,
- const backref_entry_t &r) {
- return l.paddr < r.paddr;
- }
- friend bool operator> (
- const backref_entry_t &l,
- const backref_entry_t &r) {
- return l.paddr > r.paddr;
- }
- friend bool operator== (
- const backref_entry_t &l,
- const backref_entry_t &r) {
- return l.paddr == r.paddr;
- }
-
- using set_hook_t =
- boost::intrusive::set_member_hook<
- boost::intrusive::link_mode<
- boost::intrusive::auto_unlink>>;
- set_hook_t backref_set_hook;
- using backref_set_member_options = boost::intrusive::member_hook<
- backref_entry_t,
- set_hook_t,
- &backref_entry_t::backref_set_hook>;
- using multiset_t = boost::intrusive::multiset<
- backref_entry_t,
- backref_set_member_options,
- boost::intrusive::constant_time_size<false>>;
-
- struct cmp_t {
- using is_transparent = paddr_t;
- bool operator()(
- const backref_entry_t &l,
- const backref_entry_t &r) const {
- return l.paddr < r.paddr;
- }
- bool operator()(const paddr_t l, const backref_entry_t &r) const {
- return l < r.paddr;
- }
- bool operator()(const backref_entry_t &l, const paddr_t r) const {
- return l.paddr < r;
- }
- };
-};
-
-std::ostream &operator<<(std::ostream &out, const backref_entry_t &ent);
-
-using backref_entry_ref = std::unique_ptr<backref_entry_t>;
-using backref_entry_mset_t = backref_entry_t::multiset_t;
-using backref_entry_refs_t = std::vector<backref_entry_ref>;
-using backref_entryrefs_by_seq_t = std::map<journal_seq_t, backref_entry_refs_t>;
-using backref_entry_query_set_t = std::set<
- backref_entry_t, backref_entry_t::cmp_t>;
-
/**
* Cache
*
@@ -204,6 +124,7 @@ public:
TransactionRef create_transaction(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint,
bool is_weak) {
LOG_PREFIX(Cache::create_transaction);
@@ -217,7 +138,8 @@ public:
[this](Transaction& t) {
return on_transaction_destruct(t);
},
- ++next_id
+ ++next_id,
+ cache_hint
);
SUBDEBUGT(seastore_t, "created name={}, source={}, is_weak={}",
*ret, name, src, is_weak);
@@ -323,8 +245,9 @@ public:
CachedExtentRef>(ret);
});
} else {
- SUBDEBUGT(seastore_cache, "{} {} is present on t -- {}"
- " without being fully loaded", t, type, offset, *ret);
+ SUBDEBUGT(seastore_cache,
+ "{} {} is present on t -- {} without fully loaded",
+ t, type, offset, *ret);
return get_extent_if_cached_iertr::make_ready_future<
CachedExtentRef>();
}
@@ -354,8 +277,8 @@ public:
if (!ret->is_fully_loaded()) {
// ignore non-full extent
- SUBDEBUGT(seastore_cache, "{} {} is present without "
- "being fully loaded", t, type, offset);
+ SUBDEBUGT(seastore_cache,
+ "{} {} is present without fully loaded", t, type, offset);
return get_extent_if_cached_iertr::make_ready_future<CachedExtentRef>();
}
@@ -363,7 +286,7 @@ public:
SUBDEBUGT(seastore_cache, "{} {} is present in cache -- {}",
t, type, offset, *ret);
t.add_to_read_set(ret);
- touch_extent(*ret, &t_src);
+ touch_extent(*ret, &t_src, t.get_cache_hint());
return ret->wait_io().then([ret] {
return get_extent_if_cached_iertr::make_ready_future<
CachedExtentRef>(ret);
@@ -394,39 +317,37 @@ public:
extent_len_t length) {
CachedExtentRef ret;
LOG_PREFIX(Cache::get_caching_extent);
+ const auto t_src = t.get_src();
auto result = t.get_extent(offset, &ret);
if (result == Transaction::get_extent_ret::RETIRED) {
- SUBERRORT(seastore_cache, "{} {}~{} is retired on t -- {}",
+ SUBERRORT(seastore_cache, "{} {}~0x{:x} is retired on t -- {}",
t, T::TYPE, offset, length, *ret);
ceph_abort("impossible");
} else if (result == Transaction::get_extent_ret::PRESENT) {
+ assert(ret->get_length() == length);
if (ret->is_fully_loaded()) {
- SUBTRACET(seastore_cache, "{} {}~{} is present on t -- {}",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} is present on t -- {}",
t, T::TYPE, offset, length, *ret);
return ret->wait_io().then([ret] {
return seastar::make_ready_future<TCachedExtentRef<T>>(
ret->cast<T>());
});
} else {
- assert(!ret->is_mutable());
- SUBDEBUGT(seastore_cache, "{} {}~{} is present on t without been \
- fully loaded, reading ... {}", t, T::TYPE, offset, length, *ret);
- auto bp = alloc_cache_buf(ret->get_length());
- ret->set_bptr(std::move(bp));
- return read_extent<T>(
- ret->cast<T>());
+ SUBDEBUGT(seastore_cache,
+ "{} {}~0x{:x} is present on t without fully loaded, reading ... -- {}",
+ t, T::TYPE, offset, length, *ret);
+ return do_read_extent_maybe_partial<T>(ret->cast<T>(), 0, length, &t_src);
}
} else {
- SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} is absent on t, query cache ...",
t, T::TYPE, offset, length);
- auto f = [&t, this](CachedExtent &ext) {
+ auto f = [&t, this, t_src](CachedExtent &ext) {
t.add_to_read_set(CachedExtentRef(&ext));
- const auto t_src = t.get_src();
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent<T>(
- offset, length, [](T &){}, std::move(f))
+ offset, length, [](T &){}, std::move(f), &t_src)
);
}
}
@@ -435,12 +356,15 @@ public:
* get_absent_extent
*
* The extent in query is supposed to be absent in Cache.
+ * Partially loads the buffer from partial_off~partial_len if not present.
*/
template <typename T, typename Func>
get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
Transaction &t,
paddr_t offset,
extent_len_t length,
+ extent_len_t partial_off,
+ extent_len_t partial_len,
Func &&extent_init_func) {
CachedExtentRef ret;
LOG_PREFIX(Cache::get_absent_extent);
@@ -453,13 +377,13 @@ public:
}
#endif
- SUBTRACET(seastore_cache, "{} {}~{} is absent on t, query cache ...",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} is absent on t, query cache ...",
t, T::TYPE, offset, length);
- auto f = [&t, this](CachedExtent &ext) {
+ const auto t_src = t.get_src();
+ auto f = [&t, this, t_src](CachedExtent &ext) {
// FIXME: assert(ext.is_stable_clean());
assert(ext.is_stable());
assert(T::TYPE == ext.get_type());
- const auto t_src = t.get_src();
extent_access_stats_t& access_stats = get_by_ext(
get_by_src(stats.access_by_src_ext, t_src),
T::TYPE);
@@ -467,11 +391,12 @@ public:
++stats.access.s.load_absent;
t.add_to_read_set(CachedExtentRef(&ext));
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent<T>(
- offset, length, std::forward<Func>(extent_init_func), std::move(f))
+ offset, length, partial_off, partial_len,
+ std::forward<Func>(extent_init_func), std::move(f), &t_src)
);
}
@@ -495,6 +420,16 @@ public:
return get_absent_extent<T>(t, offset, length, [](T &){});
}
+ template <typename T, typename Func>
+ get_extent_iertr::future<TCachedExtentRef<T>> get_absent_extent(
+ Transaction &t,
+ paddr_t offset,
+ extent_len_t length,
+ Func &&extent_init_func) {
+ return get_absent_extent<T>(t, offset, length, 0, length,
+ std::forward<Func>(extent_init_func));
+ }
+
bool is_viewable_extent_stable(
Transaction &t,
CachedExtentRef extent)
@@ -513,8 +448,7 @@ public:
return view->is_data_stable();
}
- using get_extent_ertr = base_ertr;
- get_extent_ertr::future<CachedExtentRef>
+ get_extent_iertr::future<CachedExtentRef>
get_extent_viewable_by_trans(
Transaction &t,
CachedExtentRef extent)
@@ -539,7 +473,7 @@ public:
if (p_extent->is_mutable()) {
assert(p_extent->is_fully_loaded());
assert(!p_extent->is_pending_io());
- return get_extent_ertr::make_ready_future<CachedExtentRef>(
+ return get_extent_iertr::make_ready_future<CachedExtentRef>(
CachedExtentRef(p_extent));
} else {
assert(p_extent->is_exist_clean());
@@ -555,7 +489,7 @@ public:
++access_stats.cache_lru;
++stats.access.s.cache_lru;
}
- touch_extent(*p_extent, &t_src);
+ touch_extent(*p_extent, &t_src, t.get_cache_hint());
} else {
if (p_extent->is_dirty()) {
++access_stats.trans_dirty;
@@ -574,7 +508,7 @@ public:
if (extent->is_mutable()) {
assert(extent->is_fully_loaded());
assert(!extent->is_pending_io());
- return get_extent_ertr::make_ready_future<CachedExtentRef>(extent);
+ return get_extent_iertr::make_ready_future<CachedExtentRef>(extent);
} else {
assert(extent->is_exist_clean());
p_extent = extent.get();
@@ -583,40 +517,66 @@ public:
// user should not see RETIRED_PLACEHOLDER extents
ceph_assert(!is_retired_placeholder_type(p_extent->get_type()));
- if (!p_extent->is_fully_loaded()) {
- assert(!p_extent->is_mutable());
- ++access_stats.load_present;
- ++stats.access.s.load_present;
- LOG_PREFIX(Cache::get_extent_viewable_by_trans);
- SUBDEBUG(seastore_cache,
- "{} {}~{} is present without been fully loaded, reading ... -- {}",
- p_extent->get_type(), p_extent->get_paddr(),p_extent->get_length(),
- *p_extent);
- auto bp = alloc_cache_buf(p_extent->get_length());
- p_extent->set_bptr(std::move(bp));
- return read_extent<CachedExtent>(CachedExtentRef(p_extent));
- }
- return p_extent->wait_io(
- ).then([p_extent] {
- return get_extent_ertr::make_ready_future<CachedExtentRef>(
+ // for logical extents, handle partial load in TM::read_pin(),
+ // also see read_extent_maybe_partial() and get_absent_extent()
+ assert(is_logical_type(p_extent->get_type()) ||
+ p_extent->is_fully_loaded());
+
+ return trans_intr::make_interruptible(
+ p_extent->wait_io()
+ ).then_interruptible([p_extent] {
+ return get_extent_iertr::make_ready_future<CachedExtentRef>(
CachedExtentRef(p_extent));
});
}
template <typename T>
- using read_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>;
-
- template <typename T>
- read_extent_ret<T> get_extent_viewable_by_trans(
+ get_extent_iertr::future<TCachedExtentRef<T>>
+ get_extent_viewable_by_trans(
Transaction &t,
TCachedExtentRef<T> extent)
{
return get_extent_viewable_by_trans(t, CachedExtentRef(extent.get())
- ).safe_then([](auto p_extent) {
+ ).si_then([](auto p_extent) {
return p_extent->template cast<T>();
});
}
+ // wait extent io or do partial reads
+ template <typename T>
+ get_extent_iertr::future<TCachedExtentRef<T>>
+ read_extent_maybe_partial(
+ Transaction &t,
+ TCachedExtentRef<T> extent,
+ extent_len_t partial_off,
+ extent_len_t partial_len) {
+ assert(is_logical_type(extent->get_type()));
+ if (!extent->is_range_loaded(partial_off, partial_len)) {
+ LOG_PREFIX(Cache::read_extent_maybe_partial);
+ SUBDEBUGT(seastore_cache,
+ "{} {}~0x{:x} is present on t without range 0x{:x}~0x{:x}, reading ... -- {}",
+ t, extent->get_type(), extent->get_paddr(), extent->get_length(),
+ partial_off, partial_len, *extent);
+ const auto t_src = t.get_src();
+ extent_access_stats_t& access_stats = get_by_ext(
+ get_by_src(stats.access_by_src_ext, t_src),
+ extent->get_type());
+ ++access_stats.load_present;
+ ++stats.access.s.load_present;
+ return trans_intr::make_interruptible(
+ do_read_extent_maybe_partial(
+ std::move(extent), partial_off, partial_len, &t_src));
+ } else {
+ // TODO(implement fine-grained-wait):
+ // the range might be already loaded, but we don't know
+ return trans_intr::make_interruptible(
+ extent->wait_io()
+ ).then_interruptible([extent] {
+ return get_extent_iertr::make_ready_future<TCachedExtentRef<T>>(extent);
+ });
+ }
+ }
+
extent_len_t get_block_size() const {
return epm.get_block_size();
}
@@ -628,54 +588,122 @@ public:
}
private:
+ using get_extent_ertr = base_ertr;
+ template <typename T>
+ using read_extent_ret = get_extent_ertr::future<TCachedExtentRef<T>>;
+ /// Implements exclusive call to read_extent() for the extent
+ template <typename T>
+ read_extent_ret<T> do_read_extent_maybe_partial(
+ TCachedExtentRef<T>&& extent,
+ extent_len_t partial_off,
+ extent_len_t partial_len,
+ const Transaction::src_t* p_src)
+ {
+ LOG_PREFIX(Cache::do_read_extent_maybe_partial);
+ // Each of the following must be atomic:
+ // 1. checking for the missing range and waiting for io
+ // 2. checking for the missing range and reading
+ // because extents in the Cache can be accessed concurrently
+ //
+ // TODO(implement fine-grained-wait)
+ assert(!extent->is_range_loaded(partial_off, partial_len));
+ assert(!extent->is_mutable());
+ if (extent->is_pending_io()) {
+ std::optional<Transaction::src_t> src;
+ if (p_src) {
+ src = *p_src;
+ }
+ auto* p_extent = extent.get();
+ return p_extent->wait_io(
+ ).then([extent=std::move(extent), partial_off, partial_len, this, FNAME, src]() mutable
+ -> read_extent_ret<T> {
+ if (extent->is_range_loaded(partial_off, partial_len)) {
+ SUBDEBUG(seastore_cache,
+ "{} {}~0x{:x} got range 0x{:x}~0x{:x} ... -- {}",
+ extent->get_type(), extent->get_paddr(), extent->get_length(),
+ partial_off, partial_len, *extent);
+ // we don't know whether the target range is loading or not
+ if (extent->is_pending_io()) {
+ auto* p_extent = extent.get();
+ return p_extent->wait_io(
+ ).then([extent=std::move(extent)]() mutable {
+ return seastar::make_ready_future<TCachedExtentRef<T>>(std::move(extent));
+ });
+ } else {
+ return seastar::make_ready_future<TCachedExtentRef<T>>(std::move(extent));
+ }
+ } else { // range not loaded
+ SUBDEBUG(seastore_cache,
+ "{} {}~0x{:x} without range 0x{:x}~0x{:x} ... -- {}",
+ extent->get_type(), extent->get_paddr(), extent->get_length(),
+ partial_off, partial_len, *extent);
+ Transaction::src_t* p_src = (src.has_value() ? &src.value() : nullptr);
+ return do_read_extent_maybe_partial(
+ std::move(extent), partial_off, partial_len, p_src);
+ }
+ });
+ } else {
+ SUBDEBUG(seastore_cache,
+ "{} {}~0x{:x} is not pending without range 0x{:x}~0x{:x}, reading ... -- {}",
+ extent->get_type(), extent->get_paddr(), extent->get_length(),
+ partial_off, partial_len, *extent);
+ return read_extent<T>(
+ std::move(extent), partial_off, partial_len, p_src);
+ }
+ }
+
/**
* do_get_caching_extent
*
* returns ref to extent at offset~length of type T either from
* - extent_set if already in cache
* - disk
+ * Only the range partial_off~partial_len is loaded.
*/
using src_ext_t = std::pair<Transaction::src_t, extent_types_t>;
template <typename T, typename Func, typename OnCache>
read_extent_ret<T> do_get_caching_extent(
paddr_t offset, ///< [in] starting addr
extent_len_t length, ///< [in] length
+ extent_len_t partial_off, ///< [in] offset of piece in extent
+ extent_len_t partial_len, ///< [in] length of piece in extent
Func &&extent_init_func, ///< [in] init func for extent
- OnCache &&on_cache
+ OnCache &&on_cache,
+ const Transaction::src_t* p_src
) {
LOG_PREFIX(Cache::do_get_caching_extent);
auto cached = query_cache(offset);
if (!cached) {
- auto ret = CachedExtent::make_cached_extent_ref<T>(
- alloc_cache_buf(length));
+ // partial read
+ TCachedExtentRef<T> ret = CachedExtent::make_cached_extent_ref<T>(length);
ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
offset,
PLACEMENT_HINT_NULL,
NULL_GENERATION,
TRANS_ID_NULL);
SUBDEBUG(seastore_cache,
- "{} {}~{} is absent, add extent and reading ... -- {}",
- T::TYPE, offset, length, *ret);
+ "{} {}~0x{:x} is absent, add extent and reading range 0x{:x}~0x{:x} ... -- {}",
+ T::TYPE, offset, length, partial_off, partial_len, *ret);
add_extent(ret);
// touch_extent() should be included in on_cache
on_cache(*ret);
extent_init_func(*ret);
return read_extent<T>(
- std::move(ret));
+ std::move(ret), partial_off, partial_len, p_src);
}
// extent PRESENT in cache
if (is_retired_placeholder_type(cached->get_type())) {
- auto ret = CachedExtent::make_cached_extent_ref<T>(
- alloc_cache_buf(length));
+ // partial read
+ TCachedExtentRef<T> ret = CachedExtent::make_cached_extent_ref<T>(length);
ret->init(CachedExtent::extent_state_t::CLEAN_PENDING,
offset,
PLACEMENT_HINT_NULL,
NULL_GENERATION,
TRANS_ID_NULL);
SUBDEBUG(seastore_cache,
- "{} {}~{} is absent(placeholder), reading ... -- {}",
- T::TYPE, offset, length, *ret);
+ "{} {}~0x{:x} is absent(placeholder), add extent and reading range 0x{:x}~0x{:x} ... -- {}",
+ T::TYPE, offset, length, partial_off, partial_len, *ret);
extents_index.replace(*ret, *cached);
on_cache(*ret);
@@ -688,34 +716,41 @@ private:
cached->state = CachedExtent::extent_state_t::INVALID;
extent_init_func(*ret);
return read_extent<T>(
- std::move(ret));
- } else if (!cached->is_fully_loaded()) {
- auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
- on_cache(*ret);
- SUBDEBUG(seastore_cache,
- "{} {}~{} is present without been fully loaded, reading ... -- {}",
- T::TYPE, offset, length, *ret);
- auto bp = alloc_cache_buf(length);
- ret->set_bptr(std::move(bp));
- return read_extent<T>(
- std::move(ret));
- } else {
+ std::move(ret), partial_off, partial_len, p_src);
+ }
+
+ auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
+ on_cache(*ret);
+ if (ret->is_range_loaded(partial_off, partial_len)) {
SUBTRACE(seastore_cache,
- "{} {}~{} is present in cache -- {}",
- T::TYPE, offset, length, *cached);
- auto ret = TCachedExtentRef<T>(static_cast<T*>(cached.get()));
- on_cache(*ret);
- return ret->wait_io(
- ).then([ret=std::move(ret)]() mutable
- -> read_extent_ret<T> {
+ "{} {}~0x{:x} is present with range 0x{:x}~0x{:x} ... -- {}",
+ T::TYPE, offset, length, partial_off, partial_len, *ret);
+ return ret->wait_io().then([ret] {
// ret may be invalid, caller must check
- return read_extent_ret<T>(
- get_extent_ertr::ready_future_marker{},
- std::move(ret));
+ return seastar::make_ready_future<TCachedExtentRef<T>>(ret);
});
+ } else {
+ SUBDEBUG(seastore_cache,
+ "{} {}~0x{:x} is present without range 0x{:x}~0x{:x}, reading ... -- {}",
+ T::TYPE, offset, length, partial_off, partial_len, *ret);
+ return do_read_extent_maybe_partial(
+ std::move(ret), partial_off, partial_len, p_src);
}
}
+ /// Whole-extent convenience overload.
+ ///
+ /// Forwards to the do_get_caching_extent<T>() overload that takes two extra
+ /// arguments after length, passing 0 and length for them (presumably
+ /// partial_off/partial_len -- that signature is outside this hunk, confirm),
+ /// i.e. the complete offset~length extent is requested.
+ template <typename T, typename Func, typename OnCache>
+ read_extent_ret<T> do_get_caching_extent(
+ paddr_t offset, ///< [in] starting addr
+ extent_len_t length, ///< [in] length
+ Func &&extent_init_func, ///< [in] init func for extent
+ OnCache &&on_cache, ///< [in] perfectly forwarded to the callee
+ const Transaction::src_t* p_src ///< [in] passed through unchanged
+ ) {
+ return do_get_caching_extent<T>(offset, length, 0, length,
+ std::forward<Func>(extent_init_func),
+ std::forward<OnCache>(on_cache),
+ p_src);
+ }
// This is a workaround std::move_only_function not being available,
// not really worth generalizing at this time.
@@ -751,8 +786,8 @@ private:
laddr_t laddr,
extent_len_t length,
extent_init_func_t &&extent_init_func,
- extent_init_func_t &&on_cache
- );
+ extent_init_func_t &&on_cache,
+ const Transaction::src_t* p_src);
/**
* get_caching_extent_by_type
@@ -774,40 +809,39 @@ private:
extent_init_func_t &&extent_init_func
) {
LOG_PREFIX(Cache::get_caching_extent_by_type);
+ const auto t_src = t.get_src();
CachedExtentRef ret;
auto status = t.get_extent(offset, &ret);
if (status == Transaction::get_extent_ret::RETIRED) {
- SUBERRORT(seastore_cache, "{} {}~{} {} is retired on t -- {}",
+ SUBERRORT(seastore_cache, "{} {}~0x{:x} {} is retired on t -- {}",
t, type, offset, length, laddr, *ret);
ceph_abort("impossible");
} else if (status == Transaction::get_extent_ret::PRESENT) {
+ assert(ret->get_length() == length);
if (ret->is_fully_loaded()) {
- SUBTRACET(seastore_cache, "{} {}~{} {} is present on t -- {}",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is present on t -- {}",
t, type, offset, length, laddr, *ret);
return ret->wait_io().then([ret] {
return seastar::make_ready_future<CachedExtentRef>(ret);
});
} else {
- assert(!ret->is_mutable());
- SUBDEBUGT(seastore_cache, "{} {}~{} {} is present on t without been \
- fully loaded, reading ...", t, type, offset, length, laddr);
- auto bp = alloc_cache_buf(ret->get_length());
- ret->set_bptr(std::move(bp));
- return read_extent<CachedExtent>(
- std::move(ret));
+ SUBDEBUGT(seastore_cache,
+ "{} {}~0x{:x} {} is present on t without fully loaded, reading ... -- {}",
+ t, type, offset, length, laddr, *ret);
+ return do_read_extent_maybe_partial<CachedExtent>(
+ std::move(ret), 0, length, &t_src);
}
} else {
- SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is absent on t, query cache ...",
t, type, offset, length, laddr);
- auto f = [&t, this](CachedExtent &ext) {
+ auto f = [&t, this, t_src](CachedExtent &ext) {
t.add_to_read_set(CachedExtentRef(&ext));
- const auto t_src = t.get_src();
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent_by_type(
type, offset, laddr, length,
- std::move(extent_init_func), std::move(f))
+ std::move(extent_init_func), std::move(f), &t_src)
);
}
}
@@ -831,12 +865,12 @@ private:
}
#endif
- SUBTRACET(seastore_cache, "{} {}~{} {} is absent on t, query cache ...",
+ SUBTRACET(seastore_cache, "{} {}~0x{:x} {} is absent on t, query cache ...",
t, type, offset, length, laddr);
- auto f = [&t, this](CachedExtent &ext) {
+ const auto t_src = t.get_src();
+ auto f = [&t, this, t_src](CachedExtent &ext) {
// FIXME: assert(ext.is_stable_clean());
assert(ext.is_stable());
- const auto t_src = t.get_src();
extent_access_stats_t& access_stats = get_by_ext(
get_by_src(stats.access_by_src_ext, t_src),
ext.get_type());
@@ -844,12 +878,12 @@ private:
++stats.access.s.load_absent;
t.add_to_read_set(CachedExtentRef(&ext));
- touch_extent(ext, &t_src);
+ touch_extent(ext, &t_src, t.get_cache_hint());
};
return trans_intr::make_interruptible(
do_get_caching_extent_by_type(
type, offset, laddr, length,
- std::move(extent_init_func), std::move(f))
+ std::move(extent_init_func), std::move(f), &t_src)
);
}
@@ -871,7 +905,7 @@ private:
for (auto it = start_iter;
it != end_iter;
it++) {
- res.emplace(it->paddr, it->laddr, it->len, it->type, it->seq);
+ res.emplace(it->paddr, it->laddr, it->len, it->type);
}
return res;
}
@@ -970,7 +1004,7 @@ public:
#endif
) {
LOG_PREFIX(Cache::alloc_new_non_data_extent);
- SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ SUBTRACET(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}",
t, T::TYPE, length, hint, rewrite_gen_printer_t{gen});
#ifdef UNIT_TESTS_BUILT
auto result = epm.alloc_new_non_data_extent(t, T::TYPE, length, hint, gen, epaddr);
@@ -978,7 +1012,8 @@ public:
auto result = epm.alloc_new_non_data_extent(t, T::TYPE, length, hint, gen);
#endif
if (!result) {
- return nullptr;
+ SUBERRORT(seastore_cache, "insufficient space", t);
+ std::rethrow_exception(crimson::ct_error::enospc::exception_ptr());
}
auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result->bp));
ret->init(CachedExtent::extent_state_t::INITIAL_WRITE_PENDING,
@@ -988,7 +1023,7 @@ public:
t.get_trans_id());
t.add_fresh_extent(ret);
SUBDEBUGT(seastore_cache,
- "allocated {} {}B extent at {}, hint={}, gen={} -- {}",
+ "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}",
t, T::TYPE, length, result->paddr,
hint, rewrite_gen_printer_t{result->gen}, *ret);
return ret;
@@ -1012,13 +1047,17 @@ public:
#endif
) {
LOG_PREFIX(Cache::alloc_new_data_extents);
- SUBTRACET(seastore_cache, "allocate {} {}B, hint={}, gen={}",
+ SUBTRACET(seastore_cache, "allocate {} 0x{:x}B, hint={}, gen={}",
t, T::TYPE, length, hint, rewrite_gen_printer_t{gen});
#ifdef UNIT_TESTS_BUILT
auto results = epm.alloc_new_data_extents(t, T::TYPE, length, hint, gen, epaddr);
#else
auto results = epm.alloc_new_data_extents(t, T::TYPE, length, hint, gen);
#endif
+ if (results.empty()) {
+ SUBERRORT(seastore_cache, "insufficient space", t);
+ std::rethrow_exception(crimson::ct_error::enospc::exception_ptr());
+ }
std::vector<TCachedExtentRef<T>> extents;
for (auto &result : results) {
auto ret = CachedExtent::make_cached_extent_ref<T>(std::move(result.bp));
@@ -1029,7 +1068,7 @@ public:
t.get_trans_id());
t.add_fresh_extent(ret);
SUBDEBUGT(seastore_cache,
- "allocated {} {}B extent at {}, hint={}, gen={} -- {}",
+ "allocated {} 0x{:x}B extent at {}, hint={}, gen={} -- {}",
t, T::TYPE, length, result.paddr,
hint, rewrite_gen_printer_t{result.gen}, *ret);
extents.emplace_back(std::move(ret));
@@ -1063,7 +1102,7 @@ public:
// (relative/temp) paddr, so make extent directly
ext = CachedExtent::make_cached_extent_ref<T>(std::move(nbp));
} else {
- ext = CachedExtent::make_placeholder_cached_extent_ref<T>(remap_length);
+ ext = CachedExtent::make_cached_extent_ref<T>(remap_length);
}
ext->init(CachedExtent::extent_state_t::EXIST_CLEAN,
@@ -1075,7 +1114,7 @@ public:
auto extent = ext->template cast<T>();
extent->set_laddr(remap_laddr);
t.add_fresh_extent(ext);
- SUBTRACET(seastore_cache, "allocated {} {}B, hint={}, has ptr? {} -- {}",
+ SUBTRACET(seastore_cache, "allocated {} 0x{:x}B, hint={}, has ptr? {} -- {}",
t, T::TYPE, remap_length, remap_laddr, original_bptr.has_value(), *extent);
return extent;
}
@@ -1218,7 +1257,7 @@ public:
{
LOG_PREFIX(Cache::init_cached_extents);
SUBINFOT(seastore_cache,
- "start with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
+ "start with {}(0x{:x}B) extents, {} dirty, dirty_from={}, alloc_from={}",
t,
extents_index.size(),
extents_index.get_bytes(),
@@ -1261,7 +1300,7 @@ public:
}
).si_then([this, FNAME, &t] {
SUBINFOT(seastore_cache,
- "finish with {}({}B) extents, {} dirty, dirty_from={}, alloc_from={}",
+ "finish with {}(0x{:x}B) extents, {} dirty, dirty_from={}, alloc_from={}",
t,
extents_index.size(),
extents_index.get_bytes(),
@@ -1435,11 +1474,10 @@ private:
/// Update lru for access to ref
void touch_extent(
CachedExtent &ext,
- const Transaction::src_t* p_src)
+ const Transaction::src_t* p_src,
+ cache_hint_t hint)
{
- if (p_src &&
- is_background_transaction(*p_src) &&
- is_logical_type(ext.get_type())) {
+ if (hint == CACHE_HINT_NOCACHE && is_logical_type(ext.get_type())) {
return;
}
if (ext.is_stable_clean() && !ext.is_placeholder()) {
@@ -1530,22 +1568,29 @@ private:
assert(extent.is_stable_clean() && !extent.is_placeholder());
assert(extent.primary_ref_list_hook.is_linked());
assert(lru.size() > 0);
- auto extent_length = extent.get_length();
- assert(current_size >= extent_length);
+ auto extent_loaded_length = extent.get_loaded_length();
+ assert(current_size >= extent_loaded_length);
lru.erase(lru.s_iterator_to(extent));
- current_size -= extent_length;
- get_by_ext(sizes_by_ext, extent.get_type()).account_out(extent_length);
- overall_io.out_sizes.account_in(extent_length);
+ current_size -= extent_loaded_length;
+ get_by_ext(sizes_by_ext, extent.get_type()).account_out(extent_loaded_length);
+ overall_io.out_sizes.account_in(extent_loaded_length);
if (p_src) {
get_by_ext(
get_by_src(trans_io_by_src_ext, *p_src),
extent.get_type()
- ).out_sizes.account_in(extent_length);
+ ).out_sizes.account_in(extent_loaded_length);
}
intrusive_ptr_release(&extent);
}
+ /// Evict extents until current_size no longer exceeds capacity.
+ ///
+ /// Victims are taken from lru.front(); touched extents are pushed to the
+ /// back of the list, so the front is the opposite end. p_src is handed to
+ /// do_remove_from_lru() for every eviction.
+ void trim_to_capacity(
+ const Transaction::src_t* p_src) {
+ // NOTE(review): the loop relies on do_remove_from_lru() shrinking
+ // current_size on each call -- confirm against its definition
+ while (current_size > capacity) {
+ do_remove_from_lru(lru.front(), p_src);
+ }
+ }
+
public:
LRU(size_t capacity) : capacity(capacity) {}
@@ -1579,31 +1624,55 @@ private:
const Transaction::src_t* p_src) {
assert(extent.is_stable_clean() && !extent.is_placeholder());
- auto extent_length = extent.get_length();
+ auto extent_loaded_length = extent.get_loaded_length();
if (extent.primary_ref_list_hook.is_linked()) {
// present, move to top (back)
assert(lru.size() > 0);
- assert(current_size >= extent_length);
+ assert(current_size >= extent_loaded_length);
lru.erase(lru.s_iterator_to(extent));
lru.push_back(extent);
} else {
// absent, add to top (back)
- current_size += extent_length;
- get_by_ext(sizes_by_ext, extent.get_type()).account_in(extent_length);
- overall_io.in_sizes.account_in(extent_length);
+ if (extent_loaded_length > 0) {
+ current_size += extent_loaded_length;
+ get_by_ext(sizes_by_ext, extent.get_type()).account_in(extent_loaded_length);
+ overall_io.in_sizes.account_in(extent_loaded_length);
+ if (p_src) {
+ get_by_ext(
+ get_by_src(trans_io_by_src_ext, *p_src),
+ extent.get_type()
+ ).in_sizes.account_in(extent_loaded_length);
+ }
+ } // else: the extent isn't loaded upon touch_extent()/on_cache(),
+ // account the io later in increase_cached_size() upon read_extent()
+ intrusive_ptr_add_ref(&extent);
+ lru.push_back(extent);
+
+ trim_to_capacity(p_src);
+ }
+ }
+
+ void increase_cached_size(
+ CachedExtent &extent,
+ extent_len_t increased_length,
+ const Transaction::src_t* p_src) {
+ assert(!extent.is_mutable());
+
+ if (extent.primary_ref_list_hook.is_linked()) {
+ assert(extent.is_stable_clean() && !extent.is_placeholder());
+ // present, increase size
+ assert(lru.size() > 0);
+ current_size += increased_length;
+ get_by_ext(sizes_by_ext, extent.get_type()).account_in(increased_length);
+ overall_io.in_sizes.account_in(increased_length);
if (p_src) {
get_by_ext(
get_by_src(trans_io_by_src_ext, *p_src),
extent.get_type()
- ).in_sizes.account_in(extent_length);
+ ).in_sizes.account_in(increased_length);
}
- intrusive_ptr_add_ref(&extent);
- lru.push_back(extent);
- // trim to capacity
- while (current_size > capacity) {
- do_remove_from_lru(lru.front(), p_src);
- }
+ trim_to_capacity(nullptr);
}
}
@@ -1758,18 +1827,23 @@ private:
seastar::metrics::metric_group metrics;
void register_metrics();
- /// alloc buffer for cached extent
- bufferptr alloc_cache_buf(size_t size) {
- // TODO: memory pooling etc
- auto bp = ceph::bufferptr(
- buffer::create_page_aligned(size));
- bp.zero();
- return bp;
+ /// Insert every entry of backref_entries into backref_entry_mset.
+ ///
+ /// The list is taken by non-const lvalue reference and each entry is only
+ /// dereferenced, so the list itself is left intact for the caller.
+ void apply_backref_mset(
+ backref_entry_refs_t& backref_entries) {
+ for (auto& entry : backref_entries) {
+ backref_entry_mset.insert(*entry);
+ }
}
- void backref_batch_update(
- std::vector<backref_entry_ref> &&,
- const journal_seq_t &);
+ void apply_backref_byseq(
+ backref_entry_refs_t&& backref_entries,
+ const journal_seq_t& seq);
+
+ /// Apply backref_entries to both backref bookkeeping structures:
+ /// first via apply_backref_mset(), then via apply_backref_byseq() with seq.
+ void commit_backref_entries(
+ backref_entry_refs_t&& backref_entries,
+ const journal_seq_t& seq) {
+ // must run before the move below: it still reads backref_entries
+ apply_backref_mset(backref_entries);
+ // hands the list over; backref_entries must not be used afterwards
+ apply_backref_byseq(std::move(backref_entries), seq);
+ }
/// Add extent to extents handling dirty and refcounting
///
@@ -1819,39 +1893,74 @@ private:
/// Introspect transaction when it is being destructed
void on_transaction_destruct(Transaction& t);
+ /// Read the extent in range offset~length,
+ /// must be called exclusively for an extent,
+ /// also see do_read_extent_maybe_partial().
+ ///
+ /// May return an invalid extent due to transaction conflict.
template <typename T>
read_extent_ret<T> read_extent(
- TCachedExtentRef<T>&& extent
+ TCachedExtentRef<T>&& extent,
+ extent_len_t offset,
+ extent_len_t length,
+ const Transaction::src_t* p_src
) {
+ LOG_PREFIX(Cache::read_extent);
assert(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING ||
- extent->state == CachedExtent::extent_state_t::EXIST_CLEAN ||
- extent->state == CachedExtent::extent_state_t::CLEAN);
+ extent->state == CachedExtent::extent_state_t::EXIST_CLEAN ||
+ extent->state == CachedExtent::extent_state_t::CLEAN);
+ assert(!extent->is_range_loaded(offset, length));
+ assert(is_aligned(offset, get_block_size()));
+ assert(is_aligned(length, get_block_size()));
extent->set_io_wait();
- return epm.read(
- extent->get_paddr(),
- extent->get_length(),
- extent->get_bptr()
- ).safe_then(
- [extent=std::move(extent), this]() mutable {
- LOG_PREFIX(Cache::read_extent);
- if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) {
- extent->state = CachedExtent::extent_state_t::CLEAN;
- }
- ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN
- || extent->state == CachedExtent::extent_state_t::CLEAN
- || !extent->is_valid());
- if (extent->is_valid()) {
- // crc will be checked against LBA leaf entry for logical extents,
- // or check against in-extent crc for physical extents.
- if (epm.get_checksum_needed(extent->get_paddr())) {
- extent->last_committed_crc = extent->calc_crc32c();
- } else {
- extent->last_committed_crc = CRC_NULL;
- }
- extent->on_clean_read();
- }
+ auto old_length = extent->get_loaded_length();
+ load_ranges_t to_read = extent->load_ranges(offset, length);
+ auto new_length = extent->get_loaded_length();
+ assert(new_length > old_length);
+ lru.increase_cached_size(*extent, new_length - old_length, p_src);
+ return seastar::do_with(to_read.ranges, [extent, this, FNAME](auto &read_ranges) {
+ return ExtentPlacementManager::read_ertr::parallel_for_each(
+ read_ranges, [extent, this, FNAME](auto &read_range) {
+ SUBDEBUG(seastore_cache, "reading extent {} 0x{:x}~0x{:x} ...",
+ extent->get_paddr(), read_range.offset, read_range.get_length());
+ assert(is_aligned(read_range.offset, get_block_size()));
+ assert(is_aligned(read_range.get_length(), get_block_size()));
+ return epm.read(
+ extent->get_paddr() + read_range.offset,
+ read_range.get_length(),
+ read_range.ptr);
+ });
+ }).safe_then(
+ [this, FNAME, extent=std::move(extent), offset, length]() mutable {
+ if (likely(extent->state == CachedExtent::extent_state_t::CLEAN_PENDING)) {
+ extent->state = CachedExtent::extent_state_t::CLEAN;
+ }
+ ceph_assert(extent->state == CachedExtent::extent_state_t::EXIST_CLEAN
+ || extent->state == CachedExtent::extent_state_t::CLEAN
+ || !extent->is_valid());
+ if (extent->is_valid()) {
+ if (extent->is_fully_loaded()) {
+ // crc will be checked against LBA leaf entry for logical extents,
+ // or check against in-extent crc for physical extents.
+ if (epm.get_checksum_needed(extent->get_paddr())) {
+ extent->last_committed_crc = extent->calc_crc32c();
+ } else {
+ extent->last_committed_crc = CRC_NULL;
+ }
+ // on_clean_read() may change the content, call after calc_crc32c()
+ extent->on_clean_read();
+ SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done -- {}",
+ offset, length, *extent);
+ } else {
+ extent->last_committed_crc = CRC_NULL;
+ SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (partial) -- {}",
+ offset, length, *extent);
+ }
+ } else {
+ SUBDEBUG(seastore_cache, "read extent 0x{:x}~0x{:x} done (invalidated) -- {}",
+ offset, length, *extent);
+ }
extent->complete_io();
- SUBDEBUG(seastore_cache, "read extent done -- {}", *extent);
return get_extent_ertr::make_ready_future<TCachedExtentRef<T>>(
std::move(extent));
},
diff --git a/src/crimson/os/seastore/cached_extent.cc b/src/crimson/os/seastore/cached_extent.cc
index 76c18bde667..49fede1d9a8 100644
--- a/src/crimson/os/seastore/cached_extent.cc
+++ b/src/crimson/os/seastore/cached_extent.cc
@@ -7,6 +7,7 @@
#include "crimson/common/log.h"
#include "crimson/os/seastore/btree/fixed_kv_node.h"
+#include "crimson/os/seastore/lba_mapping.h"
namespace {
[[maybe_unused]] seastar::logger& logger() {
@@ -38,12 +39,6 @@ void intrusive_ptr_release(CachedExtent *ptr)
#endif
-bool is_backref_mapped_extent_node(const CachedExtentRef &extent) {
- return extent->is_logical()
- || is_lba_node(extent->get_type())
- || extent->get_type() == extent_types_t::TEST_BLOCK_PHYSICAL;
-}
-
std::ostream &operator<<(std::ostream &out, CachedExtent::extent_state_t state)
{
switch (state) {
@@ -94,15 +89,15 @@ CachedExtent* CachedExtent::get_transactional_view(transaction_id_t tid) {
}
std::ostream &operator<<(std::ostream &out, const parent_tracker_t &tracker) {
- return out << "parent_tracker=" << (void*)&tracker
- << ", parent=" << (void*)tracker.get_parent().get();
+ return out << "tracker_ptr=" << (void*)&tracker
+ << ", parent_ptr=" << (void*)tracker.get_parent().get();
}
std::ostream &ChildableCachedExtent::print_detail(std::ostream &out) const {
if (parent_tracker) {
- out << *parent_tracker;
+ out << ", parent_tracker(" << *parent_tracker << ")";
} else {
- out << ", parent_tracker=" << (void*)nullptr;
+ out << ", parent_tracker(nullptr)";
}
_print_detail(out);
return out;
@@ -148,6 +143,12 @@ void LogicalCachedExtent::on_replace_prior() {
parent->children[off] = this;
}
+/// Set this extent's laddr from mapping: the intermediate base when the
+/// mapping is indirect, otherwise the mapping's own key.
+void LogicalCachedExtent::maybe_set_intermediate_laddr(LBAMapping &mapping) {
+ laddr = mapping.is_indirect()
+ ? mapping.get_intermediate_base()
+ : mapping.get_key();
+}
+
parent_tracker_t::~parent_tracker_t() {
// this is parent's tracker, reset it
auto &p = (FixedKVNode<laddr_t>&)*parent;
@@ -156,30 +157,183 @@ parent_tracker_t::~parent_tracker_t() {
}
}
-std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs)
+bool BufferSpace::is_range_loaded(extent_len_t offset, extent_len_t length) const
{
- out << "LBAMapping(" << rhs.get_key()
- << "~0x" << std::hex << rhs.get_length() << std::dec
- << "->" << rhs.get_val();
- if (rhs.is_indirect()) {
- out << ",indirect(" << rhs.get_intermediate_base()
- << "~0x" << std::hex << rhs.get_intermediate_length()
- << "@0x" << rhs.get_intermediate_offset() << std::dec
- << ")";
+ assert(length > 0);
+ auto i = buffer_map.upper_bound(offset);
+ if (i == buffer_map.begin()) {
+ return false;
}
- out << ")";
- return out;
+ --i;
+ auto& [i_offset, i_bl] = *i;
+ assert(offset >= i_offset);
+ assert(i_bl.length() > 0);
+ if (offset + length > i_offset + i_bl.length()) {
+ return false;
+ } else {
+ return true;
+ }
+}
+
+/// Return a bufferlist covering offset~length.
+///
+/// The range must be non-empty and lie entirely inside one buffer_map entry;
+/// both are only checked with assert(), so callers are expected to test
+/// is_range_loaded() first.
+ceph::bufferlist BufferSpace::get_buffer(extent_len_t offset, extent_len_t length) const
+{
+ assert(length > 0);
+ // upper_bound() yields the first entry starting after offset; stepping
+ // back gives the last entry whose start is <= offset
+ auto i = buffer_map.upper_bound(offset);
+ assert(i != buffer_map.begin());
+ --i;
+ auto& [i_offset, i_bl] = *i;
+ assert(offset >= i_offset);
+ assert(i_bl.length() > 0);
+ // the requested range must not run past the end of that entry
+ assert(offset + length <= i_offset + i_bl.length());
+ ceph::bufferlist res;
+ // offset is extent-relative, substr_of() takes an entry-relative offset
+ res.substr_of(i_bl, offset - i_offset, length);
+ return res;
+}
+
+load_ranges_t BufferSpace::load_ranges(extent_len_t offset, extent_len_t length)
+{
+ assert(length > 0);
+ load_ranges_t ret;
+ auto next = buffer_map.upper_bound(offset);
+
+ // must be assigned for the main-loop
+ map_t::iterator previous;
+ extent_len_t range_offset;
+ extent_len_t range_length;
+
+ // returns whether to proceed main-loop or not
+ auto f_merge_next_check_hole = [this, &next, &range_offset, &range_length](
+ ceph::bufferlist& previous_bl,
+ extent_len_t hole_length,
+ extent_len_t next_offset,
+ const ceph::bufferlist& next_bl) {
+ range_length -= hole_length;
+ previous_bl.append(next_bl);
+ if (range_length <= next_bl.length()) {
+ // "next" ends at or beyond the range end
+ buffer_map.erase(next);
+ return false;
+ } else {
+ range_offset = next_offset + next_bl.length();
+ range_length -= next_bl.length();
+ // erasing "next" destroys next_bl, it must not be used afterwards
+ next = buffer_map.erase(next);
+ return true;
+ }
+ };
+
+ // returns whether to proceed main-loop or not
+ auto f_prepare_without_merge_previous = [
+ this, offset, length,
+ &ret, &previous, &next, &range_length,
+ &f_merge_next_check_hole]() {
+ if (next == buffer_map.end()) {
+ // "next" reaches end,
+ // range has no "next" to merge
+ create_hole_insert_map(ret, offset, length, next);
+ return false;
+ }
+ // "next" is valid
+ auto& [n_offset, n_bl] = *next;
+ // next is from upper_bound()
+ assert(offset < n_offset);
+ extent_len_t hole_length = n_offset - offset;
+ if (length < hole_length) {
+ // "next" is beyond the range end,
+ // range has no "next" to merge
+ create_hole_insert_map(ret, offset, length, next);
+ return false;
+ }
+ // length >= hole_length
+ // insert hole as "previous"
+ previous = create_hole_insert_map(ret, offset, hole_length, next);
+ auto& p_bl = previous->second;
+ range_length = length;
+ return f_merge_next_check_hole(p_bl, hole_length, n_offset, n_bl);
+ };
+
+ /*
+ * prepare main-loop
+ */
+ if (next == buffer_map.begin()) {
+ // "previous" is invalid
+ if (!f_prepare_without_merge_previous()) {
+ return ret;
+ }
+ } else {
+ // "previous" is valid
+ previous = std::prev(next);
+ auto& [p_offset, p_bl] = *previous;
+ assert(offset >= p_offset);
+ extent_len_t p_end = p_offset + p_bl.length();
+ if (offset <= p_end) {
+ // "previous" is adjacent or overlaps the range
+ range_offset = p_end;
+ assert(offset + length > p_end);
+ range_length = offset + length - p_end;
+ // start the main-loop (merge "previous")
+ } else {
+ // "previous" is not adjacent to the range
+ // range and buffer_map should not overlap
+ assert(offset > p_end);
+ if (!f_prepare_without_merge_previous()) {
+ return ret;
+ }
+ }
+ }
+
+ /*
+ * main-loop: merge the range with "previous" and look at "next"
+ *
+ * "previous": the previous buffer_map entry, must be valid, must be mergeable
+ * "next": the next buffer_map entry, maybe end, maybe mergeable
+ * range_offset/length: the current range right after "previous"
+ */
+ assert(std::next(previous) == next);
+ auto& [p_offset, p_bl] = *previous;
+ assert(range_offset == p_offset + p_bl.length());
+ assert(range_length > 0);
+ while (next != buffer_map.end()) {
+ auto& [n_offset, n_bl] = *next;
+ assert(range_offset < n_offset);
+ extent_len_t hole_length = n_offset - range_offset;
+ if (range_length < hole_length) {
+ // "next" offset is beyond the range end
+ break;
+ }
+ // range_length >= hole_length
+ create_hole_append_bl(ret, p_bl, range_offset, hole_length);
+ if (!f_merge_next_check_hole(p_bl, hole_length, n_offset, n_bl)) {
+ return ret;
+ }
+ assert(std::next(previous) == next);
+ assert(range_offset == p_offset + p_bl.length());
+ assert(range_length > 0);
+ }
+ // range has no "next" to merge:
+ // 1. "next" reaches end
+ // 2. "next" offset is beyond the range end
+ create_hole_append_bl(ret, p_bl, range_offset, range_length);
+ return ret;
}
-std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs)
+ceph::bufferptr BufferSpace::to_full_ptr(extent_len_t length)
{
- bool first = true;
- out << '[';
- for (const auto &i: rhs) {
- out << (first ? "" : ",") << *i;
- first = false;
+ assert(length > 0);
+ assert(buffer_map.size() == 1);
+ auto it = buffer_map.begin();
+ auto& [i_off, i_buf] = *it;
+ assert(i_off == 0);
+ if (!i_buf.is_contiguous()) {
+ // Allocate page aligned ptr, also see create_extent_ptr_*()
+ i_buf.rebuild();
}
- return out << ']';
+ assert(i_buf.get_num_buffers() == 1);
+ ceph::bufferptr ptr(i_buf.front());
+ assert(ptr.is_page_aligned());
+ assert(ptr.length() == length);
+ buffer_map.clear();
+ return ptr;
}
}
diff --git a/src/crimson/os/seastore/cached_extent.h b/src/crimson/os/seastore/cached_extent.h
index 6025725aa33..9dc60d719eb 100644
--- a/src/crimson/os/seastore/cached_extent.h
+++ b/src/crimson/os/seastore/cached_extent.h
@@ -6,15 +6,15 @@
#include <iostream>
#include <boost/intrusive/list.hpp>
+#include <boost/intrusive/set.hpp>
#include <boost/intrusive_ptr.hpp>
#include <boost/smart_ptr/intrusive_ref_counter.hpp>
#include "seastar/core/shared_future.hh"
#include "include/buffer.h"
-#include "crimson/common/errorator.h"
-#include "crimson/common/interruptible_future.h"
#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_interruptor.h"
struct btree_lba_manager_test;
struct lba_btree_test;
@@ -23,7 +23,6 @@ struct cache_test_t;
namespace crimson::os::seastore {
-class Transaction;
class CachedExtent;
using CachedExtentRef = boost::intrusive_ptr<CachedExtent>;
class SegmentedAllocator;
@@ -41,6 +40,20 @@ void intrusive_ptr_release(CachedExtent *);
#endif
+// Note: BufferSpace::to_full_ptr() also creates extent ptr.
+
+/// Allocate a page-aligned buffer of len bytes for extent data.
+///
+/// len must be a non-zero multiple of CEPH_PAGE_SIZE (asserted). This
+/// function does not zero the buffer itself; see create_extent_ptr_zero()
+/// for the zero-filled variant.
+inline ceph::bufferptr create_extent_ptr_rand(extent_len_t len) {
+ assert(is_aligned(len, CEPH_PAGE_SIZE));
+ assert(len > 0);
+ return ceph::bufferptr(buffer::create_page_aligned(len));
+}
+
+/// Allocate a buffer via create_extent_ptr_rand(len) and zero-fill it.
+///
+/// The same preconditions on len apply, since they are asserted by
+/// create_extent_ptr_rand().
+inline ceph::bufferptr create_extent_ptr_zero(extent_len_t len) {
+ auto bp = create_extent_ptr_rand(len);
+ bp.zero();
+ return bp;
+}
+
template <typename T>
using TCachedExtentRef = boost::intrusive_ptr<T>;
@@ -155,6 +168,85 @@ struct trans_spec_view_t {
boost::intrusive::compare<cmp_t>>;
};
+/// One contiguous range that still has to be read.
+///
+/// ptr is the destination buffer and offset is the start of the range;
+/// Cache::read_extent() adds offset to the extent's paddr when issuing the
+/// read. The length of the range is the length of ptr.
+struct load_range_t {
+ extent_len_t offset;
+ ceph::bufferptr ptr;
+
+ /// Length of the range, taken from the destination buffer
+ extent_len_t get_length() const {
+ return ptr.length();
+ }
+
+ /// Exclusive end offset of the range (offset + length)
+ extent_len_t get_end() const {
+ extent_len_t end = offset + ptr.length();
+ // trips on an empty ptr, or if the sum wrapped around
+ // (assuming extent_len_t is unsigned -- TODO confirm)
+ assert(end > offset);
+ return end;
+ }
+};
+/// The list of ranges to read, in ascending offset order, together with
+/// their accumulated length.
+struct load_ranges_t {
+ /// sum of ptr.length() over all entries of ranges
+ extent_len_t length = 0;
+ std::list<load_range_t> ranges;
+
+ /// Append the range starting at offset backed by ptr.
+ ///
+ /// ptr must be non-empty, and offset must be strictly greater than the
+ /// end of the last range already pushed, so ranges are neither
+ /// overlapping nor adjacent (both asserted).
+ void push_back(extent_len_t offset, ceph::bufferptr ptr) {
+ assert(ranges.empty() ||
+ (ranges.back().get_end() < offset));
+ assert(ptr.length());
+ length += ptr.length();
+ ranges.push_back({offset, std::move(ptr)});
+ }
+};
+
+/// Manages the buffered pieces of a partially loaded extent.
+///
+/// Pieces live in buffer_map, keyed by their offset within the extent.
+/// load_ranges() adds the buffers for the missing pieces to the map when it
+/// computes what has to be read, i.e. before the caller has filled them.
+class BufferSpace {
+ using map_t = std::map<extent_len_t, ceph::bufferlist>;
+public:
+ BufferSpace() = default;
+
+ /// Returns true if offset~length is fully loaded, i.e. entirely covered
+ /// by a single buffer_map entry; length must be > 0
+ bool is_range_loaded(extent_len_t offset, extent_len_t length) const;
+
+ /// Returns the bufferlist of offset~length; the range must lie within
+ /// one buffer_map entry (asserted)
+ ceph::bufferlist get_buffer(extent_len_t offset, extent_len_t length) const;
+
+ /// Returns the ranges within offset~length that still have to be read,
+ /// and merges them with neighbouring buffer_map entries where possible
+ load_ranges_t load_ranges(extent_len_t offset, extent_len_t length);
+
+ /// Converts to a single ptr when fully loaded: requires exactly one
+ /// buffer_map entry at offset 0 of the given length (asserted), and
+ /// clears buffer_map
+ ceph::bufferptr to_full_ptr(extent_len_t length);
+
+private:
+ // create the read-hole buffer of hole_length bytes, append it to bl
+ // and record it in load_ranges_t at hole_offset
+ static void create_hole_append_bl(
+ load_ranges_t& ret,
+ ceph::bufferlist& bl,
+ extent_len_t hole_offset,
+ extent_len_t hole_length) {
+ ceph::bufferptr hole_ptr = create_extent_ptr_rand(hole_length);
+ // NOTE(review): bl and ret presumably end up referring to the same
+ // memory, so that reading into ret's ptr fills bl -- confirm against
+ // ceph::bufferptr copy semantics
+ bl.append(hole_ptr);
+ ret.push_back(hole_offset, std::move(hole_ptr));
+ }
+
+ // create and insert the read-hole to buffer_map,
+ // and append to load_ranges_t
+ // returns the iterator containing the inserted read-hole
+ auto create_hole_insert_map(
+ load_ranges_t& ret,
+ extent_len_t hole_offset,
+ extent_len_t hole_length,
+ const map_t::const_iterator& next_it) {
+ assert(!buffer_map.contains(hole_offset));
+ ceph::bufferlist bl;
+ create_hole_append_bl(ret, bl, hole_offset, hole_length);
+ // next_it is used as the insertion hint
+ auto it = buffer_map.insert(
+ next_it, std::pair{hole_offset, std::move(bl)});
+ // the new entry must sit immediately before next_it
+ assert(next_it == std::next(it));
+ return it;
+ }
+
+ /// extent offset -> buffer; entries neither overlap nor are contiguous
+ map_t buffer_map;
+};
+
class ExtentIndex;
class CachedExtent
: public boost::intrusive_ref_counter<
@@ -256,6 +348,17 @@ public:
virtual void on_initial_write() {}
/**
+ * on_fully_loaded
+ *
+ * Called when ptr is ready. Normally this should be used to initialize
+ * the extent to be identical to CachedExtent(ptr).
+ *
+ * Note this doesn't mean the content is fully read, use on_clean_read for
+ * this purpose.
+ */
+ virtual void on_fully_loaded() {}
+
+ /**
* on_clean_read
*
* Called after read of initially written extent.
@@ -350,12 +453,12 @@ public:
<< ", modify_time=" << sea_time_point_printer_t{modify_time}
<< ", paddr=" << get_paddr()
<< ", prior_paddr=" << prior_poffset_str
- << std::hex << ", length=0x" << get_length() << std::dec
+ << std::hex << ", length=0x" << get_length()
+ << ", loaded=0x" << get_loaded_length() << std::dec
<< ", state=" << state
<< ", last_committed_crc=" << last_committed_crc
<< ", refcount=" << use_count()
<< ", user_hint=" << user_hint
- << ", fully_loaded=" << is_fully_loaded()
<< ", rewrite_gen=" << rewrite_gen_printer_t{rewrite_generation};
if (state != extent_state_t::INVALID &&
state != extent_state_t::CLEAN_PENDING) {
@@ -537,7 +640,40 @@ public:
/// Return true if extent is fully loaded or is about to be fully loaded (call
/// wait_io() in this case)
bool is_fully_loaded() const {
- return ptr.has_value();
+ if (ptr.has_value()) {
+ // length == 0 iff root
+ assert(length == loaded_length);
+ assert(!buffer_space.has_value());
+ return true;
+ } else { // ptr is std::nullopt
+ assert(length > loaded_length);
+ assert(buffer_space.has_value());
+ return false;
+ }
+ }
+
+ /// Return true if range offset~_length is loaded
+ ///
+ /// offset and _length must be CEPH_PAGE_SIZE aligned, _length must be > 0
+ /// and the range must lie within the extent (all asserted). A fully loaded
+ /// extent has every valid range loaded; otherwise buffer_space decides.
+ bool is_range_loaded(extent_len_t offset, extent_len_t _length) {
+ assert(is_aligned(offset, CEPH_PAGE_SIZE));
+ assert(is_aligned(_length, CEPH_PAGE_SIZE));
+ assert(_length > 0);
+ assert(offset + _length <= length);
+ if (is_fully_loaded()) {
+ return true;
+ }
+ // partially loaded: is_fully_loaded() asserts buffer_space has a value
+ return buffer_space->is_range_loaded(offset, _length);
+ }
+
+ /// Get buffer by given offset and _length.
+ ///
+ /// The range must already be loaded (asserted via is_range_loaded(), which
+ /// also asserts alignment and bounds).
+ ceph::bufferlist get_range(extent_len_t offset, extent_len_t _length) {
+ assert(is_range_loaded(offset, _length));
+ ceph::bufferlist res;
+ if (is_fully_loaded()) {
+ // fully loaded: take the offset~_length slice of the extent's ptr
+ res.append(ceph::bufferptr(get_bptr(), offset, _length));
+ } else {
+ // partially loaded: the pieces are held by buffer_space
+ res = buffer_space->get_buffer(offset, _length);
+ }
+ return res;
}
/**
@@ -553,12 +689,9 @@ public:
return length;
}
+ /// Returns length of partially loaded extent data in cache
extent_len_t get_loaded_length() const {
- if (ptr.has_value()) {
- return ptr->length();
- } else {
- return 0;
- }
+ return loaded_length;
}
/// Returns version, get_version() == 0 iff is_clean()
@@ -697,12 +830,19 @@ private:
*/
journal_seq_t dirty_from_or_retired_at;
- /// cache data contents, std::nullopt if no data in cache
+ /// cache data contents, std::nullopt iff partially loaded
std::optional<ceph::bufferptr> ptr;
- /// disk data length
+ /// disk data length, 0 iff root
extent_len_t length;
+ /// loaded data length, <length iff partially loaded
+ extent_len_t loaded_length;
+
+ /// manager of buffer pieces for ObjectDataBlock
+ /// valid iff partially loaded
+ std::optional<BufferSpace> buffer_space;
+
/// number of deltas since initial write
extent_version_t version = 0;
@@ -748,9 +888,29 @@ protected:
trans_view_set_t retired_transactions;
CachedExtent(CachedExtent &&other) = delete;
- CachedExtent(ceph::bufferptr &&_ptr) : ptr(std::move(_ptr)) {
- length = ptr->length();
+
+ /// construct a fully loaded CachedExtent
+ explicit CachedExtent(ceph::bufferptr &&_ptr)
+ : length(_ptr.length()),
+ loaded_length(_ptr.length()) {
+ ptr = std::move(_ptr);
+
+ assert(ptr->is_page_aligned());
+ assert(length > 0);
+ assert(is_fully_loaded());
+ // must call init() to fully initialize
+ }
+
+ /// construct a partially loaded CachedExtent
+ /// must be identical with CachedExtent(ptr) after on_fully_loaded()
+ explicit CachedExtent(extent_len_t _length)
+ : length(_length),
+ loaded_length(0),
+ buffer_space(std::in_place) {
+ assert(is_aligned(length, CEPH_PAGE_SIZE));
assert(length > 0);
+ assert(!is_fully_loaded());
+ // must call init() to fully initialize
}
/// construct new CachedExtent, will deep copy the buffer
@@ -758,16 +918,20 @@ protected:
: state(other.state),
dirty_from_or_retired_at(other.dirty_from_or_retired_at),
length(other.get_length()),
+ loaded_length(other.get_loaded_length()),
version(other.version),
poffset(other.poffset) {
- assert((length % CEPH_PAGE_SIZE) == 0);
- if (other.is_fully_loaded()) {
- ptr.emplace(buffer::create_page_aligned(length));
- other.ptr->copy_out(0, length, ptr->c_str());
- } else {
- // the extent must be fully loaded before CoW
- assert(length == 0); // in case of root
- }
+ // the extent must be fully loaded before CoW
+ assert(other.is_fully_loaded());
+ assert(is_aligned(length, CEPH_PAGE_SIZE));
+ if (length > 0) {
+ ptr = create_extent_ptr_rand(length);
+ other.ptr->copy_out(0, length, ptr->c_str());
+ } else { // length == 0, must be root
+ ptr = ceph::bufferptr(0);
+ }
+
+ assert(is_fully_loaded());
}
struct share_buffer_t {};
@@ -777,23 +941,35 @@ protected:
dirty_from_or_retired_at(other.dirty_from_or_retired_at),
ptr(other.ptr),
length(other.get_length()),
+ loaded_length(other.get_loaded_length()),
version(other.version),
- poffset(other.poffset) {}
+ poffset(other.poffset) {
+ // the extent must be fully loaded before CoW
+ assert(other.is_fully_loaded());
+ assert(is_aligned(length, CEPH_PAGE_SIZE));
+ assert(length > 0);
+ assert(is_fully_loaded());
+ }
// 0 length is only possible for the RootBlock
- struct zero_length_t {};
- CachedExtent(zero_length_t) : ptr(ceph::bufferptr(0)), length(0) {};
-
- struct retired_placeholder_t{};
- CachedExtent(retired_placeholder_t, extent_len_t _length)
- : state(extent_state_t::CLEAN),
- length(_length) {
- assert(length > 0);
+ struct root_construct_t {};
+ CachedExtent(root_construct_t)
+ : ptr(ceph::bufferptr(0)),
+ length(0),
+ loaded_length(0) {
+ assert(is_fully_loaded());
+ // must call init() to fully initialize
}
- /// no buffer extent, for lazy read
- CachedExtent(extent_len_t _length) : length(_length) {
- assert(length > 0);
+ struct retired_placeholder_construct_t {};
+ CachedExtent(retired_placeholder_construct_t, extent_len_t _length)
+ : state(extent_state_t::CLEAN),
+ length(_length),
+ loaded_length(0),
+ buffer_space(std::in_place) {
+ assert(!is_fully_loaded());
+ assert(is_aligned(length, CEPH_PAGE_SIZE));
+ // must call init() to fully initialize
}
friend class Cache;
@@ -804,9 +980,8 @@ protected:
}
template <typename T>
- static TCachedExtentRef<T> make_placeholder_cached_extent_ref(
- extent_len_t length) {
- return new T(length);
+ static TCachedExtentRef<T> make_cached_extent_ref() {
+ return new T();
}
void reset_prior_instance() {
@@ -869,6 +1044,45 @@ protected:
}
}
+ /// Returns the ranges to load, converting to fully loaded if possible
+ load_ranges_t load_ranges(extent_len_t offset, extent_len_t _length) {
+ assert(is_aligned(offset, CEPH_PAGE_SIZE));
+ assert(is_aligned(_length, CEPH_PAGE_SIZE));
+ assert(_length > 0);
+ assert(offset + _length <= length);
+ assert(!is_fully_loaded());
+
+ if (loaded_length == 0 && _length == length) {
+ assert(offset == 0);
+ // skip rebuilding the buffer from buffer_space
+ ptr = create_extent_ptr_rand(length);
+ loaded_length = _length;
+ buffer_space.reset();
+ assert(is_fully_loaded());
+ on_fully_loaded();
+ load_ranges_t ret;
+ ret.push_back(offset, *ptr);
+ return ret;
+ }
+
+ load_ranges_t ret = buffer_space->load_ranges(offset, _length);
+ loaded_length += ret.length;
+ assert(length >= loaded_length);
+ if (length == loaded_length) {
+ // convert to fully loaded
+ ptr = buffer_space->to_full_ptr(length);
+ buffer_space.reset();
+ assert(is_fully_loaded());
+ on_fully_loaded();
+ // adjust ret since the ptr has been rebuilt
+ for (load_range_t& range : ret.ranges) {
+ auto range_length = range.ptr.length();
+ range.ptr = ceph::bufferptr(*ptr, range.offset, range_length);
+ }
+ }
+ return ret;
+ }
+
friend class crimson::os::seastore::SegmentedAllocator;
friend class crimson::os::seastore::TransactionManager;
friend class crimson::os::seastore::ExtentPlacementManager;
@@ -883,8 +1097,6 @@ protected:
std::ostream &operator<<(std::ostream &, CachedExtent::extent_state_t);
std::ostream &operator<<(std::ostream &, const CachedExtent&);
-bool is_backref_mapped_extent_node(const CachedExtentRef &extent);
-
/// Compare extents by paddr
struct paddr_cmp {
bool operator()(paddr_t lhs, const CachedExtent &rhs) const {
@@ -1067,7 +1279,6 @@ private:
};
class ChildableCachedExtent;
-class LogicalCachedExtent;
class child_pos_t {
public:
@@ -1088,14 +1299,17 @@ private:
uint16_t pos = std::numeric_limits<uint16_t>::max();
};
-using get_child_ertr = crimson::errorator<
- crimson::ct_error::input_output_error>;
+using get_child_iertr = trans_iertr<crimson::errorator<
+ crimson::ct_error::input_output_error>>;
+template <typename T>
+using get_child_ifut = get_child_iertr::future<TCachedExtentRef<T>>;
+
template <typename T>
struct get_child_ret_t {
- std::variant<child_pos_t, get_child_ertr::future<TCachedExtentRef<T>>> ret;
+ std::variant<child_pos_t, get_child_ifut<T>> ret;
get_child_ret_t(child_pos_t pos)
: ret(std::move(pos)) {}
- get_child_ret_t(get_child_ertr::future<TCachedExtentRef<T>> child)
+ get_child_ret_t(get_child_ifut<T> child)
: ret(std::move(child)) {}
bool has_child() const {
@@ -1107,7 +1321,7 @@ struct get_child_ret_t {
return std::get<0>(ret);
}
- get_child_ertr::future<TCachedExtentRef<T>> &get_child_fut() {
+ get_child_ifut<T> &get_child_fut() {
ceph_assert(ret.index() == 1);
return std::get<1>(ret);
}
@@ -1122,48 +1336,18 @@ using PhysicalNodeMappingRef = std::unique_ptr<PhysicalNodeMapping<key_t, val_t>
template <typename key_t, typename val_t>
class PhysicalNodeMapping {
public:
+ PhysicalNodeMapping() = default;
+ PhysicalNodeMapping(const PhysicalNodeMapping&) = delete;
virtual extent_len_t get_length() const = 0;
- virtual extent_types_t get_type() const = 0;
virtual val_t get_val() const = 0;
virtual key_t get_key() const = 0;
- virtual PhysicalNodeMappingRef<key_t, val_t> duplicate() const = 0;
- virtual PhysicalNodeMappingRef<key_t, val_t> refresh_with_pending_parent() {
- ceph_abort("impossible");
- return {};
- }
virtual bool has_been_invalidated() const = 0;
virtual CachedExtentRef get_parent() const = 0;
virtual uint16_t get_pos() const = 0;
- // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h
- virtual bool is_indirect() const { return false; }
- virtual key_t get_intermediate_key() const { return min_max_t<key_t>::null; }
- virtual key_t get_intermediate_base() const { return min_max_t<key_t>::null; }
- virtual extent_len_t get_intermediate_length() const { return 0; }
virtual uint32_t get_checksum() const {
ceph_abort("impossible");
return 0;
}
- // The start offset of the pin, must be 0 if the pin is not indirect
- virtual extent_len_t get_intermediate_offset() const {
- return std::numeric_limits<extent_len_t>::max();
- }
-
- virtual get_child_ret_t<LogicalCachedExtent>
- get_logical_extent(Transaction &t) = 0;
-
- void link_child(ChildableCachedExtent *c) {
- ceph_assert(child_pos);
- child_pos->link_child(c);
- }
-
- // For reserved mappings, the return values are
- // undefined although it won't crash
- virtual bool is_stable() const = 0;
- virtual bool is_data_stable() const = 0;
- virtual bool is_clone() const = 0;
- bool is_zero_reserved() const {
- return !get_val().is_real();
- }
virtual bool is_parent_viewable() const = 0;
virtual bool is_parent_valid() const = 0;
virtual bool parent_modified() const {
@@ -1176,24 +1360,8 @@ public:
}
virtual ~PhysicalNodeMapping() {}
-protected:
- std::optional<child_pos_t> child_pos = std::nullopt;
};
-using LBAMapping = PhysicalNodeMapping<laddr_t, paddr_t>;
-using LBAMappingRef = PhysicalNodeMappingRef<laddr_t, paddr_t>;
-
-std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs);
-
-using lba_pin_list_t = std::list<LBAMappingRef>;
-
-std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);
-
-using BackrefMapping = PhysicalNodeMapping<paddr_t, laddr_t>;
-using BackrefMappingRef = PhysicalNodeMappingRef<paddr_t, laddr_t>;
-
-using backref_pin_list_t = std::list<BackrefMappingRef>;
-
/**
* RetiredExtentPlaceholder
*
@@ -1209,7 +1377,7 @@ class RetiredExtentPlaceholder : public CachedExtent {
public:
RetiredExtentPlaceholder(extent_len_t length)
- : CachedExtent(CachedExtent::retired_placeholder_t{}, length) {}
+ : CachedExtent(CachedExtent::retired_placeholder_construct_t{}, length) {}
CachedExtentRef duplicate_for_write(Transaction&) final {
ceph_assert(0 == "Should never happen for a placeholder");
@@ -1307,6 +1475,8 @@ private:
return out;
}
};
+
+class LBAMapping;
/**
* LogicalCachedExtent
*
@@ -1341,11 +1511,7 @@ public:
laddr = nladdr;
}
- void maybe_set_intermediate_laddr(LBAMapping &mapping) {
- laddr = mapping.is_indirect()
- ? mapping.get_intermediate_base()
- : mapping.get_key();
- }
+ void maybe_set_intermediate_laddr(LBAMapping &mapping);
void apply_delta_and_adjust_crc(
paddr_t base, const ceph::bufferlist &bl) final {
@@ -1445,8 +1611,6 @@ using lextent_list_t = addr_extent_list_base_t<
}
#if FMT_VERSION >= 90000
-template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::CachedExtent> : fmt::ostream_formatter {};
template <> struct fmt::formatter<crimson::os::seastore::LogicalCachedExtent> : fmt::ostream_formatter {};
-template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {};
#endif
diff --git a/src/crimson/os/seastore/collection_manager/collection_flat_node.h b/src/crimson/os/seastore/collection_manager/collection_flat_node.h
index aa1e7135613..1f4de652bba 100644
--- a/src/crimson/os/seastore/collection_manager/collection_flat_node.h
+++ b/src/crimson/os/seastore/collection_manager/collection_flat_node.h
@@ -96,6 +96,8 @@ struct CollectionNode
explicit CollectionNode(ceph::bufferptr &&ptr)
: LogicalCachedExtent(std::move(ptr)) {}
+ explicit CollectionNode(extent_len_t length)
+ : LogicalCachedExtent(length) {}
explicit CollectionNode(const CollectionNode &other)
: LogicalCachedExtent(other),
decoded(other.decoded) {}
diff --git a/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc
index c32dc66619a..866b5bf350c 100644
--- a/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc
+++ b/src/crimson/os/seastore/collection_manager/flat_collection_manager.cc
@@ -51,8 +51,11 @@ FlatCollectionManager::get_coll_root(const coll_root_t &coll_root, Transaction &
cc.t,
coll_root.get_location(),
coll_root.get_size()
- ).si_then([](auto&& e) {
- return get_root_iertr::make_ready_future<CollectionNodeRef>(std::move(e));
+ ).si_then([](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ return get_root_iertr::make_ready_future<CollectionNodeRef>(
+ std::move(maybe_indirect_extent.extent));
});
}
diff --git a/src/crimson/os/seastore/device.cc b/src/crimson/os/seastore/device.cc
index c3bda82a7f6..cc83eb54826 100644
--- a/src/crimson/os/seastore/device.cc
+++ b/src/crimson/os/seastore/device.cc
@@ -12,7 +12,7 @@ namespace crimson::os::seastore {
std::ostream& operator<<(std::ostream& out, const device_spec_t& ds)
{
return out << "device_spec("
- << "magic=" << ds.magic
+ << "magic=0x" << std::hex << ds.magic << std::dec
<< ", dtype=" << ds.dtype
<< ", " << device_id_printer_t{ds.id}
<< ")";
diff --git a/src/crimson/os/seastore/extent_placement_manager.cc b/src/crimson/os/seastore/extent_placement_manager.cc
index 0458fbfed74..fd19eeb7e58 100644
--- a/src/crimson/os/seastore/extent_placement_manager.cc
+++ b/src/crimson/os/seastore/extent_placement_manager.cc
@@ -1069,8 +1069,8 @@ RandomBlockOolWriter::do_write(
w_info.bp = bp;
writes.push_back(w_info);
}
- TRACE("current extent: base off {} len {},\
- maybe-merged current extent: base off {} len {}",
+ TRACE("current extent: {}~0x{:x},\
+ maybe-merged current extent: {}~0x{:x}",
paddr, ex->get_length(), writes.back().offset, writes.back().bp.length());
}
diff --git a/src/crimson/os/seastore/extent_placement_manager.h b/src/crimson/os/seastore/extent_placement_manager.h
index c4e98a5f4a1..4ff9729c5f4 100644
--- a/src/crimson/os/seastore/extent_placement_manager.h
+++ b/src/crimson/os/seastore/extent_placement_manager.h
@@ -236,9 +236,9 @@ struct io_usage_t {
cleaner_usage_t cleaner_usage;
friend std::ostream &operator<<(std::ostream &out, const io_usage_t &usage) {
return out << "io_usage_t("
- << "inline_usage=" << usage.inline_usage
- << ", main_cleaner_usage=" << usage.cleaner_usage.main_usage
- << ", cold_cleaner_usage=" << usage.cleaner_usage.cold_ool_usage
+ << "inline_usage=0x" << std::hex << usage.inline_usage
+ << ", main_cleaner_usage=0x" << usage.cleaner_usage.main_usage
+ << ", cold_cleaner_usage=0x" << usage.cleaner_usage.cold_ool_usage << std::dec
<< ")";
}
};
@@ -371,9 +371,7 @@ public:
// XXX: bp might be extended to point to different memory (e.g. PMem)
// according to the allocator.
- auto bp = ceph::bufferptr(
- buffer::create_page_aligned(length));
- bp.zero();
+ auto bp = create_extent_ptr_zero(length);
return alloc_result_t{addr, std::move(bp), gen};
}
@@ -405,9 +403,7 @@ public:
#ifdef UNIT_TESTS_BUILT
if (unlikely(external_paddr.has_value())) {
assert(external_paddr->is_fake());
- auto bp = ceph::bufferptr(
- buffer::create_page_aligned(length));
- bp.zero();
+ auto bp = create_extent_ptr_zero(length);
allocs.emplace_back(alloc_result_t{*external_paddr, std::move(bp), gen});
} else {
#else
@@ -418,15 +414,17 @@ public:
for (auto &ext : addrs) {
auto left = ext.len;
while (left > 0) {
- auto len = std::min(max_data_allocation_size, left);
- auto bp = ceph::bufferptr(buffer::create_page_aligned(len));
- bp.zero();
+ auto len = left;
+ if (max_data_allocation_size) {
+ len = std::min(max_data_allocation_size, len);
+ }
+ auto bp = create_extent_ptr_zero(len);
auto start = ext.start.is_delayed()
? ext.start
: ext.start + (ext.len - left);
allocs.emplace_back(alloc_result_t{start, std::move(bp), gen});
SUBDEBUGT(seastore_epm,
- "allocated {} {}B extent at {}, hint={}, gen={}",
+ "allocated {} 0x{:x}B extent at {}, hint={}, gen={}",
t, type, len, start, hint, gen);
left -= len;
}
diff --git a/src/crimson/os/seastore/extentmap_manager.cc b/src/crimson/os/seastore/extentmap_manager.cc
deleted file mode 100644
index b0dc1b8c8a8..00000000000
--- a/src/crimson/os/seastore/extentmap_manager.cc
+++ /dev/null
@@ -1,33 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-#include <experimental/iterator>
-#include <iostream>
-
-#include "crimson/os/seastore/transaction_manager.h"
-#include "crimson/os/seastore/extentmap_manager.h"
-#include "crimson/os/seastore/extentmap_manager/btree/btree_extentmap_manager.h"
-namespace crimson::os::seastore::extentmap_manager {
-
-ExtentMapManagerRef create_extentmap_manager(
- TransactionManager &trans_manager) {
- return ExtentMapManagerRef(new BtreeExtentMapManager(trans_manager));
-}
-
-}
-
-namespace crimson::os::seastore {
-
-std::ostream &operator<<(std::ostream &out, const extent_mapping_t &rhs)
-{
- return out << "extent_mapping_t (" << rhs.logical_offset << "~" << rhs.length
- << "->" << rhs.laddr << ")";
-}
-
-std::ostream &operator<<(std::ostream &out, const extent_map_list_t &rhs)
-{
- out << '[';
- std::copy(std::begin(rhs), std::end(rhs), std::experimental::make_ostream_joiner(out, ", "));
- return out << ']';
-}
-
-}
diff --git a/src/crimson/os/seastore/journal.h b/src/crimson/os/seastore/journal.h
index a5c9029c43c..298935bd22e 100644
--- a/src/crimson/os/seastore/journal.h
+++ b/src/crimson/os/seastore/journal.h
@@ -59,13 +59,13 @@ public:
crimson::ct_error::erange,
crimson::ct_error::input_output_error
>;
- using submit_record_ret = submit_record_ertr::future<
- record_locator_t
- >;
- virtual submit_record_ret submit_record(
+ using on_submission_func_t = std::function<
+ void(record_locator_t)>;
+ virtual submit_record_ertr::future<> submit_record(
record_t &&record,
- OrderingHandle &handle
- ) = 0;
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission) = 0;
/**
* flush
@@ -101,9 +101,6 @@ public:
virtual replay_ret replay(
delta_handler_t &&delta_handler) = 0;
- virtual seastar::future<> finish_commit(
- transaction_type_t type) = 0;
-
virtual ~Journal() {}
virtual backend_type_t get_type() = 0;
diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.cc b/src/crimson/os/seastore/journal/circular_bounded_journal.cc
index 9ee8b1b997f..41ff8318aba 100644
--- a/src/crimson/os/seastore/journal/circular_bounded_journal.cc
+++ b/src/crimson/os/seastore/journal/circular_bounded_journal.cc
@@ -58,35 +58,52 @@ CircularBoundedJournal::close_ertr::future<> CircularBoundedJournal::close()
return record_submitter.close();
}
-CircularBoundedJournal::submit_record_ret
+CircularBoundedJournal::submit_record_ertr::future<>
CircularBoundedJournal::submit_record(
record_t &&record,
- OrderingHandle &handle)
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission)
{
LOG_PREFIX(CircularBoundedJournal::submit_record);
DEBUG("H{} {} start ...", (void*)&handle, record);
assert(write_pipeline);
- return do_submit_record(std::move(record), handle);
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission)
+ ).safe_then([this, t_src] {
+ if (is_trim_transaction(t_src)) {
+ return update_journal_tail(
+ trimmer.get_dirty_tail(),
+ trimmer.get_alloc_tail());
+ } else {
+ return seastar::now();
+ }
+ });
}
-CircularBoundedJournal::submit_record_ret
+CircularBoundedJournal::submit_record_ertr::future<>
CircularBoundedJournal::do_submit_record(
record_t &&record,
- OrderingHandle &handle)
+ OrderingHandle &handle,
+ on_submission_func_t &&on_submission)
{
LOG_PREFIX(CircularBoundedJournal::do_submit_record);
if (!record_submitter.is_available()) {
DEBUG("H{} wait ...", (void*)&handle);
return record_submitter.wait_available(
- ).safe_then([this, record=std::move(record), &handle]() mutable {
- return do_submit_record(std::move(record), handle);
+ ).safe_then([this, record=std::move(record), &handle,
+ on_submission=std::move(on_submission)]() mutable {
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
});
}
auto action = record_submitter.check_action(record.size);
if (action == RecordSubmitter::action_t::ROLL) {
return record_submitter.roll_segment(
- ).safe_then([this, record=std::move(record), &handle]() mutable {
- return do_submit_record(std::move(record), handle);
+ ).safe_then([this, record=std::move(record), &handle,
+ on_submission=std::move(on_submission)]() mutable {
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
});
}
@@ -99,13 +116,16 @@ CircularBoundedJournal::do_submit_record(
return handle.enter(write_pipeline->device_submission
).then([submit_fut=std::move(submit_ret.future)]() mutable {
return std::move(submit_fut);
- }).safe_then([FNAME, this, &handle](record_locator_t result) {
+ }).safe_then([FNAME, this, &handle, on_submission=std::move(on_submission)
+ ](record_locator_t result) mutable {
return handle.enter(write_pipeline->finalize
- ).then([FNAME, this, result, &handle] {
+ ).then([FNAME, this, result, &handle,
+ on_submission=std::move(on_submission)] {
DEBUG("H{} finish with {}", (void*)&handle, result);
auto new_committed_to = result.write_result.get_end_seq();
record_submitter.update_committed_to(new_committed_to);
- return result;
+ std::invoke(on_submission, result);
+ return seastar::now();
});
});
}
@@ -392,13 +412,4 @@ Journal::replay_ret CircularBoundedJournal::replay(
});
}
-seastar::future<> CircularBoundedJournal::finish_commit(transaction_type_t type) {
- if (is_trim_transaction(type)) {
- return update_journal_tail(
- trimmer.get_dirty_tail(),
- trimmer.get_alloc_tail());
- }
- return seastar::now();
-}
-
}
diff --git a/src/crimson/os/seastore/journal/circular_bounded_journal.h b/src/crimson/os/seastore/journal/circular_bounded_journal.h
index 874bd8dc086..16278df6cfe 100644
--- a/src/crimson/os/seastore/journal/circular_bounded_journal.h
+++ b/src/crimson/os/seastore/journal/circular_bounded_journal.h
@@ -80,9 +80,11 @@ public:
return backend_type_t::RANDOM_BLOCK;
}
- submit_record_ret submit_record(
+ submit_record_ertr::future<> submit_record(
record_t &&record,
- OrderingHandle &handle
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission
) final;
seastar::future<> flush(
@@ -148,8 +150,6 @@ public:
return cjs.get_records_start();
}
- seastar::future<> finish_commit(transaction_type_t type) final;
-
using cbj_delta_handler_t = std::function<
replay_ertr::future<bool>(
const record_locator_t&,
@@ -160,7 +160,10 @@ public:
cbj_delta_handler_t &&delta_handler,
journal_seq_t tail);
- submit_record_ret do_submit_record(record_t &&record, OrderingHandle &handle);
+ submit_record_ertr::future<> do_submit_record(
+ record_t &&record,
+ OrderingHandle &handle,
+ on_submission_func_t &&on_submission);
void try_read_rolled_header(scan_valid_records_cursor &cursor) {
paddr_t addr = convert_abs_addr_to_paddr(
diff --git a/src/crimson/os/seastore/journal/record_submitter.cc b/src/crimson/os/seastore/journal/record_submitter.cc
index adf8251b8a7..4976eee96e7 100644
--- a/src/crimson/os/seastore/journal/record_submitter.cc
+++ b/src/crimson/os/seastore/journal/record_submitter.cc
@@ -24,7 +24,7 @@ RecordBatch::add_pending(
LOG_PREFIX(RecordBatch::add_pending);
auto new_size = get_encoded_length_after(record, block_size);
auto dlength_offset = pending.size.dlength;
- TRACE("{} batches={}, write_size={}, dlength_offset={} ...",
+ TRACE("{} batches={}, write_size=0x{:x}, dlength_offset=0x{:x} ...",
name,
pending.get_size() + 1,
new_size.get_encoded_length(),
@@ -144,7 +144,7 @@ RecordSubmitter::RecordSubmitter(
batches(new RecordBatch[io_depth + 1])
{
LOG_PREFIX(RecordSubmitter);
- INFO("{} io_depth_limit={}, batch_capacity={}, batch_flush_size={}, "
+ INFO("{} io_depth_limit={}, batch_capacity={}, batch_flush_size=0x{:x}, "
"preferred_fullness={}",
get_name(), io_depth, batch_capacity,
batch_flush_size, preferred_fullness);
diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc
index 11f3cc8fd31..5405662b91e 100644
--- a/src/crimson/os/seastore/journal/segment_allocator.cc
+++ b/src/crimson/os/seastore/journal/segment_allocator.cc
@@ -189,7 +189,7 @@ SegmentAllocator::write(ceph::bufferlist&& to_write)
auto write_length = to_write.length();
auto write_start_offset = written_to;
if (unlikely(LOCAL_LOGGER.is_enabled(seastar::log_level::trace))) {
- TRACE("{} {}~{}", print_name, get_written_to(), write_length);
+ TRACE("{} {}~0x{:x}", print_name, get_written_to(), write_length);
}
assert(write_length > 0);
assert((write_length % get_block_size()) == 0);
@@ -250,7 +250,7 @@ SegmentAllocator::close_segment()
close_seg_info.num_extents};
ceph::bufferlist bl;
encode(tail, bl);
- INFO("{} close segment {}, written_to={}",
+ INFO("{} close segment {}, written_to=0x{:x}",
print_name,
tail,
written_to);
diff --git a/src/crimson/os/seastore/journal/segmented_journal.cc b/src/crimson/os/seastore/journal/segmented_journal.cc
index eca45f113c2..67c0b3fb8ac 100644
--- a/src/crimson/os/seastore/journal/segmented_journal.cc
+++ b/src/crimson/os/seastore/journal/segmented_journal.cc
@@ -368,25 +368,30 @@ seastar::future<> SegmentedJournal::flush(OrderingHandle &handle)
});
}
-SegmentedJournal::submit_record_ret
+SegmentedJournal::submit_record_ertr::future<>
SegmentedJournal::do_submit_record(
record_t &&record,
- OrderingHandle &handle)
+ OrderingHandle &handle,
+ on_submission_func_t &&on_submission)
{
LOG_PREFIX(SegmentedJournal::do_submit_record);
if (!record_submitter.is_available()) {
DEBUG("H{} wait ...", (void*)&handle);
return record_submitter.wait_available(
- ).safe_then([this, record=std::move(record), &handle]() mutable {
- return do_submit_record(std::move(record), handle);
+ ).safe_then([this, record=std::move(record), &handle,
+ on_submission=std::move(on_submission)]() mutable {
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
});
}
auto action = record_submitter.check_action(record.size);
if (action == RecordSubmitter::action_t::ROLL) {
DEBUG("H{} roll, unavailable ...", (void*)&handle);
return record_submitter.roll_segment(
- ).safe_then([this, record=std::move(record), &handle]() mutable {
- return do_submit_record(std::move(record), handle);
+ ).safe_then([this, record=std::move(record), &handle,
+ on_submission=std::move(on_submission)]() mutable {
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
});
} else { // SUBMIT_FULL/NOT_FULL
DEBUG("H{} submit {} ...",
@@ -398,22 +403,27 @@ SegmentedJournal::do_submit_record(
return handle.enter(write_pipeline->device_submission
).then([submit_fut=std::move(submit_ret.future)]() mutable {
return std::move(submit_fut);
- }).safe_then([FNAME, this, &handle](record_locator_t result) {
+ }).safe_then([FNAME, this, &handle, on_submission=std::move(on_submission)
+ ](record_locator_t result) mutable {
return handle.enter(write_pipeline->finalize
- ).then([FNAME, this, result, &handle] {
+ ).then([FNAME, this, result, &handle,
+ on_submission=std::move(on_submission)] {
DEBUG("H{} finish with {}", (void*)&handle, result);
auto new_committed_to = result.write_result.get_end_seq();
record_submitter.update_committed_to(new_committed_to);
- return result;
+ std::invoke(on_submission, result);
+ return seastar::now();
});
});
}
}
-SegmentedJournal::submit_record_ret
+SegmentedJournal::submit_record_ertr::future<>
SegmentedJournal::submit_record(
record_t &&record,
- OrderingHandle &handle)
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission)
{
LOG_PREFIX(SegmentedJournal::submit_record);
DEBUG("H{} {} start ...", (void*)&handle, record);
@@ -424,12 +434,13 @@ SegmentedJournal::submit_record(
).get_encoded_length();
auto max_record_length = journal_segment_allocator.get_max_write_length();
if (expected_size > max_record_length) {
- ERROR("H{} {} exceeds max record size {}",
+ ERROR("H{} {} exceeds max record size 0x{:x}",
(void*)&handle, record, max_record_length);
return crimson::ct_error::erange::make();
}
- return do_submit_record(std::move(record), handle);
+ return do_submit_record(
+ std::move(record), handle, std::move(on_submission));
}
}
diff --git a/src/crimson/os/seastore/journal/segmented_journal.h b/src/crimson/os/seastore/journal/segmented_journal.h
index 891de7ec306..3f51de70fb3 100644
--- a/src/crimson/os/seastore/journal/segmented_journal.h
+++ b/src/crimson/os/seastore/journal/segmented_journal.h
@@ -44,9 +44,11 @@ public:
close_ertr::future<> close() final;
- submit_record_ret submit_record(
+ submit_record_ertr::future<> submit_record(
record_t &&record,
- OrderingHandle &handle) final;
+ OrderingHandle &handle,
+ transaction_type_t t_src,
+ on_submission_func_t &&on_submission) final;
seastar::future<> flush(OrderingHandle &handle) final;
@@ -59,9 +61,6 @@ public:
backend_type_t get_type() final {
return backend_type_t::SEGMENTED;
}
- seastar::future<> finish_commit(transaction_type_t type) {
- return seastar::now();
- }
bool is_checksum_needed() final {
// segmented journal always requires checksum
@@ -69,10 +68,10 @@ public:
}
private:
- submit_record_ret do_submit_record(
+ submit_record_ertr::future<> do_submit_record(
record_t &&record,
- OrderingHandle &handle
- );
+ OrderingHandle &handle,
+ on_submission_func_t &&on_submission);
SegmentSeqAllocatorRef segment_seq_allocator;
SegmentAllocator journal_segment_allocator;
diff --git a/src/crimson/os/seastore/lba_manager.h b/src/crimson/os/seastore/lba_manager.h
index a050b2cdf47..9a34bf56157 100644
--- a/src/crimson/os/seastore/lba_manager.h
+++ b/src/crimson/os/seastore/lba_manager.h
@@ -19,6 +19,7 @@
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/lba_mapping.h"
namespace crimson::os::seastore {
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
index b7a1d8f8ba9..888d3c359ac 100644
--- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.cc
@@ -52,28 +52,22 @@ const get_phy_tree_root_node_ret get_phy_tree_root_node<
ceph_assert(lba_root->is_initial_pending()
== root_block->is_pending());
return {true,
- trans_intr::make_interruptible(
- c.cache.get_extent_viewable_by_trans(c.trans, lba_root))};
+ c.cache.get_extent_viewable_by_trans(c.trans, lba_root)};
} else if (root_block->is_pending()) {
auto &prior = static_cast<RootBlock&>(*root_block->get_prior_instance());
lba_root = prior.lba_root_node;
if (lba_root) {
return {true,
- trans_intr::make_interruptible(
- c.cache.get_extent_viewable_by_trans(c.trans, lba_root))};
+ c.cache.get_extent_viewable_by_trans(c.trans, lba_root)};
} else {
c.cache.account_absent_access(c.trans.get_src());
return {false,
- trans_intr::make_interruptible(
- Cache::get_extent_ertr::make_ready_future<
- CachedExtentRef>())};
+ Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()};
}
} else {
c.cache.account_absent_access(c.trans.get_src());
return {false,
- trans_intr::make_interruptible(
- Cache::get_extent_ertr::make_ready_future<
- CachedExtentRef>())};
+ Cache::get_extent_iertr::make_ready_future<CachedExtentRef>()};
}
}
@@ -100,6 +94,45 @@ void unlink_phy_tree_root_node<laddr_t>(RootBlockRef &root_block) {
namespace crimson::os::seastore::lba_manager::btree {
+get_child_ret_t<LogicalCachedExtent>
+BtreeLBAMapping::get_logical_extent(Transaction &t)
+{
+ ceph_assert(is_parent_viewable());
+ assert(pos != std::numeric_limits<uint16_t>::max());
+ ceph_assert(t.get_trans_id() == ctx.trans.get_trans_id());
+ auto &p = static_cast<LBALeafNode&>(*parent);
+ auto k = this->is_indirect()
+ ? this->get_intermediate_base()
+ : get_key();
+ auto v = p.template get_child<LogicalCachedExtent>(ctx, pos, k);
+ if (!v.has_child()) {
+ this->child_pos = v.get_child_pos();
+ }
+ return v;
+}
+
+bool BtreeLBAMapping::is_stable() const
+{
+ assert(!this->parent_modified());
+ assert(pos != std::numeric_limits<uint16_t>::max());
+ auto &p = static_cast<LBALeafNode&>(*parent);
+ auto k = this->is_indirect()
+ ? this->get_intermediate_base()
+ : get_key();
+ return p.is_child_stable(ctx, pos, k);
+}
+
+bool BtreeLBAMapping::is_data_stable() const
+{
+ assert(!this->parent_modified());
+ assert(pos != std::numeric_limits<uint16_t>::max());
+ auto &p = static_cast<LBALeafNode&>(*parent);
+ auto k = this->is_indirect()
+ ? this->get_intermediate_base()
+ : get_key();
+ return p.is_child_data_stable(ctx, pos, k);
+}
+
BtreeLBAManager::mkfs_ret
BtreeLBAManager::mkfs(
Transaction &t)
diff --git a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
index ef10ff9623b..e0902053d0e 100644
--- a/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
+++ b/src/crimson/os/seastore/lba_manager/btree/btree_lba_manager.h
@@ -23,11 +23,15 @@
#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
#include "crimson/os/seastore/btree/btree_range_pin.h"
+namespace crimson::os::seastore {
+class LogicalCachedExtent;
+}
+
namespace crimson::os::seastore::lba_manager::btree {
struct LBALeafNode;
-class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> {
+class BtreeLBAMapping : public LBAMapping {
// To support cloning, there are two kinds of lba mappings:
// 1. physical lba mapping: the pladdr in the value of which is the paddr of
// the corresponding extent;
@@ -61,14 +65,14 @@ class BtreeLBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> {
// their keys.
public:
BtreeLBAMapping(op_context_t<laddr_t> ctx)
- : BtreeNodeMapping(ctx) {}
+ : LBAMapping(ctx) {}
BtreeLBAMapping(
op_context_t<laddr_t> c,
LBALeafNodeRef parent,
uint16_t pos,
lba_map_val_t &val,
lba_node_meta_t meta)
- : BtreeNodeMapping(
+ : LBAMapping(
c,
parent,
pos,
@@ -190,8 +194,12 @@ public:
SUBDEBUGT(seastore_lba, "new pin {}", ctx.trans, static_cast<LBAMapping&>(*new_pin));
return new_pin;
}
+ bool is_stable() const final;
+ bool is_data_stable() const final;
+ get_child_ret_t<LogicalCachedExtent> get_logical_extent(Transaction &t);
+
protected:
- std::unique_ptr<BtreeNodeMapping<laddr_t, paddr_t>> _duplicate(
+ LBAMappingRef _duplicate(
op_context_t<laddr_t> ctx) const final {
auto pin = std::unique_ptr<BtreeLBAMapping>(new BtreeLBAMapping(ctx));
pin->key = key;
diff --git a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
index ad5d336815b..524bf23dd58 100644
--- a/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
+++ b/src/crimson/os/seastore/lba_manager/btree/lba_btree_node.h
@@ -259,7 +259,7 @@ struct LBALeafNode
}
// See LBAInternalNode, same concept
- void resolve_relative_addrs(paddr_t base);
+ void resolve_relative_addrs(paddr_t base) final;
void node_resolve_vals(
internal_iterator_t from,
internal_iterator_t to) const final
diff --git a/src/crimson/os/seastore/lba_mapping.cc b/src/crimson/os/seastore/lba_mapping.cc
new file mode 100644
index 00000000000..90fae09ce21
--- /dev/null
+++ b/src/crimson/os/seastore/lba_mapping.cc
@@ -0,0 +1,44 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "lba_mapping.h"
+
+namespace crimson::os::seastore {
+
+// Print a mapping as "LBAMapping(<key>~0x<length>-><val>)". Indirect
+// mappings additionally print ",indirect(<base>~0x<length>@0x<offset>)"
+// before the closing parenthesis. Lengths and the offset are written in
+// hex, and the stream is switched back to decimal after each hex run.
+std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs)
+{
+  out << "LBAMapping(" << rhs.get_key() << "~0x";
+  out << std::hex << rhs.get_length() << std::dec;
+  out << "->" << rhs.get_val();
+  if (rhs.is_indirect()) {
+    out << ",indirect(" << rhs.get_intermediate_base() << "~0x";
+    out << std::hex << rhs.get_intermediate_length()
+        << "@0x" << rhs.get_intermediate_offset() << std::dec;
+    out << ")";
+  }
+  return out << ")";
+}
+
+// Print a list of mappings as "[m1,m2,...]"; an empty list prints "[]".
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs)
+{
+  out << '[';
+  // empty before the first element, "," before every later one
+  const char *separator = "";
+  for (const auto &pin : rhs) {
+    out << separator << *pin;
+    separator = ",";
+  }
+  out << ']';
+  return out;
+}
+
+// Create a copy of this mapping: the concrete object comes from
+// _duplicate(ctx), then range, value, parent, len and pos are copied
+// over. child_pos is not assigned by this function.
+LBAMappingRef LBAMapping::duplicate() const
+{
+  LBAMappingRef copy = _duplicate(ctx);
+  copy->range = range;
+  copy->value = value;
+  copy->parent = parent;
+  copy->len = len;
+  copy->pos = pos;
+  return copy;
+}
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/lba_mapping.h b/src/crimson/os/seastore/lba_mapping.h
new file mode 100644
index 00000000000..338d4d53f55
--- /dev/null
+++ b/src/crimson/os/seastore/lba_mapping.h
@@ -0,0 +1,73 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cached_extent.h"
+#include "crimson/os/seastore/btree/btree_range_pin.h"
+
+namespace crimson::os::seastore {
+
+class LBAMapping;
+using LBAMappingRef = std::unique_ptr<LBAMapping>;
+
+class LogicalCachedExtent;
+
+// Abstract interface of an lba mapping (pin), layered on top of
+// BtreeNodeMapping<laddr_t, paddr_t>.
+class LBAMapping : public BtreeNodeMapping<laddr_t, paddr_t> {
+public:
+  LBAMapping(op_context_t<laddr_t> ctx)
+    : BtreeNodeMapping<laddr_t, paddr_t>(ctx) {}
+
+  // Forwards every argument to BtreeNodeMapping; if the parent is not
+  // pending afterwards, {parent, pos} is recorded in child_pos.
+  template <typename... T>
+  LBAMapping(T&&... t)
+    : BtreeNodeMapping<laddr_t, paddr_t>(std::forward<T>(t)...)
+  {
+    if (parent->is_pending()) {
+      return;
+    }
+    child_pos = {parent, pos};
+  }
+
+  // An lba pin may be indirect, see comments in lba_manager/btree/btree_lba_manager.h
+  virtual bool is_indirect() const = 0;
+  virtual laddr_t get_intermediate_key() const = 0;
+  virtual laddr_t get_intermediate_base() const = 0;
+  virtual extent_len_t get_intermediate_length() const = 0;
+  // The start offset of the pin, must be 0 if the pin is not indirect
+  virtual extent_len_t get_intermediate_offset() const = 0;
+
+  virtual get_child_ret_t<LogicalCachedExtent>
+  get_logical_extent(Transaction &t) = 0;
+
+  // Link extent c at the recorded child position; child_pos must be set.
+  void link_child(ChildableCachedExtent *c) {
+    ceph_assert(child_pos);
+    child_pos->link_child(c);
+  }
+  virtual LBAMappingRef refresh_with_pending_parent() = 0;
+
+  // For reserved mappings, the return values are
+  // undefined although it won't crash
+  virtual bool is_stable() const = 0;
+  virtual bool is_data_stable() const = 0;
+  virtual bool is_clone() const = 0;
+  // True when the mapped value is not a real address.
+  bool is_zero_reserved() const {
+    return !get_val().is_real();
+  }
+
+  LBAMappingRef duplicate() const;
+
+  virtual ~LBAMapping() = default;
+protected:
+  // Creates the concrete mapping object used by duplicate().
+  virtual LBAMappingRef _duplicate(op_context_t<laddr_t>) const = 0;
+  // Empty until a child position has been recorded.
+  std::optional<child_pos_t> child_pos;
+};
+
+std::ostream &operator<<(std::ostream &out, const LBAMapping &rhs);
+using lba_pin_list_t = std::list<LBAMappingRef>;
+
+std::ostream &operator<<(std::ostream &out, const lba_pin_list_t &rhs);
+
+} // namespace crimson::os::seastore
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::LBAMapping> : fmt::ostream_formatter {};
+template <> struct fmt::formatter<crimson::os::seastore::lba_pin_list_t> : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/object_data_handler.cc b/src/crimson/os/seastore/object_data_handler.cc
index 20f86da5d3d..8f817a521cf 100644
--- a/src/crimson/os/seastore/object_data_handler.cc
+++ b/src/crimson/os/seastore/object_data_handler.cc
@@ -50,6 +50,8 @@ struct extent_to_write_t {
extent_to_write_t(const extent_to_write_t &) = delete;
extent_to_write_t(extent_to_write_t &&) = default;
+ extent_to_write_t& operator=(const extent_to_write_t&) = delete;
+ extent_to_write_t& operator=(extent_to_write_t&&) = default;
bool is_data() const {
return type == type_t::DATA;
@@ -523,7 +525,7 @@ ObjectDataHandler::write_ret do_insertions(
if (region.is_data()) {
assert_aligned(region.len);
ceph_assert(region.len == region.bl->length());
- DEBUGT("allocating extent: {}~{}",
+ DEBUGT("allocating extent: {}~0x{:x}",
ctx.t,
region.addr,
region.len);
@@ -554,7 +556,7 @@ ObjectDataHandler::write_ret do_insertions(
ObjectDataHandler::write_iertr::pass_further{}
);
} else if (region.is_zero()) {
- DEBUGT("reserving: {}~{}",
+ DEBUGT("reserving: {}~0x{:x}",
ctx.t,
region.addr,
region.len);
@@ -696,7 +698,7 @@ public:
<< ", aligned_data_end=" << overwrite_plan.aligned_data_end
<< ", left_operation=" << overwrite_plan.left_operation
<< ", right_operation=" << overwrite_plan.right_operation
- << ", block_size=" << overwrite_plan.block_size
+ << ", block_size=0x" << std::hex << overwrite_plan.block_size << std::dec
<< ", is_left_fresh=" << overwrite_plan.is_left_fresh
<< ", is_right_fresh=" << overwrite_plan.is_right_fresh
<< ")";
@@ -827,7 +829,7 @@ namespace crimson::os::seastore {
*/
using operate_ret_bare = std::pair<
std::optional<extent_to_write_t>,
- std::optional<bufferptr>>;
+ std::optional<ceph::bufferlist>>;
using operate_ret = get_iertr::future<operate_ret_bare>;
operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan_t &overwrite_plan)
{
@@ -839,19 +841,26 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan
if (overwrite_plan.left_operation == overwrite_operation_t::OVERWRITE_ZERO) {
assert(pin->get_val().is_zero());
+
auto zero_extent_len = overwrite_plan.get_left_extent_size();
assert_aligned(zero_extent_len);
+ std::optional<extent_to_write_t> extent_to_write;
+ if (zero_extent_len != 0) {
+ extent_to_write = extent_to_write_t::create_zero(
+ overwrite_plan.pin_begin, zero_extent_len);
+ }
+
auto zero_prepend_len = overwrite_plan.get_left_alignment_size();
+ std::optional<ceph::bufferlist> prepend_bl;
+ if (zero_prepend_len != 0) {
+ ceph::bufferlist zero_bl;
+ zero_bl.append_zero(zero_prepend_len);
+ prepend_bl = std::move(zero_bl);
+ }
+
return get_iertr::make_ready_future<operate_ret_bare>(
- (zero_extent_len == 0
- ? std::nullopt
- : std::make_optional(extent_to_write_t::create_zero(
- overwrite_plan.pin_begin, zero_extent_len))),
- (zero_prepend_len == 0
- ? std::nullopt
- : std::make_optional(bufferptr(
- ceph::buffer::create(zero_prepend_len, 0))))
- );
+ std::move(extent_to_write),
+ std::move(prepend_bl));
} else if (overwrite_plan.left_operation == overwrite_operation_t::MERGE_EXISTING) {
auto prepend_len = overwrite_plan.get_left_size();
if (prepend_len == 0) {
@@ -859,16 +868,15 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan
std::nullopt,
std::nullopt);
} else {
- extent_len_t off = pin->get_intermediate_offset();
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t, pin->duplicate()
- ).si_then([prepend_len, off](auto left_extent) {
+ ).si_then([prepend_len](auto maybe_indirect_left_extent) {
+ auto read_bl = maybe_indirect_left_extent.get_bl();
+ ceph::bufferlist prepend_bl;
+ prepend_bl.substr_of(read_bl, 0, prepend_len);
return get_iertr::make_ready_future<operate_ret_bare>(
std::nullopt,
- std::make_optional(bufferptr(
- left_extent->get_bptr(),
- off,
- prepend_len)));
+ std::move(prepend_bl));
});
}
} else {
@@ -888,18 +896,17 @@ operate_ret operate_left(context_t ctx, LBAMappingRef &pin, const overwrite_plan
std::move(left_to_write_extent),
std::nullopt);
} else {
- extent_len_t off = pin->get_intermediate_offset();
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t, pin->duplicate()
- ).si_then([prepend_offset=extent_len + off, prepend_len,
+ ).si_then([prepend_offset=extent_len, prepend_len,
left_to_write_extent=std::move(left_to_write_extent)]
- (auto left_extent) mutable {
+ (auto left_maybe_indirect_extent) mutable {
+ auto read_bl = left_maybe_indirect_extent.get_bl();
+ ceph::bufferlist prepend_bl;
+ prepend_bl.substr_of(read_bl, prepend_offset, prepend_len);
return get_iertr::make_ready_future<operate_ret_bare>(
std::move(left_to_write_extent),
- std::make_optional(bufferptr(
- left_extent->get_bptr(),
- prepend_offset,
- prepend_len)));
+ std::move(prepend_bl));
});
}
}
@@ -922,19 +929,26 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla
assert(overwrite_plan.data_end >= right_pin_begin);
if (overwrite_plan.right_operation == overwrite_operation_t::OVERWRITE_ZERO) {
assert(pin->get_val().is_zero());
+
auto zero_suffix_len = overwrite_plan.get_right_alignment_size();
+ std::optional<ceph::bufferlist> suffix_bl;
+ if (zero_suffix_len != 0) {
+ ceph::bufferlist zero_bl;
+ zero_bl.append_zero(zero_suffix_len);
+ suffix_bl = std::move(zero_bl);
+ }
+
auto zero_extent_len = overwrite_plan.get_right_extent_size();
assert_aligned(zero_extent_len);
+ std::optional<extent_to_write_t> extent_to_write;
+ if (zero_extent_len != 0) {
+ extent_to_write = extent_to_write_t::create_zero(
+ overwrite_plan.aligned_data_end, zero_extent_len);
+ }
+
return get_iertr::make_ready_future<operate_ret_bare>(
- (zero_extent_len == 0
- ? std::nullopt
- : std::make_optional(extent_to_write_t::create_zero(
- overwrite_plan.aligned_data_end, zero_extent_len))),
- (zero_suffix_len == 0
- ? std::nullopt
- : std::make_optional(bufferptr(
- ceph::buffer::create(zero_suffix_len, 0))))
- );
+ std::move(extent_to_write),
+ std::move(suffix_bl));
} else if (overwrite_plan.right_operation == overwrite_operation_t::MERGE_EXISTING) {
auto append_len = overwrite_plan.get_right_size();
if (append_len == 0) {
@@ -944,17 +958,17 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla
} else {
auto append_offset =
overwrite_plan.data_end.get_byte_distance<
- extent_len_t>(right_pin_begin)
- + pin->get_intermediate_offset();
+ extent_len_t>(right_pin_begin);
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t, pin->duplicate()
- ).si_then([append_offset, append_len](auto right_extent) {
+ ).si_then([append_offset, append_len]
+ (auto right_maybe_indirect_extent) {
+ auto read_bl = right_maybe_indirect_extent.get_bl();
+ ceph::bufferlist suffix_bl;
+ suffix_bl.substr_of(read_bl, append_offset, append_len);
return get_iertr::make_ready_future<operate_ret_bare>(
std::nullopt,
- std::make_optional(bufferptr(
- right_extent->get_bptr(),
- append_offset,
- append_len)));
+ std::move(suffix_bl));
});
}
} else {
@@ -976,19 +990,18 @@ operate_ret operate_right(context_t ctx, LBAMappingRef &pin, const overwrite_pla
} else {
auto append_offset =
overwrite_plan.data_end.get_byte_distance<
- extent_len_t>(right_pin_begin)
- + pin->get_intermediate_offset();
+ extent_len_t>(right_pin_begin);
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t, pin->duplicate()
).si_then([append_offset, append_len,
right_to_write_extent=std::move(right_to_write_extent)]
- (auto right_extent) mutable {
+ (auto maybe_indirect_right_extent) mutable {
+ auto read_bl = maybe_indirect_right_extent.get_bl();
+ ceph::bufferlist suffix_bl;
+ suffix_bl.substr_of(read_bl, append_offset, append_len);
return get_iertr::make_ready_future<operate_ret_bare>(
std::move(right_to_write_extent),
- std::make_optional(bufferptr(
- right_extent->get_bptr(),
- append_offset,
- append_len)));
+ std::move(suffix_bl));
});
}
}
@@ -1046,13 +1059,13 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
ceph_assert(size <= max_object_size);
if (!object_data.is_null()) {
ceph_assert(object_data.get_reserved_data_len() == max_object_size);
- DEBUGT("reservation present: {}~{}",
+ DEBUGT("reservation present: {}~0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len());
return write_iertr::now();
} else {
- DEBUGT("reserving: {}~{}",
+ DEBUGT("reserving: {}~0x{:x}",
ctx.t,
ctx.onode.get_data_hint(),
max_object_size);
@@ -1085,7 +1098,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
auto data_base = object_data.get_reserved_data_base();
auto data_len = object_data.get_reserved_data_len();
- DEBUGT("object_data: {}~{}", ctx.t, data_base, data_len);
+ DEBUGT("object_data: {}~0x{:x}", ctx.t, data_base, data_len);
laddr_t aligned_start = (data_base + size).get_aligned_laddr();
loffset_t aligned_length =
data_len - aligned_start.get_byte_distance<loffset_t>(data_base);
@@ -1121,7 +1134,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
if (append_len == 0) {
LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
TRACET("First pin overlaps the boundary and has aligned data"
- "create existing at addr:{}, len:{}",
+ "create existing at addr:{}, len:0x{:x}",
ctx.t, pin.get_key(), size - pin_offset);
to_write.push_back(extent_to_write_t::create_existing(
pin.duplicate(),
@@ -1136,22 +1149,18 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
ctx.t,
pin.duplicate()
).si_then([ctx, size, pin_offset, append_len, roundup_size,
- &pin, &object_data, &to_write](auto extent) {
- bufferlist bl;
- bl.append(
- bufferptr(
- extent->get_bptr(),
- pin.get_intermediate_offset(),
- size - pin_offset
- ));
- bl.append_zero(append_len);
+ &pin, &object_data, &to_write](auto maybe_indirect_extent) {
+ auto read_bl = maybe_indirect_extent.get_bl();
+ ceph::bufferlist write_bl;
+ write_bl.substr_of(read_bl, 0, size - pin_offset);
+ write_bl.append_zero(append_len);
LOG_PREFIX(ObjectDataHandler::trim_data_reservation);
TRACET("First pin overlaps the boundary and has unaligned data"
- "create data at addr:{}, len:{}",
- ctx.t, pin.get_key(), bl.length());
+ "create data at addr:{}, len:0x{:x}",
+ ctx.t, pin.get_key(), write_bl.length());
to_write.push_back(extent_to_write_t::create_data(
pin.get_key(),
- bl));
+ write_bl));
to_write.push_back(extent_to_write_t::create_zero(
(object_data.get_reserved_data_base() + roundup_size).checked_to_laddr(),
object_data.get_reserved_data_len() - roundup_size));
@@ -1184,44 +1193,45 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
* get_to_writes_with_zero_buffer
*
* Returns extent_to_write_t's reflecting a zero region extending
- * from offset~len with headptr optionally on the left and tailptr
+ * from offset~len with headbl optionally on the left and tailbl
* optionally on the right.
*/
extent_to_write_list_t get_to_writes_with_zero_buffer(
laddr_t data_base,
const extent_len_t block_size,
objaddr_t offset, extent_len_t len,
- std::optional<bufferptr> &&headptr, std::optional<bufferptr> &&tailptr)
+ std::optional<ceph::bufferlist> &&headbl,
+ std::optional<ceph::bufferlist> &&tailbl)
{
auto zero_left = p2roundup(offset, (objaddr_t)block_size);
auto zero_right = p2align(offset + len, (objaddr_t)block_size);
- auto left = headptr ? (offset - headptr->length()) : offset;
- auto right = tailptr ?
- (offset + len + tailptr->length()) :
+ auto left = headbl ? (offset - headbl->length()) : offset;
+ auto right = tailbl ?
+ (offset + len + tailbl->length()) :
(offset + len);
assert(
- (headptr && ((zero_left - left) ==
- p2roundup(headptr->length(), block_size))) ^
- (!headptr && (zero_left == left)));
+ (headbl && ((zero_left - left) ==
+ p2roundup(headbl->length(), block_size))) ^
+ (!headbl && (zero_left == left)));
assert(
- (tailptr && ((right - zero_right) ==
- p2roundup(tailptr->length(), block_size))) ^
- (!tailptr && (right == zero_right)));
+ (tailbl && ((right - zero_right) ==
+ p2roundup(tailbl->length(), block_size))) ^
+ (!tailbl && (right == zero_right)));
assert(right > left);
// zero region too small for a reserved section,
- // headptr and tailptr in same extent
+ // headbl and tailbl in same extent
if (zero_right <= zero_left) {
bufferlist bl;
- if (headptr) {
- bl.append(*headptr);
+ if (headbl) {
+ bl.append(*headbl);
}
bl.append_zero(
- right - left - bl.length() - (tailptr ? tailptr->length() : 0));
- if (tailptr) {
- bl.append(*tailptr);
+ right - left - bl.length() - (tailbl ? tailbl->length() : 0));
+ if (tailbl) {
+ bl.append(*tailbl);
}
assert(bl.length() % block_size == 0);
assert(bl.length() == (right - left));
@@ -1230,16 +1240,16 @@ extent_to_write_list_t get_to_writes_with_zero_buffer(
(data_base + left).checked_to_laddr(), bl));
return ret;
} else {
- // reserved section between ends, headptr and tailptr in different extents
+ // reserved section between ends, headbl and tailbl in different extents
extent_to_write_list_t ret;
- if (headptr) {
- bufferlist headbl;
- headbl.append(*headptr);
- headbl.append_zero(zero_left - left - headbl.length());
- assert(headbl.length() % block_size == 0);
- assert(headbl.length() > 0);
+ if (headbl) {
+ bufferlist head_zero_bl;
+ head_zero_bl.append(*headbl);
+ head_zero_bl.append_zero(zero_left - left - head_zero_bl.length());
+ assert(head_zero_bl.length() % block_size == 0);
+ assert(head_zero_bl.length() > 0);
ret.push_back(extent_to_write_t::create_data(
- (data_base + left).checked_to_laddr(), headbl));
+ (data_base + left).checked_to_laddr(), head_zero_bl));
}
// reserved zero region
ret.push_back(extent_to_write_t::create_zero(
@@ -1247,14 +1257,14 @@ extent_to_write_list_t get_to_writes_with_zero_buffer(
zero_right - zero_left));
assert(ret.back().len % block_size == 0);
assert(ret.back().len > 0);
- if (tailptr) {
- bufferlist tailbl;
- tailbl.append(*tailptr);
- tailbl.append_zero(right - zero_right - tailbl.length());
- assert(tailbl.length() % block_size == 0);
- assert(tailbl.length() > 0);
+ if (tailbl) {
+ bufferlist tail_zero_bl;
+ tail_zero_bl.append(*tailbl);
+ tail_zero_bl.append_zero(right - zero_right - tail_zero_bl.length());
+ assert(tail_zero_bl.length() % block_size == 0);
+ assert(tail_zero_bl.length() > 0);
ret.push_back(extent_to_write_t::create_data(
- (data_base + zero_right).checked_to_laddr(), tailbl));
+ (data_base + zero_right).checked_to_laddr(), tail_zero_bl));
}
return ret;
}
@@ -1293,7 +1303,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
(auto &pins, auto &to_write) mutable
{
LOG_PREFIX(ObjectDataHandler::overwrite);
- DEBUGT("overwrite: {}~{}",
+ DEBUGT("overwrite: 0x{:x}~0x{:x}",
ctx.t,
offset,
len);
@@ -1306,13 +1316,13 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
overwrite_plan
).si_then([ctx, data_base, len, offset, overwrite_plan, bl=std::move(bl),
&to_write, &pins, this](auto p) mutable {
- auto &[left_extent, headptr] = p;
+ auto &[left_extent, headbl] = p;
if (left_extent) {
ceph_assert(left_extent->addr == overwrite_plan.pin_begin);
append_extent_to_write(to_write, std::move(*left_extent));
}
- if (headptr) {
- assert(headptr->length() > 0);
+ if (headbl) {
+ assert(headbl->length() > 0);
}
return operate_right(
ctx,
@@ -1321,19 +1331,19 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
).si_then([ctx, data_base, len, offset,
pin_begin=overwrite_plan.pin_begin,
pin_end=overwrite_plan.pin_end,
- bl=std::move(bl), headptr=std::move(headptr),
+ bl=std::move(bl), headbl=std::move(headbl),
&to_write, &pins, this](auto p) mutable {
- auto &[right_extent, tailptr] = p;
+ auto &[right_extent, tailbl] = p;
if (bl.has_value()) {
auto write_offset = offset;
bufferlist write_bl;
- if (headptr) {
- write_bl.append(*headptr);
- write_offset = write_offset - headptr->length();
+ if (headbl) {
+ write_bl.append(*headbl);
+ write_offset = write_offset - headbl->length();
}
write_bl.claim_append(*bl);
- if (tailptr) {
- write_bl.append(*tailptr);
+ if (tailbl) {
+ write_bl.append(*tailbl);
assert_aligned(write_bl.length());
}
splice_extent_to_write(
@@ -1347,8 +1357,8 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
ctx.tm.get_block_size(),
offset,
len,
- std::move(headptr),
- std::move(tailptr)));
+ std::move(headbl),
+ std::move(tailbl)));
}
if (right_extent) {
ceph_assert(right_extent->get_end_addr() == pin_end);
@@ -1383,7 +1393,7 @@ ObjectDataHandler::zero_ret ObjectDataHandler::zero(
ctx,
[this, ctx, offset, len](auto &object_data) {
LOG_PREFIX(ObjectDataHandler::zero);
- DEBUGT("zero to {}~{}, object_data: {}~{}, is_null {}",
+ DEBUGT("zero to 0x{:x}~0x{:x}, object_data: {}~0x{:x}, is_null {}",
ctx.t,
offset,
len,
@@ -1424,7 +1434,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::write(
ctx,
[this, ctx, offset, &bl](auto &object_data) {
LOG_PREFIX(ObjectDataHandler::write);
- DEBUGT("writing to {}~{}, object_data: {}~{}, is_null {}",
+ DEBUGT("writing to 0x{:x}~0x{:x}, object_data: {}~0x{:x}, is_null {}",
ctx.t,
offset,
bl.length(),
@@ -1469,7 +1479,7 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
ctx,
[ctx, obj_offset, len, &ret](const auto &object_data) {
LOG_PREFIX(ObjectDataHandler::read);
- DEBUGT("reading {}~{}",
+ DEBUGT("reading {}~0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len());
@@ -1501,83 +1511,74 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
pins,
[FNAME, ctx, l_start, l_end,
&l_current, &ret](auto &pin) -> read_iertr::future<> {
- auto pin_key = pin->get_key();
- if (l_current == l_start) {
- ceph_assert(l_current >= pin_key);
- } else {
+ auto pin_start = pin->get_key();
+ extent_len_t read_start;
+ extent_len_t read_start_aligned;
+ if (l_current == l_start) { // first pin may skip head
+ ceph_assert(l_current.get_aligned_laddr() >= pin_start);
+ read_start = l_current.template
+ get_byte_distance<extent_len_t>(pin_start);
+ read_start_aligned = p2align(read_start, ctx.tm.get_block_size());
+ } else { // non-first pin must match start
assert(l_current > l_start);
- ceph_assert(l_current == pin_key);
+ ceph_assert(l_current == pin_start);
+ read_start = 0;
+ read_start_aligned = 0;
}
+
ceph_assert(l_current < l_end);
auto pin_len = pin->get_length();
assert(pin_len > 0);
- laddr_offset_t l_pin_end = pin_key + pin_len;
- ceph_assert(l_current < l_pin_end);
- laddr_offset_t l_current_end = std::min(l_pin_end, l_end);
+ laddr_offset_t pin_end = pin_start + pin_len;
+ assert(l_current < pin_end);
+ laddr_offset_t l_current_end = std::min(pin_end, l_end);
+ extent_len_t read_len =
+ l_current_end.get_byte_distance<extent_len_t>(l_current);
+
if (pin->get_val().is_zero()) {
- DEBUGT("got {}~{} from zero-pin {}~{}",
+ DEBUGT("got {}~0x{:x} from zero-pin {}~0x{:x}",
ctx.t,
l_current,
- l_current_end.get_byte_distance<loffset_t>(l_current),
- pin_key,
+ read_len,
+ pin_start,
pin_len);
- ret.append_zero(
- l_current_end.get_byte_distance<
- extent_len_t>(l_current));
+ ret.append_zero(read_len);
l_current = l_current_end;
return seastar::now();
}
// non-zero pin
- bool is_indirect = pin->is_indirect();
- laddr_t e_key;
- extent_len_t e_len;
- extent_len_t e_off;
- if (is_indirect) {
- e_key = pin->get_intermediate_base();
- e_len = pin->get_intermediate_length();
- e_off = pin->get_intermediate_offset();
- DEBUGT("reading {}~{} from indirect-pin {}~{}, direct-pin {}~{}(off={})",
- ctx.t,
- l_current,
- l_current_end.get_byte_distance<extent_len_t>(l_current),
- pin_key,
- pin_len,
- e_key,
- e_len,
- e_off);
- assert(e_key <= pin->get_intermediate_key());
- assert(e_off + pin_len <= e_len);
- } else {
- DEBUGT("reading {}~{} from pin {}~{}",
- ctx.t,
- l_current,
- l_current_end.get_byte_distance<
- extent_len_t>(l_current),
- pin_key,
- pin_len);
- e_key = pin_key;
- e_len = pin_len;
- e_off = 0;
- }
- extent_len_t e_current_off = (l_current + e_off)
- .template get_byte_distance<extent_len_t>(pin_key);
+ laddr_t l_current_end_aligned = l_current_end.get_roundup_laddr();
+ extent_len_t read_len_aligned =
+ l_current_end_aligned.get_byte_distance<extent_len_t>(pin_start);
+ read_len_aligned -= read_start_aligned;
+ extent_len_t unalign_start_offset = read_start - read_start_aligned;
+ DEBUGT("reading {}~0x{:x} from pin {}~0x{:x}",
+ ctx.t,
+ l_current,
+ read_len,
+ pin_start,
+ pin_len);
return ctx.tm.read_pin<ObjectDataBlock>(
ctx.t,
- std::move(pin)
+ std::move(pin),
+ read_start_aligned,
+ read_len_aligned
).si_then([&ret, &l_current, l_current_end,
-#ifndef NDEBUG
- e_key, e_len, e_current_off](auto extent) {
-#else
- e_current_off](auto extent) {
-#endif
- assert(e_key == extent->get_laddr());
- assert(e_len == extent->get_length());
- ret.append(
- bufferptr(
- extent->get_bptr(),
- e_current_off,
- l_current_end.get_byte_distance<extent_len_t>(l_current)));
+ read_start_aligned, read_len_aligned,
+ unalign_start_offset, read_len](auto maybe_indirect_extent) {
+ auto aligned_bl = maybe_indirect_extent.get_range(
+ read_start_aligned, read_len_aligned);
+ if (read_len < read_len_aligned) {
+ ceph::bufferlist unaligned_bl;
+ unaligned_bl.substr_of(
+ aligned_bl, unalign_start_offset, read_len);
+ ret.append(std::move(unaligned_bl));
+ } else {
+ assert(read_len == read_len_aligned);
+ assert(unalign_start_offset == 0);
+ ret.append(std::move(aligned_bl));
+ }
l_current = l_current_end;
return seastar::now();
}).handle_error_interruptible(
@@ -1608,7 +1609,7 @@ ObjectDataHandler::fiemap_ret ObjectDataHandler::fiemap(
[ctx, obj_offset, len, &ret](const auto &object_data) {
LOG_PREFIX(ObjectDataHandler::fiemap);
DEBUGT(
- "{}~{}, reservation {}~{}",
+ "0x{:x}~0x{:x}, reservation {}~0x{:x}",
ctx.t,
obj_offset,
len,
@@ -1663,7 +1664,7 @@ ObjectDataHandler::truncate_ret ObjectDataHandler::truncate(
ctx,
[this, ctx, offset](auto &object_data) {
LOG_PREFIX(ObjectDataHandler::truncate);
- DEBUGT("truncating {}~{} offset: {}",
+ DEBUGT("truncating {}~0x{:x} offset: 0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len(),
@@ -1706,7 +1707,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone_extents(
laddr_t data_base)
{
LOG_PREFIX(ObjectDataHandler::clone_extents);
- TRACET(" object_data: {}~{}, data_base: {}",
+ TRACET("object_data: {}~0x{:x}, data_base: 0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len(),
@@ -1791,7 +1792,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone(
auto len = object_data.get_reserved_data_len();
object_data.clear();
LOG_PREFIX(ObjectDataHandler::clone);
- DEBUGT("cloned obj reserve_data_base: {}, len {}",
+ DEBUGT("cloned obj reserve_data_base: {}, len 0x{:x}",
ctx.t,
d_object_data.get_reserved_data_base(),
d_object_data.get_reserved_data_len());
@@ -1801,7 +1802,7 @@ ObjectDataHandler::clone_ret ObjectDataHandler::clone(
d_object_data.get_reserved_data_len()
).si_then([&d_object_data, ctx, &object_data, base, len, this] {
LOG_PREFIX("ObjectDataHandler::clone");
- DEBUGT("head obj reserve_data_base: {}, len {}",
+ DEBUGT("head obj reserve_data_base: {}, len 0x{:x}",
ctx.t,
object_data.get_reserved_data_base(),
object_data.get_reserved_data_len());
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
index 795daeddb11..7c2392731c0 100644
--- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node.h
@@ -48,7 +48,8 @@ struct OMapNode : LogicalCachedExtent {
need_merge(n_merge) {}
};
- OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+ explicit OMapNode(ceph::bufferptr &&ptr) : LogicalCachedExtent(std::move(ptr)) {}
+ explicit OMapNode(extent_len_t length) : LogicalCachedExtent(length) {}
OMapNode(const OMapNode &other)
: LogicalCachedExtent(other) {}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
index 8d06accef1e..df97f394a0d 100644
--- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.cc
@@ -734,23 +734,28 @@ omap_load_extent(omap_context_t oc, laddr_t laddr, depth_t depth)
{
ceph_assert(depth > 0);
if (depth > 1) {
- return oc.tm.read_extent<OMapInnerNode>(oc.t, laddr,
- OMAP_INNER_BLOCK_SIZE)
- .handle_error_interruptible(
+ return oc.tm.read_extent<OMapInnerNode>(
+ oc.t, laddr, OMAP_INNER_BLOCK_SIZE
+ ).handle_error_interruptible(
omap_load_extent_iertr::pass_further{},
crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
- ).si_then(
- [](auto&& e) {
- return seastar::make_ready_future<OMapNodeRef>(std::move(e));
+ ).si_then([](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ return seastar::make_ready_future<OMapNodeRef>(
+ std::move(maybe_indirect_extent.extent));
});
} else {
- return oc.tm.read_extent<OMapLeafNode>(oc.t, laddr, OMAP_LEAF_BLOCK_SIZE
+ return oc.tm.read_extent<OMapLeafNode>(
+ oc.t, laddr, OMAP_LEAF_BLOCK_SIZE
).handle_error_interruptible(
omap_load_extent_iertr::pass_further{},
crimson::ct_error::assert_all{ "Invalid error in omap_load_extent" }
- ).si_then(
- [](auto&& e) {
- return seastar::make_ready_future<OMapNodeRef>(std::move(e));
+ ).si_then([](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ return seastar::make_ready_future<OMapNodeRef>(
+ std::move(maybe_indirect_extent.extent));
});
}
}
diff --git a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
index a2b51bbb0e1..2267942f035 100644
--- a/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
+++ b/src/crimson/os/seastore/omap_manager/btree/omap_btree_node_impl.h
@@ -31,10 +31,18 @@ struct OMapInnerNode
StringKVInnerNodeLayout {
using OMapInnerNodeRef = TCachedExtentRef<OMapInnerNode>;
using internal_iterator_t = const_iterator;
- template <typename... T>
- OMapInnerNode(T&&... t) :
- OMapNode(std::forward<T>(t)...),
- StringKVInnerNodeLayout(get_bptr().c_str()) {}
+
+ explicit OMapInnerNode(ceph::bufferptr &&ptr) // buffer supplied up front: the layout is bound to it immediately
+ : OMapNode(std::move(ptr)) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+ // Length-only construction leaves the layout buffer unset; once on_fully_loaded() has bound it, the node must be identical to one built with OMapInnerNode(ptr)
+ explicit OMapInnerNode(extent_len_t length)
+ : OMapNode(length) {}
+ OMapInnerNode(const OMapInnerNode &rhs) // copy: the layout is bound to the buffer returned by this object's get_bptr()
+ : OMapNode(rhs) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
omap_node_meta_t get_node_meta() const final { return get_meta(); }
bool extent_will_overflow(size_t ksize, std::optional<size_t> vsize) const {
@@ -46,6 +54,10 @@ struct OMapInnerNode
bool extent_is_below_min() const { return below_min(); }
uint32_t get_node_size() { return get_size(); }
+ void on_fully_loaded() final { // binds the layout to the extent buffer, which the length-only constructor leaves unset. NOTE(review): presumably invoked once the buffer is populated -- confirm at the call site
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+
CachedExtentRef duplicate_for_write(Transaction&) final {
assert(delta_buffer.empty());
return CachedExtentRef(new OMapInnerNode(*this));
@@ -148,10 +160,18 @@ struct OMapLeafNode
using OMapLeafNodeRef = TCachedExtentRef<OMapLeafNode>;
using internal_iterator_t = const_iterator;
- template <typename... T>
- OMapLeafNode(T&&... t) :
- OMapNode(std::forward<T>(t)...),
- StringKVLeafNodeLayout(get_bptr().c_str()) {}
+
+ explicit OMapLeafNode(ceph::bufferptr &&ptr) // buffer supplied up front: the layout is bound to it immediately
+ : OMapNode(std::move(ptr)) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+ // Length-only construction leaves the layout buffer unset; once on_fully_loaded() has bound it, the node must be identical to one built with OMapLeafNode(ptr)
+ explicit OMapLeafNode(extent_len_t length)
+ : OMapNode(length) {}
+ OMapLeafNode(const OMapLeafNode &rhs) // copy: the layout is bound to the buffer returned by this object's get_bptr()
+ : OMapNode(rhs) {
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
omap_node_meta_t get_node_meta() const final { return get_meta(); }
bool extent_will_overflow(
@@ -164,6 +184,10 @@ struct OMapLeafNode
bool extent_is_below_min() const { return below_min(); }
uint32_t get_node_size() { return get_size(); }
+ void on_fully_loaded() final { // binds the layout to the extent buffer, which the length-only constructor leaves unset. NOTE(review): presumably invoked once the buffer is populated -- confirm at the call site
+ this->set_layout_buf(this->get_bptr().c_str());
+ }
+
CachedExtentRef duplicate_for_write(Transaction&) final {
assert(delta_buffer.empty());
return CachedExtentRef(new OMapLeafNode(*this));
diff --git a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
index 72b13fedfb1..3825ebef145 100644
--- a/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
+++ b/src/crimson/os/seastore/omap_manager/btree/string_kv_node_layout.h
@@ -504,8 +504,13 @@ public:
inner_remove(iter);
}
- StringKVInnerNodeLayout(char *buf) :
- buf(buf) {}
+ StringKVInnerNodeLayout() : buf(nullptr) {} // starts unbound; set_layout_buf() must supply the buffer before the layout is used
+
+ void set_layout_buf(char *_buf) { // one-shot binding of the layout to its backing buffer
+ assert(buf == nullptr); // binding twice is treated as a programming error
+ assert(_buf != nullptr);
+ buf = _buf;
+ }
uint32_t get_size() const {
ceph_le32 &size = *layout.template Pointer<0>(buf);
@@ -1120,8 +1125,13 @@ public:
leaf_remove(iter);
}
- StringKVLeafNodeLayout(char *buf) :
- buf(buf) {}
+ StringKVLeafNodeLayout() : buf(nullptr) {} // starts unbound; set_layout_buf() must supply the buffer before the layout is used
+
+ void set_layout_buf(char *_buf) { // one-shot binding of the layout to its backing buffer
+ assert(buf == nullptr); // binding twice is treated as a programming error
+ assert(_buf != nullptr);
+ buf = _buf;
+ }
const_iterator iter_begin() const {
return const_iterator(
diff --git a/src/crimson/os/seastore/onode.cc b/src/crimson/os/seastore/onode.cc
index f3fd6eb18a5..dc8f6e87c8e 100644
--- a/src/crimson/os/seastore/onode.cc
+++ b/src/crimson/os/seastore/onode.cc
@@ -11,7 +11,7 @@ std::ostream& operator<<(std::ostream &out, const Onode &rhs)
auto &layout = rhs.get_layout();
return out << "Onode("
<< "hobj=" << rhs.hobj << ", "
- << "size=" << static_cast<uint32_t>(layout.size)
+ << "size=0x" << std::hex << static_cast<uint32_t>(layout.size) << std::dec
<< ")";
}
diff --git a/src/crimson/os/seastore/onode.h b/src/crimson/os/seastore/onode.h
index 072c57864be..fa2ed65c0f3 100644
--- a/src/crimson/os/seastore/onode.h
+++ b/src/crimson/os/seastore/onode.h
@@ -36,8 +36,8 @@ struct onode_layout_t {
object_data_le_t object_data;
- char oi[MAX_OI_LENGTH];
- char ss[MAX_SS_LENGTH];
+ char oi[MAX_OI_LENGTH] = {0};
+ char ss[MAX_SS_LENGTH] = {0};
} __attribute__((packed));
class Transaction;
diff --git a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
index 9230051cc50..04b959f767d 100644
--- a/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
+++ b/src/crimson/os/seastore/onode_manager/staged-fltree/node_extent_manager/seastore.h
@@ -41,8 +41,10 @@ class SeastoreSuper final: public Super {
class SeastoreNodeExtent final: public NodeExtent {
public:
- SeastoreNodeExtent(ceph::bufferptr &&ptr)
+ explicit SeastoreNodeExtent(ceph::bufferptr &&ptr)
: NodeExtent(std::move(ptr)) {}
+ explicit SeastoreNodeExtent(extent_len_t length)
+ : NodeExtent(length) {}
SeastoreNodeExtent(const SeastoreNodeExtent& other)
: NodeExtent(other) {}
~SeastoreNodeExtent() override = default;
@@ -111,10 +113,14 @@ class SeastoreNodeExtentManager final: public TransactionManagerHandle {
}
}
return tm.read_extent<SeastoreNodeExtent>(t, addr
- ).si_then([addr, &t](auto&& e) -> read_iertr::future<NodeExtentRef> {
+ ).si_then([addr, &t](auto maybe_indirect_extent)
+ -> read_iertr::future<NodeExtentRef> {
+ auto e = maybe_indirect_extent.extent;
SUBTRACET(seastore_onode,
"read {}B at {} -- {}",
t, e->get_length(), e->get_laddr(), *e);
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
assert(e->get_laddr() == addr);
std::ignore = addr;
return read_iertr::make_ready_future<NodeExtentRef>(e);
diff --git a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
index 9f6a566d15c..97b7902edf5 100644
--- a/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
+++ b/src/crimson/os/seastore/random_block_manager/block_rb_manager.cc
@@ -188,10 +188,10 @@ BlockRBManager::write_ertr::future<> BlockRBManager::write(
void BlockRBManager::prefill_fragmented_device()
{
LOG_PREFIX(BlockRBManager::prefill_fragmented_device);
- // the first 2 blocks must be allocated to lba root
+ // the first 3 blocks must be allocated to lba root
// and backref root during mkfs
- for (size_t block = get_block_size() * 2;
- block <= get_size() - get_block_size() * 2;
+ for (size_t block = get_block_size() * 3;
+ block <= get_size() - get_block_size() * 3;
block += get_block_size() * 2) {
DEBUG("marking {}~{} used",
get_start_rbm_addr() + block,
diff --git a/src/crimson/os/seastore/record_scanner.cc b/src/crimson/os/seastore/record_scanner.cc
index 5fab11505ce..172ba77577e 100644
--- a/src/crimson/os/seastore/record_scanner.cc
+++ b/src/crimson/os/seastore/record_scanner.cc
@@ -18,7 +18,7 @@ RecordScanner::scan_valid_records(
{
LOG_PREFIX(RecordScanner::scan_valid_records);
initialize_cursor(cursor);
- DEBUG("starting at {}, budget={}", cursor, budget);
+ DEBUG("starting at {}, budget=0x{:x}", cursor, budget);
auto retref = std::make_unique<size_t>(0);
auto &budget_used = *retref;
return crimson::repeat(
@@ -91,7 +91,7 @@ RecordScanner::scan_valid_records(
}
}().safe_then([=, &budget_used, &cursor] {
if (cursor.is_complete() || budget_used >= budget) {
- DEBUG("finish at {}, budget_used={}, budget={}",
+ DEBUG("finish at {}, budget_used=0x{:x}, budget=0x{:x}",
cursor, budget_used, budget);
return seastar::stop_iteration::yes;
} else {
@@ -112,13 +112,13 @@ RecordScanner::read_validate_record_metadata(
paddr_t start = cursor.seq.offset;
auto block_size = cursor.get_block_size();
if (get_segment_off(cursor.seq.offset) + block_size > get_segment_end_offset(cursor.seq.offset)) {
- DEBUG("failed -- record group header block {}~4096 > segment_size {}",
- start, get_segment_end_offset(cursor.seq.offset));
+ DEBUG("failed -- record group header block {}~0x{:x} > segment_size 0x{:x}",
+ start, block_size, get_segment_end_offset(cursor.seq.offset));
return read_validate_record_metadata_ret(
read_validate_record_metadata_ertr::ready_future_marker{},
std::nullopt);
}
- TRACE("reading record group header block {}~4096", start);
+ TRACE("reading record group header block {}~0x{:x}", start, block_size);
return read(start, block_size
).safe_then([this, FNAME, nonce, block_size, &cursor](bufferptr bptr)
-> read_validate_record_metadata_ret {
@@ -159,7 +159,7 @@ RecordScanner::read_validate_record_metadata(
paddr_t rest_start = cursor.seq.offset.add_offset(block_size);
auto rest_len = header.mdlength - block_size;
- TRACE("reading record group header rest {}~{}", rest_start, rest_len);
+ TRACE("reading record group header rest {}~0x{:x}", rest_start, rest_len);
return read(rest_start, rest_len
).safe_then([header=std::move(header), bl=std::move(bl)
](auto&& bptail) mutable {
@@ -189,7 +189,7 @@ RecordScanner::read_validate_data_ret RecordScanner::read_validate_data(
{
LOG_PREFIX(RecordScanner::read_validate_data);
auto data_addr = record_base.add_offset(header.mdlength);
- TRACE("reading record group data blocks {}~{}", data_addr, header.dlength);
+ TRACE("reading record group data blocks {}~0x{:x}", data_addr, header.dlength);
return read(
data_addr,
header.dlength
@@ -220,7 +220,7 @@ RecordScanner::consume_next_records(
total_length
}
};
- DEBUG("processing {} at {}, budget_used={}",
+ DEBUG("processing {} at {}, budget_used=0x{:x}",
next.header, locator, budget_used);
return handler(
locator,
diff --git a/src/crimson/os/seastore/root_block.h b/src/crimson/os/seastore/root_block.h
index 942434dd596..26b8604500d 100644
--- a/src/crimson/os/seastore/root_block.h
+++ b/src/crimson/os/seastore/root_block.h
@@ -41,7 +41,7 @@ struct RootBlock : CachedExtent {
CachedExtent* lba_root_node = nullptr;
CachedExtent* backref_root_node = nullptr;
- RootBlock() : CachedExtent(zero_length_t()) {};
+ RootBlock() : CachedExtent(root_construct_t()) {};
RootBlock(const RootBlock &rhs)
: CachedExtent(rhs),
diff --git a/src/crimson/os/seastore/root_meta.h b/src/crimson/os/seastore/root_meta.h
new file mode 100644
index 00000000000..edf082f1e38
--- /dev/null
+++ b/src/crimson/os/seastore/root_meta.h
@@ -0,0 +1,76 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include "crimson/os/seastore/cached_extent.h"
+
+namespace crimson::os::seastore {
+
+struct RootMetaBlock : LogicalCachedExtent { // logical extent whose leading MAX_META_LENGTH bytes hold an encoded meta_t
+ using meta_t = std::map<std::string, std::string>;
+ using Ref = TCachedExtentRef<RootMetaBlock>;
+ static constexpr size_t SIZE = 4096; // NOTE(review): presumably the extent length in bytes; not referenced inside this struct -- confirm at the allocation site
+ static constexpr int MAX_META_LENGTH = 1024; // size in bytes of the region used for the encoded map and for deltas
+
+ explicit RootMetaBlock(ceph::bufferptr &&ptr)
+ : LogicalCachedExtent(std::move(ptr)) {}
+ explicit RootMetaBlock(extent_len_t length)
+ : LogicalCachedExtent(length) {}
+ RootMetaBlock(const RootMetaBlock &rhs)
+ : LogicalCachedExtent(rhs) {}
+
+ CachedExtentRef duplicate_for_write(Transaction&) final { // the writable duplicate is a copy-constructed RootMetaBlock
+ return CachedExtentRef(new RootMetaBlock(*this));
+ }
+
+ static constexpr extent_types_t TYPE = extent_types_t::ROOT_META;
+ extent_types_t get_type() const final {
+ return extent_types_t::ROOT_META;
+ }
+
+ /// dumps root meta as delta: the first MAX_META_LENGTH bytes of the extent buffer
+ ceph::bufferlist get_delta() final {
+ ceph::bufferlist bl;
+ ceph::buffer::ptr bptr(get_bptr(), 0, MAX_META_LENGTH);
+ bl.append(bptr);
+ return bl;
+ }
+
+ /// overwrites the first MAX_META_LENGTH bytes of root meta with the delta payload
+ void apply_delta(const ceph::bufferlist &_bl) final
+ {
+ assert(_bl.length() == MAX_META_LENGTH); // a delta is always the full meta region, matching get_delta()
+ ceph::bufferlist bl = _bl;
+ bl.rebuild(); // coalesce so that front() spans the whole payload
+ get_bptr().copy_in(0, MAX_META_LENGTH, bl.front().c_str());
+ }
+
+ meta_t get_meta() const { // decodes the map starting at the beginning of the extent buffer
+ bufferlist bl;
+ bl.append(get_bptr());
+ meta_t ret;
+ auto iter = bl.cbegin();
+ decode(ret, iter);
+ return ret;
+ }
+
+ void set_meta(const meta_t &m) { // encodes m into the meta region; the rest of the region is zero-filled
+ ceph::bufferlist bl;
+ encode(m, bl);
+ ceph_assert(bl.length() <= MAX_META_LENGTH); // the encoded map must fit in the meta region
+ bl.rebuild();
+ get_bptr().zero(0, MAX_META_LENGTH);
+ get_bptr().copy_in(0, bl.length(), bl.front().c_str());
+ }
+
+};
+using RootMetaBlockRef = RootMetaBlock::Ref;
+
+} // crimson::os::seastore
+
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::os::seastore::RootMetaBlock>
+ : fmt::ostream_formatter {};
+#endif
diff --git a/src/crimson/os/seastore/seastore.cc b/src/crimson/os/seastore/seastore.cc
index d90edbb20db..6a866cb1f9b 100644
--- a/src/crimson/os/seastore/seastore.cc
+++ b/src/crimson/os/seastore/seastore.cc
@@ -408,6 +408,7 @@ SeaStore::Shard::mkfs_managers()
return transaction_manager->with_transaction_intr(
Transaction::src_t::MUTATE,
"mkfs_seastore",
+ CACHE_HINT_TOUCH,
[this](auto& t)
{
LOG_PREFIX(SeaStoreS::mkfs_managers);
@@ -897,9 +898,10 @@ get_ranges(CollectionRef ch,
seastar::future<std::tuple<std::vector<ghobject_t>, ghobject_t>>
SeaStore::Shard::list_objects(CollectionRef ch,
- const ghobject_t& start,
- const ghobject_t& end,
- uint64_t limit) const
+ const ghobject_t& start,
+ const ghobject_t& end,
+ uint64_t limit,
+ uint32_t op_flags) const
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -910,13 +912,14 @@ SeaStore::Shard::list_objects(CollectionRef ch,
return seastar::do_with(
RetType(std::vector<ghobject_t>(), start),
std::move(limit),
- [this, ch, start, end](auto& ret, auto& limit) {
- return repeat_eagain([this, ch, start, end, &limit, &ret] {
+ [this, ch, start, end, op_flags](auto& ret, auto& limit) {
+ return repeat_eagain([this, ch, start, end, &limit, &ret, op_flags] {
++(shard_stats.repeat_read_num);
return transaction_manager->with_transaction_intr(
Transaction::src_t::READ,
"list_objects",
+ op_flags,
[this, ch, start, end, &limit, &ret](auto &t)
{
LOG_PREFIX(SeaStoreS::list_objects);
@@ -1054,6 +1057,7 @@ SeaStore::Shard::list_collections()
return transaction_manager->with_transaction_intr(
Transaction::src_t::READ,
"list_collections",
+ CACHE_HINT_TOUCH,
[this, &ret](auto& t)
{
LOG_PREFIX(SeaStoreS::list_collections);
@@ -1137,6 +1141,7 @@ SeaStore::Shard::read(
Transaction::src_t::READ,
"read",
op_type_t::READ,
+ op_flags,
[this, offset, len, op_flags](auto &t, auto &onode) {
return _read(t, onode, offset, len, op_flags);
}).finally([this] {
@@ -1148,7 +1153,8 @@ SeaStore::Shard::read(
SeaStore::Shard::base_errorator::future<bool>
SeaStore::Shard::exists(
CollectionRef c,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
LOG_PREFIX(SeaStoreS::exists);
++(shard_stats.read_num);
@@ -1160,6 +1166,7 @@ SeaStore::Shard::exists(
Transaction::src_t::READ,
"exists",
op_type_t::READ,
+ op_flags,
[FNAME](auto& t, auto&) {
DEBUGT("exists", t);
return seastar::make_ready_future<bool>(true);
@@ -1240,7 +1247,8 @@ SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
SeaStore::Shard::get_attr(
CollectionRef ch,
const ghobject_t& oid,
- std::string_view name) const
+ std::string_view name,
+ uint32_t op_flags) const
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1251,6 +1259,7 @@ SeaStore::Shard::get_attr(
Transaction::src_t::READ,
"get_attr",
op_type_t::GET_ATTR,
+ op_flags,
[this, name](auto &t, auto& onode) {
return _get_attr(t, onode, name);
}).handle_error(
@@ -1296,7 +1305,8 @@ SeaStore::Shard::_get_attrs(
SeaStore::Shard::get_attrs_ertr::future<SeaStore::Shard::attrs_t>
SeaStore::Shard::get_attrs(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1307,6 +1317,7 @@ SeaStore::Shard::get_attrs(
Transaction::src_t::READ,
"get_attrs",
op_type_t::GET_ATTRS,
+ op_flags,
[this](auto &t, auto& onode) {
return _get_attrs(t, onode);
}).handle_error(
@@ -1331,14 +1342,15 @@ seastar::future<struct stat> SeaStore::Shard::_stat(
st.st_blksize = device->get_block_size();
st.st_blocks = (st.st_size + st.st_blksize - 1) / st.st_blksize;
st.st_nlink = 1;
- DEBUGT("oid={}, size={}, blksize={}",
+ DEBUGT("oid={}, size=0x{:x}, blksize=0x{:x}",
t, oid, st.st_size, st.st_blksize);
return seastar::make_ready_future<struct stat>(st);
}
seastar::future<struct stat> SeaStore::Shard::stat(
CollectionRef c,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1349,6 +1361,7 @@ seastar::future<struct stat> SeaStore::Shard::stat(
Transaction::src_t::READ,
"stat",
op_type_t::STAT,
+ op_flags,
[this, oid](auto &t, auto &onode) {
return _stat(t, onode, oid);
}).handle_error(
@@ -1364,9 +1377,10 @@ seastar::future<struct stat> SeaStore::Shard::stat(
SeaStore::Shard::get_attr_errorator::future<ceph::bufferlist>
SeaStore::Shard::omap_get_header(
CollectionRef ch,
- const ghobject_t& oid)
+ const ghobject_t& oid,
+ uint32_t op_flags)
{
- return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY);
+ return get_attr(ch, oid, OMAP_HEADER_XATTR_KEY, op_flags); // the omap header is read as the OMAP_HEADER_XATTR_KEY attribute; op_flags is forwarded unchanged
}
SeaStore::base_iertr::future<SeaStore::Shard::omap_values_t>
@@ -1389,7 +1403,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const omap_keys_t &keys)
+ const omap_keys_t &keys,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1400,6 +1415,7 @@ SeaStore::Shard::omap_get_values(
Transaction::src_t::READ,
"omap_get_values",
op_type_t::OMAP_GET_VALUES,
+ op_flags,
[this, keys](auto &t, auto &onode) {
return do_omap_get_values(t, onode, keys);
}).finally([this] {
@@ -1529,7 +1545,8 @@ SeaStore::Shard::read_errorator::future<SeaStore::Shard::omap_values_paged_t>
SeaStore::Shard::omap_get_values(
CollectionRef ch,
const ghobject_t &oid,
- const std::optional<std::string> &start)
+ const std::optional<std::string> &start,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1540,6 +1557,7 @@ SeaStore::Shard::omap_get_values(
Transaction::src_t::READ,
"omap_get_values2",
op_type_t::OMAP_GET_VALUES2,
+ op_flags,
[this, start](auto &t, auto &onode) {
return do_omap_get_values(t, onode, start);
}).finally([this] {
@@ -1589,7 +1607,8 @@ SeaStore::Shard::fiemap(
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
++(shard_stats.read_num);
++(shard_stats.pending_read_num);
@@ -1600,6 +1619,7 @@ SeaStore::Shard::fiemap(
Transaction::src_t::READ,
"fiemap",
op_type_t::READ,
+ op_flags,
[this, off, len](auto &t, auto &onode) {
return _fiemap(t, onode, off, len);
}).finally([this] {
@@ -1640,7 +1660,7 @@ seastar::future<> SeaStore::Shard::do_transaction_no_callbacks(
[this, num_bytes](auto &ctx) {
LOG_PREFIX(SeaStoreS::do_transaction_no_callbacks);
return with_trans_intr(*ctx.transaction, [&ctx, this, FNAME, num_bytes](auto &t) {
- DEBUGT("cid={}, {} operations, {} bytes, {} colls, {} objects ...",
+ DEBUGT("cid={}, {} operations, 0x{:x} bytes, {} colls, {} objects ...",
t, ctx.ch->get_cid(),
ctx.ext_transaction.get_num_ops(),
num_bytes,
@@ -2677,6 +2697,7 @@ seastar::future<> SeaStore::Shard::write_meta(
return transaction_manager->with_transaction_intr(
Transaction::src_t::MUTATE,
"write_meta",
+ CACHE_HINT_NOCACHE,
[this, &key, &value](auto& t)
{
LOG_PREFIX(SeaStoreS::write_meta);
@@ -2721,6 +2742,13 @@ SeaStore::read_meta(const std::string& key)
);
}
+seastar::future<std::string> SeaStore::get_default_device_class() // resolves immediately with the configured seastore_main_device_type value
+{
+ using crimson::common::get_conf;
+ std::string type = get_conf<std::string>("seastore_main_device_type");
+ return seastar::make_ready_future<std::string>(type);
+}
+
uuid_d SeaStore::Shard::get_fsid() const
{
return device->get_meta().seastore_id;
diff --git a/src/crimson/os/seastore/seastore.h b/src/crimson/os/seastore/seastore.h
index 185072744f2..e2a993b9e20 100644
--- a/src/crimson/os/seastore/seastore.h
+++ b/src/crimson/os/seastore/seastore.h
@@ -101,7 +101,8 @@ public:
seastar::future<struct stat> stat(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<ceph::bufferlist> read(
CollectionRef c,
@@ -118,32 +119,38 @@ public:
base_errorator::future<bool> exists(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
get_attr_errorator::future<ceph::bufferlist> get_attr(
CollectionRef c,
const ghobject_t& oid,
- std::string_view name) const final;
+ std::string_view name,
+ uint32_t op_flags = 0) const final;
get_attrs_ertr::future<attrs_t> get_attrs(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
read_errorator::future<omap_values_t> omap_get_values(
CollectionRef c,
const ghobject_t& oid,
- const omap_keys_t& keys) final;
+ const omap_keys_t& keys,
+ uint32_t op_flags = 0) final;
/// Retrieves paged set of values > start (if present)
read_errorator::future<omap_values_paged_t> omap_get_values(
CollectionRef c, ///< [in] collection
const ghobject_t &oid, ///< [in] oid
- const std::optional<std::string> &start ///< [in] start, empty for begin
+ const std::optional<std::string> &start, ///< [in] start, empty for begin
+ uint32_t op_flags = 0
) final; ///< @return <done, values> values.empty() iff done
get_attr_errorator::future<bufferlist> omap_get_header(
CollectionRef c,
- const ghobject_t& oid) final;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) final;
/// std::get<1>(ret) returns end if and only if the listing has listed all
/// the items within the range, otherwise it returns the next key to be listed.
@@ -151,7 +158,8 @@ public:
CollectionRef c,
const ghobject_t& start,
const ghobject_t& end,
- uint64_t limit) const final;
+ uint64_t limit,
+ uint32_t op_flags = 0) const final;
seastar::future<CollectionRef> create_new_collection(const coll_t& cid) final;
seastar::future<CollectionRef> open_collection(const coll_t& cid) final;
@@ -170,7 +178,8 @@ public:
CollectionRef ch,
const ghobject_t& oid,
uint64_t off,
- uint64_t len) final;
+ uint64_t len,
+ uint32_t op_flags = 0) final;
unsigned get_max_attr_name_length() const final {
return 256;
@@ -191,6 +200,8 @@ public:
seastar::future<> write_meta(const std::string& key,
const std::string& value);
+ seastar::future<std::string> get_default_device_class();
+
store_statfs_t stat() const;
uuid_d get_fsid() const;
@@ -249,7 +260,8 @@ public:
return seastar::do_with(
internal_context_t(
ch, std::move(t),
- transaction_manager->create_transaction(src, tname)),
+ transaction_manager->create_transaction(
+ src, tname, t.get_fadvise_flags())),
std::forward<F>(f),
[this, op_type](auto &ctx, auto &f) {
assert(shard_stats.starting_io_num);
@@ -296,20 +308,22 @@ public:
Transaction::src_t src,
const char* tname,
op_type_t op_type,
+ cache_hint_t cache_hint_flags,
F &&f) const {
auto begin_time = std::chrono::steady_clock::now();
return seastar::do_with(
oid, Ret{}, std::forward<F>(f),
- [this, ch, src, op_type, begin_time, tname
+ [this, ch, src, op_type, begin_time, tname, cache_hint_flags
](auto &oid, auto &ret, auto &f)
{
- return repeat_eagain([&, this, ch, src, tname] {
+ return repeat_eagain([&, this, ch, src, tname, cache_hint_flags] {
assert(src == Transaction::src_t::READ);
++(shard_stats.repeat_read_num);
return transaction_manager->with_transaction_intr(
src,
tname,
+ cache_hint_flags,
[&, this, ch, tname](auto& t)
{
LOG_PREFIX(SeaStoreS::repeat_with_onode);
@@ -567,6 +581,8 @@ public:
seastar::future<std::vector<coll_core_t>> list_collections() final;
+ seastar::future<std::string> get_default_device_class() final;
+
FuturizedStore::Shard& get_sharded_store() final {
return shard_stores.local();
}
diff --git a/src/crimson/os/seastore/seastore_types.cc b/src/crimson/os/seastore/seastore_types.cc
index f379dd0117c..a57f56d4ab4 100644
--- a/src/crimson/os/seastore/seastore_types.cc
+++ b/src/crimson/os/seastore/seastore_types.cc
@@ -246,6 +246,8 @@ std::ostream &operator<<(std::ostream &out, extent_types_t t)
return out << "LADDR_LEAF";
case extent_types_t::ONODE_BLOCK_STAGED:
return out << "ONODE_BLOCK_STAGED";
+ case extent_types_t::ROOT_META:
+ return out << "ROOT_META";
case extent_types_t::OMAP_INNER:
return out << "OMAP_INNER";
case extent_types_t::OMAP_LEAF:
@@ -349,11 +351,11 @@ std::ostream &operator<<(std::ostream &out, const delta_info_t &delta)
<< "type: " << delta.type
<< ", paddr: " << delta.paddr
<< ", laddr: " << delta.laddr
- << ", prev_crc: " << delta.prev_crc
- << ", final_crc: " << delta.final_crc
- << ", length: " << delta.length
+ << ", prev_crc: 0x" << std::hex << delta.prev_crc
+ << ", final_crc: 0x" << delta.final_crc
+ << ", length: 0x" << delta.length << std::dec
<< ", pversion: " << delta.pversion
- << ", ext_seq: " << delta.ext_seq
+ << ", ext_seq: " << segment_seq_printer_t{delta.ext_seq}
<< ", seg_type: " << delta.seg_type
<< ")";
}
@@ -371,7 +373,7 @@ std::ostream &operator<<(std::ostream &out, const extent_info_t &info)
return out << "extent_info_t("
<< "type: " << info.type
<< ", addr: " << info.addr
- << ", len: " << info.len
+ << ", len: 0x" << std::hex << info.len << std::dec
<< ")";
}
@@ -385,7 +387,7 @@ std::ostream &operator<<(std::ostream &out, const segment_header_t &header)
<< " " << rewrite_gen_printer_t{header.generation}
<< ", dirty_tail=" << header.dirty_tail
<< ", alloc_tail=" << header.alloc_tail
- << ", segment_nonce=" << header.segment_nonce
+ << ", segment_nonce=0x" << std::hex << header.segment_nonce << std::dec
<< ", modify_time=" << mod_time_point_printer_t{header.modify_time}
<< ")";
}
@@ -396,7 +398,7 @@ std::ostream &operator<<(std::ostream &out, const segment_tail_t &tail)
<< tail.physical_segment_id
<< " " << tail.type
<< " " << segment_seq_printer_t{tail.segment_seq}
- << ", segment_nonce=" << tail.segment_nonce
+ << ", segment_nonce=0x" << std::hex << tail.segment_nonce << std::dec
<< ", modify_time=" << mod_time_point_printer_t{tail.modify_time}
<< ", num_extents=" << tail.num_extents
<< ")";
@@ -462,8 +464,8 @@ std::ostream &operator<<(std::ostream& out, const record_size_t& rsize)
{
return out << "record_size_t("
<< "record_type=" << rsize.record_type
- << "raw_md=" << rsize.get_raw_mdlength()
- << ", data=" << rsize.dlength
+ << ", raw_md=0x" << std::hex << rsize.get_raw_mdlength() // ", " keeps this field from running into record_type; both lengths print in hex
+ << ", data=0x" << rsize.dlength << std::dec // restore decimal so later output on the stream is unaffected
<< ")";
}
@@ -507,11 +509,11 @@ std::ostream& operator<<(std::ostream& out, const record_group_header_t& h)
{
return out << "record_group_header_t("
<< "num_records=" << h.records
- << ", mdlength=" << h.mdlength
- << ", dlength=" << h.dlength
- << ", nonce=" << h.segment_nonce
+ << ", mdlength=0x" << std::hex << h.mdlength // hex stays in effect for the next two fields as well
+ << ", dlength=0x" << h.dlength
+ << ", segment_nonce=0x" << h.segment_nonce << std::dec // back to decimal before committed_to is printed
<< ", committed_to=" << h.committed_to
- << ", data_crc=" << h.data_crc
+ << ", data_crc=0x" << std::hex << h.data_crc << std::dec // hex for the crc only, then decimal is restored
<< ")";
}
@@ -554,9 +556,9 @@ std::ostream& operator<<(std::ostream& out, const record_group_size_t& size)
{
return out << "record_group_size_t("
<< "record_type=" << size.record_type
- << "raw_md=" << size.get_raw_mdlength()
- << ", data=" << size.dlength
- << ", block_size=" << size.block_size
+ << ", raw_md=0x" << std::hex << size.get_raw_mdlength() // ", " keeps this field from running into record_type; hex stays in effect for the next two fields
+ << ", data=0x" << size.dlength
+ << ", block_size=0x" << size.block_size << std::dec // back to decimal before fullness is printed
<< ", fullness=" << size.get_fullness()
<< ")";
}
@@ -911,7 +913,7 @@ std::ostream& operator<<(std::ostream& out, const write_result_t& w)
{
return out << "write_result_t("
<< "start=" << w.start_seq
- << ", length=" << w.length
+ << ", length=0x" << std::hex << w.length << std::dec // length prints in hex; decimal is restored for later output
<< ")";
}
diff --git a/src/crimson/os/seastore/seastore_types.h b/src/crimson/os/seastore/seastore_types.h
index df5c184e7ab..5930469ca07 100644
--- a/src/crimson/os/seastore/seastore_types.h
+++ b/src/crimson/os/seastore/seastore_types.h
@@ -3,6 +3,7 @@
#pragma once
+#include <deque>
#include <limits>
#include <numeric>
#include <optional>
@@ -14,13 +15,47 @@
#include "include/byteorder.h"
#include "include/denc.h"
+#include "include/encoding.h"
#include "include/buffer.h"
#include "include/intarith.h"
#include "include/interval_set.h"
#include "include/uuid.h"
+#include "include/rados.h"
namespace crimson::os::seastore {
+class cache_hint_t { // two-valued cache hint: TOUCH (the default) or NOCACHE
+ enum hint_t {
+ TOUCH,
+ NOCACHE
+ };
+public:
+ static constexpr cache_hint_t get_touch() {
+ return hint_t::TOUCH;
+ }
+ static constexpr cache_hint_t get_nocache() {
+ return hint_t::NOCACHE;
+ }
+ cache_hint_t(uint32_t flags) { // implicit on purpose so callers can pass raw op flags; hint stays TOUCH unless one of the two fadvise bits below is set
+ if (unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) ||
+ unlikely(flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE)) {
+ hint = NOCACHE;
+ }
+ }
+ bool operator==(const cache_hint_t &other) const {
+ return hint == other.hint;
+ }
+ bool operator!=(const cache_hint_t &other) const {
+ return hint != other.hint;
+ }
+private:
+ constexpr cache_hint_t(hint_t hint) : hint(hint) {} // private: used only by get_touch() / get_nocache()
+ hint_t hint = hint_t::TOUCH;
+};
+
+inline constexpr cache_hint_t CACHE_HINT_TOUCH = cache_hint_t::get_touch();
+inline constexpr cache_hint_t CACHE_HINT_NOCACHE = cache_hint_t::get_nocache();
+
/* using a special xattr key "omap_header" to store omap header */
const std::string OMAP_HEADER_XATTR_KEY = "omap_header";
@@ -1226,7 +1261,6 @@ constexpr laddr_t L_ADDR_MAX = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX);
constexpr laddr_t L_ADDR_MIN = laddr_t::from_raw_uint(0);
constexpr laddr_t L_ADDR_NULL = L_ADDR_MAX;
constexpr laddr_t L_ADDR_ROOT = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX - 1);
-constexpr laddr_t L_ADDR_LBAT = laddr_t::from_raw_uint(laddr_t::RAW_VALUE_MAX - 2);
struct __attribute__((packed)) laddr_le_t {
ceph_le64 laddr;
@@ -1378,23 +1412,24 @@ enum class extent_types_t : uint8_t {
LADDR_INTERNAL = 1,
LADDR_LEAF = 2,
DINK_LADDR_LEAF = 3, // should only be used for unitttests
- OMAP_INNER = 4,
- OMAP_LEAF = 5,
- ONODE_BLOCK_STAGED = 6,
- COLL_BLOCK = 7,
- OBJECT_DATA_BLOCK = 8,
- RETIRED_PLACEHOLDER = 9,
+ ROOT_META = 4,
+ OMAP_INNER = 5,
+ OMAP_LEAF = 6,
+ ONODE_BLOCK_STAGED = 7,
+ COLL_BLOCK = 8,
+ OBJECT_DATA_BLOCK = 9,
+ RETIRED_PLACEHOLDER = 10,
// the following two types are not extent types,
// they are just used to indicates paddr allocation deltas
- ALLOC_INFO = 10,
- JOURNAL_TAIL = 11,
+ ALLOC_INFO = 11,
+ JOURNAL_TAIL = 12,
// Test Block Types
- TEST_BLOCK = 12,
- TEST_BLOCK_PHYSICAL = 13,
- BACKREF_INTERNAL = 14,
- BACKREF_LEAF = 15,
+ TEST_BLOCK = 13,
+ TEST_BLOCK_PHYSICAL = 14,
+ BACKREF_INTERNAL = 15,
+ BACKREF_LEAF = 16,
// None and the number of valid extent_types_t
- NONE = 16,
+ NONE = 17,
};
using extent_types_le_t = uint8_t;
constexpr auto EXTENT_TYPES_MAX = static_cast<uint8_t>(extent_types_t::NONE);
@@ -1409,12 +1444,12 @@ constexpr bool is_data_type(extent_types_t type) {
}
constexpr bool is_logical_metadata_type(extent_types_t type) {
- return type >= extent_types_t::OMAP_INNER &&
+ return type >= extent_types_t::ROOT_META && // range check relies on enum order: ROOT_META now sits directly before OMAP_INNER
type <= extent_types_t::COLL_BLOCK;
}
constexpr bool is_logical_type(extent_types_t type) {
- if ((type >= extent_types_t::OMAP_INNER &&
+ if ((type >= extent_types_t::ROOT_META &&
type <= extent_types_t::OBJECT_DATA_BLOCK) ||
type == extent_types_t::TEST_BLOCK) {
assert(is_logical_metadata_type(type) ||
@@ -1466,6 +1501,23 @@ constexpr bool is_physical_type(extent_types_t type) {
}
}
+constexpr bool is_backref_mapped_type(extent_types_t type) { // true for LADDR_INTERNAL..OBJECT_DATA_BLOCK (by enum order), TEST_BLOCK and TEST_BLOCK_PHYSICAL
+ if ((type >= extent_types_t::LADDR_INTERNAL &&
+ type <= extent_types_t::OBJECT_DATA_BLOCK) ||
+ type == extent_types_t::TEST_BLOCK ||
+ type == extent_types_t::TEST_BLOCK_PHYSICAL) {
+ assert(is_logical_type(type) || // cross-check: the range above must agree with the per-category predicates
+ is_lba_node(type) ||
+ type == extent_types_t::TEST_BLOCK_PHYSICAL);
+ return true;
+ } else {
+ assert(!is_logical_type(type) && // cross-check: nothing outside the range may be logical, an lba node or TEST_BLOCK_PHYSICAL
+ !is_lba_node(type) &&
+ type != extent_types_t::TEST_BLOCK_PHYSICAL);
+ return false;
+ }
+}
+
constexpr bool is_real_type(extent_types_t type) {
if (type <= extent_types_t::OBJECT_DATA_BLOCK ||
(type >= extent_types_t::TEST_BLOCK &&
@@ -1617,8 +1669,8 @@ struct delta_info_t {
extent_types_t type = extent_types_t::NONE; ///< delta type
paddr_t paddr; ///< physical address
laddr_t laddr = L_ADDR_NULL; ///< logical address
- uint32_t prev_crc = 0;
- uint32_t final_crc = 0;
+ checksum_t prev_crc = 0;
+ checksum_t final_crc = 0;
extent_len_t length = 0; ///< extent length
extent_version_t pversion; ///< prior version
segment_seq_t ext_seq; ///< seq of the extent's segment
@@ -1926,54 +1978,29 @@ using backref_root_t = phy_tree_root_t;
* TODO: generalize this to permit more than one lba_manager implementation
*/
struct __attribute__((packed)) root_t {
- using meta_t = std::map<std::string, std::string>;
-
- static constexpr int MAX_META_LENGTH = 1024;
-
backref_root_t backref_root;
lba_root_t lba_root;
laddr_le_t onode_root;
coll_root_le_t collection_root;
+ laddr_le_t meta;
- char meta[MAX_META_LENGTH];
-
- root_t() {
- set_meta(meta_t{});
- }
+ root_t() = default;
void adjust_addrs_from_base(paddr_t base) {
lba_root.adjust_addrs_from_base(base);
backref_root.adjust_addrs_from_base(base);
}
-
- meta_t get_meta() {
- bufferlist bl;
- bl.append(ceph::buffer::create_static(MAX_META_LENGTH, meta));
- meta_t ret;
- auto iter = bl.cbegin();
- decode(ret, iter);
- return ret;
- }
-
- void set_meta(const meta_t &m) {
- ceph::bufferlist bl;
- encode(m, bl);
- ceph_assert(bl.length() < MAX_META_LENGTH);
- bl.rebuild();
- auto &bptr = bl.front();
- ::memset(meta, 0, MAX_META_LENGTH);
- ::memcpy(meta, bptr.c_str(), bl.length());
- }
};
struct alloc_blk_t {
alloc_blk_t(
- paddr_t paddr,
- laddr_t laddr,
+ const paddr_t& paddr,
+ const laddr_t& laddr,
extent_len_t len,
extent_types_t type)
- : paddr(paddr), laddr(laddr), len(len), type(type)
- {}
+ : paddr(paddr), laddr(laddr), len(len), type(type) {
+ assert(len > 0);
+ }
explicit alloc_blk_t() = default;
@@ -1989,6 +2016,25 @@ struct alloc_blk_t {
denc(v.type, p);
DENC_FINISH(p);
}
+
+ static alloc_blk_t create_alloc(
+ const paddr_t& paddr,
+ const laddr_t& laddr,
+ extent_len_t len,
+ extent_types_t type) {
+ assert(is_backref_mapped_type(type));
+ assert(laddr != L_ADDR_NULL);
+ return alloc_blk_t(paddr, laddr, len, type);
+ }
+
+ static alloc_blk_t create_retire(
+ const paddr_t& paddr,
+ extent_len_t len,
+ extent_types_t type) {
+ assert(is_backref_mapped_type(type) ||
+ is_retired_placeholder_type(type));
+ return alloc_blk_t(paddr, L_ADDR_NULL, len, type);
+ }
};
// use absolute address
diff --git a/src/crimson/os/seastore/segment_manager.cc b/src/crimson/os/seastore/segment_manager.cc
index 1be9cce5f6b..3eced41081e 100644
--- a/src/crimson/os/seastore/segment_manager.cc
+++ b/src/crimson/os/seastore/segment_manager.cc
@@ -16,10 +16,10 @@ namespace crimson::os::seastore {
std::ostream& operator<<(std::ostream& out, const block_shard_info_t& sf)
{
out << "("
- << "size=" << sf.size
- << ", segments=" <<sf.segments
- << ", tracker_offset=" <<sf.tracker_offset
- << ", first_segment_offset=" <<sf.first_segment_offset
+ << "size=0x" << std::hex << sf.size << std::dec
+ << ", segments=" << sf.segments
+ << ", tracker_offset=0x" << std::hex << sf.tracker_offset
+ << ", first_segment_offset=0x" << sf.first_segment_offset << std::dec
<<")";
return out;
}
@@ -28,8 +28,8 @@ std::ostream& operator<<(std::ostream& out, const block_sm_superblock_t& sb)
{
out << "superblock("
<< "shard_num=" << sb.shard_num
- << ", segment_size=" << sb.segment_size
- << ", block_size=" << sb.block_size
+ << ", segment_size=0x" << std::hex << sb.segment_size
+ << ", block_size=0x" << sb.block_size << std::dec
<< ", shard_info:";
for (auto &sf : sb.shard_infos) {
out << sf
diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc
index 0500271f81a..7077aad7407 100644
--- a/src/crimson/os/seastore/segment_manager/block.cc
+++ b/src/crimson/os/seastore/segment_manager/block.cc
@@ -60,7 +60,7 @@ static write_ertr::future<> do_write(
{
LOG_PREFIX(block_do_write);
auto len = bptr.length();
- TRACE("{} poffset={}~{} ...",
+ TRACE("{} poffset=0x{:x}~0x{:x} ...",
device_id_printer_t{device_id}, offset, len);
return device.dma_write(
offset,
@@ -68,16 +68,16 @@ static write_ertr::future<> do_write(
len
).handle_exception(
[FNAME, device_id, offset, len](auto e) -> write_ertr::future<size_t> {
- ERROR("{} poffset={}~{} got error -- {}",
+ ERROR("{} poffset=0x{:x}~0x{:x} got error -- {}",
device_id_printer_t{device_id}, offset, len, e);
return crimson::ct_error::input_output_error::make();
}).then([FNAME, device_id, offset, len](auto result) -> write_ertr::future<> {
if (result != len) {
- ERROR("{} poffset={}~{} write len={} inconsistent",
+ ERROR("{} poffset=0x{:x}~0x{:x} write len=0x{:x} inconsistent",
device_id_printer_t{device_id}, offset, len, result);
return crimson::ct_error::input_output_error::make();
}
- TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len);
+ TRACE("{} poffset=0x{:x}~0x{:x} done", device_id_printer_t{device_id}, offset, len);
return write_ertr::now();
});
}
@@ -90,7 +90,7 @@ static write_ertr::future<> do_writev(
size_t block_size)
{
LOG_PREFIX(block_do_writev);
- TRACE("{} poffset={}~{}, {} buffers",
+ TRACE("{} poffset=0x{:x}~0x{:x}, {} buffers",
device_id_printer_t{device_id}, offset, bl.length(), bl.get_num_buffers());
// writev requires each buffer to be aligned to the disks' block
@@ -109,22 +109,22 @@ static write_ertr::future<> do_writev(
auto off = offset + p.offset;
auto len = p.length;
auto& iov = p.iov;
- TRACE("{} poffset={}~{} dma_write ...",
+ TRACE("{} poffset=0x{:x}~0x{:x} dma_write ...",
device_id_printer_t{device_id}, off, len);
return device.dma_write(off, std::move(iov)
).handle_exception(
[FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
{
- ERROR("{} poffset={}~{} dma_write got error -- {}",
+ ERROR("{} poffset=0x{:x}~0x{:x} dma_write got error -- {}",
device_id_printer_t{device_id}, off, len, e);
return crimson::ct_error::input_output_error::make();
}).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
if (written != len) {
- ERROR("{} poffset={}~{} dma_write len={} inconsistent",
+ ERROR("{} poffset=0x{:x}~0x{:x} dma_write len=0x{:x} inconsistent",
device_id_printer_t{device_id}, off, len, written);
return crimson::ct_error::input_output_error::make();
}
- TRACE("{} poffset={}~{} dma_write done",
+ TRACE("{} poffset=0x{:x}~0x{:x} dma_write done",
device_id_printer_t{device_id}, off, len);
return write_ertr::now();
});
@@ -140,7 +140,7 @@ static read_ertr::future<> do_read(
bufferptr &bptr)
{
LOG_PREFIX(block_do_read);
- TRACE("{} poffset={}~{} ...", device_id_printer_t{device_id}, offset, len);
+ TRACE("{} poffset=0x{:x}~0x{:x} ...", device_id_printer_t{device_id}, offset, len);
assert(len <= bptr.length());
return device.dma_read(
offset,
@@ -153,16 +153,16 @@ static read_ertr::future<> do_read(
// once seastar::future<T>::handle_exception() returns seastar::futurize_t<T>
[FNAME, device_id, offset, len](auto e) -> read_ertr::future<size_t>
{
- ERROR("{} poffset={}~{} got error -- {}",
+ ERROR("{} poffset=0x{:x}~0x{:x} got error -- {}",
device_id_printer_t{device_id}, offset, len, e);
return crimson::ct_error::input_output_error::make();
}).then([FNAME, device_id, offset, len](auto result) -> read_ertr::future<> {
if (result != len) {
- ERROR("{} poffset={}~{} read len={} inconsistent",
+ ERROR("{} poffset=0x{:x}~0x{:x} read len=0x{:x} inconsistent",
device_id_printer_t{device_id}, offset, len, result);
return crimson::ct_error::input_output_error::make();
}
- TRACE("{} poffset={}~{} done", device_id_printer_t{device_id}, offset, len);
+ TRACE("{} poffset=0x{:x}~0x{:x} done", device_id_printer_t{device_id}, offset, len);
return read_ertr::now();
});
}
@@ -174,7 +174,7 @@ SegmentStateTracker::write_out(
uint64_t offset)
{
LOG_PREFIX(SegmentStateTracker::write_out);
- DEBUG("{} poffset={}~{}",
+ DEBUG("{} poffset=0x{:x}~0x{:x}",
device_id_printer_t{device_id}, offset, bptr.length());
return do_write(device_id, device, offset, bptr);
}
@@ -186,7 +186,7 @@ SegmentStateTracker::read_in(
uint64_t offset)
{
LOG_PREFIX(SegmentStateTracker::read_in);
- DEBUG("{} poffset={}~{}",
+ DEBUG("{} poffset=0x{:x}~0x{:x}",
device_id_printer_t{device_id}, offset, bptr.length());
return do_read(
device_id,
@@ -230,7 +230,7 @@ block_sm_superblock_t make_superblock(
+ i * segments_per_shard * config_segment_size;
}
- INFO("{} disk_size={}, segment_size={}, block_size={}",
+ INFO("{} disk_size=0x{:x}, segment_size=0x{:x}, block_size=0x{:x}",
device_id_printer_t{device_id},
size,
uint64_t(config_segment_size),
@@ -255,7 +255,7 @@ static check_create_device_ret check_create_device(
size_t size)
{
LOG_PREFIX(block_check_create_device);
- INFO("path={}, size={}", path, size);
+ INFO("path={}, size=0x{:x}", path, size);
return seastar::open_file_dma(
path,
seastar::open_flags::exclusive |
@@ -266,7 +266,7 @@ static check_create_device_ret check_create_device(
file,
[size, FNAME, &path](auto &f) -> seastar::future<>
{
- DEBUG("path={} created, truncating to {}", path, size);
+ DEBUG("path={} created, truncating to 0x{:x}", path, size);
ceph_assert(f);
return f.truncate(
size
@@ -318,8 +318,8 @@ open_device_ret open_device(
).then([stat, &path, FNAME](auto file) mutable {
return file.size().then([stat, file, &path, FNAME](auto size) mutable {
stat.size = size;
- INFO("path={} successful, size={}, block_size={}",
- path, stat.size, stat.block_size);
+ INFO("path={} successful, size=0x{:x}, block_size=0x{:x}",
+ path, stat.size, stat.block_size);
return std::make_pair(file, stat);
});
});
@@ -410,19 +410,19 @@ Segment::write_ertr::future<> BlockSegment::write(
{
LOG_PREFIX(BlockSegment::write);
auto paddr = paddr_t::make_seg_paddr(id, offset);
- DEBUG("{} offset={}~{} poffset={} ...",
+ DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} ...",
id, offset, bl.length(), manager.get_offset(paddr));
if (offset < write_pointer ||
offset % manager.superblock.block_size != 0 ||
bl.length() % manager.superblock.block_size != 0) {
- ERROR("{} offset={}~{} poffset={} invalid write",
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid write",
id, offset, bl.length(), manager.get_offset(paddr));
return crimson::ct_error::invarg::make();
}
if (offset + bl.length() > manager.superblock.segment_size) {
- ERROR("{} offset={}~{} poffset={} write out of the range {}",
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} write out of the range 0x{:x}",
id, offset, bl.length(), manager.get_offset(paddr),
manager.superblock.segment_size);
return crimson::ct_error::enospc::make();
@@ -443,7 +443,7 @@ Segment::close_ertr::future<> BlockSegmentManager::segment_close(
LOG_PREFIX(BlockSegmentManager::segment_close);
auto s_id = id.device_segment_id();
int unused_bytes = get_segment_size() - write_pointer;
- INFO("{} unused_bytes={} ...", id, unused_bytes);
+ INFO("{} unused_bytes=0x{:x} ...", id, unused_bytes);
assert(unused_bytes >= 0);
assert(id.device_id() == get_device_id());
@@ -693,24 +693,24 @@ SegmentManager::read_ertr::future<> BlockSegmentManager::read(
auto s_id = id.device_segment_id();
auto s_off = seg_addr.get_segment_off();
auto p_off = get_offset(addr);
- DEBUG("{} offset={}~{} poffset={} ...", id, s_off, len, p_off);
+ DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} ...", id, s_off, len, p_off);
assert(addr.get_device_id() == get_device_id());
if (s_off % superblock.block_size != 0 ||
len % superblock.block_size != 0) {
- ERROR("{} offset={}~{} poffset={} invalid read", id, s_off, len, p_off);
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid read", id, s_off, len, p_off);
return crimson::ct_error::invarg::make();
}
if (s_id >= get_num_segments()) {
- ERROR("{} offset={}~{} poffset={} segment-id out of range {}",
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} segment-id out of range {}",
id, s_off, len, p_off, get_num_segments());
return crimson::ct_error::invarg::make();
}
if (s_off + len > superblock.segment_size) {
- ERROR("{} offset={}~{} poffset={} read out of range {}",
+ ERROR("{} offset=0x{:x}~0x{:x} poffset=0x{:x} read out of range 0x{:x}",
id, s_off, len, p_off, superblock.segment_size);
return crimson::ct_error::invarg::make();
}
@@ -718,7 +718,7 @@ SegmentManager::read_ertr::future<> BlockSegmentManager::read(
if (tracker->get(s_id) == segment_state_t::EMPTY) {
// XXX: not an error during scanning,
// might need refactor to increase the log level
- DEBUG("{} offset={}~{} poffset={} invalid state {}",
+ DEBUG("{} offset=0x{:x}~0x{:x} poffset=0x{:x} invalid state {}",
id, s_off, len, p_off, tracker->get(s_id));
return crimson::ct_error::enoent::make();
}
diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc
index 4a4873afb94..bdd97e88733 100644
--- a/src/crimson/os/seastore/segment_manager/ephemeral.cc
+++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc
@@ -20,8 +20,11 @@ namespace {
namespace crimson::os::seastore::segment_manager {
std::ostream &operator<<(std::ostream &lhs, const ephemeral_config_t &c) {
- return lhs << "ephemeral_config_t(size=" << c.size << ", block_size=" << c.block_size
- << ", segment_size=" << c.segment_size << ")";
+ return lhs << "ephemeral_config_t(size=0x"
+ << std::hex << c.size
+ << ", block_size=0x" << c.block_size
+ << ", segment_size=0x" << c.segment_size
+ << std::dec << ")";
}
EphemeralSegmentManagerRef create_test_ephemeral() {
@@ -141,7 +144,8 @@ Segment::write_ertr::future<> EphemeralSegmentManager::segment_write(
{
auto& seg_addr = addr.as_seg_paddr();
logger().debug(
- "segment_write to segment {} at offset {}, physical offset {}, len {}, crc {}",
+ "segment_write to segment {} at offset 0x{:x}, "
+ "physical offset 0x{:x}, len 0x{:x}, crc 0x{:x}",
seg_addr.get_segment_id(),
seg_addr.get_segment_off(),
get_offset(addr),
@@ -268,7 +272,7 @@ SegmentManager::read_ertr::future<> EphemeralSegmentManager::read(
if (seg_addr.get_segment_off() + len > config.segment_size) {
logger().error(
- "EphemeralSegmentManager::read: invalid offset {}~{}!",
+ "EphemeralSegmentManager::read: invalid offset {}~0x{:x}!",
addr,
len);
return crimson::ct_error::invarg::make();
@@ -279,7 +283,8 @@ SegmentManager::read_ertr::future<> EphemeralSegmentManager::read(
bufferlist bl;
bl.push_back(out);
logger().debug(
- "segment_read to segment {} at offset {}, physical offset {}, length {}, crc {}",
+ "segment_read to segment {} at offset 0x{:x}, "
+ "physical offset 0x{:x}, length 0x{:x}, crc 0x{:x}",
seg_addr.get_segment_id().device_segment_id(),
seg_addr.get_segment_off(),
get_offset(addr),
diff --git a/src/crimson/os/seastore/segment_manager/zbd.cc b/src/crimson/os/seastore/segment_manager/zbd.cc
index 88521a947f8..22efbed5940 100644
--- a/src/crimson/os/seastore/segment_manager/zbd.cc
+++ b/src/crimson/os/seastore/segment_manager/zbd.cc
@@ -56,7 +56,7 @@ static open_device_ret open_device(
path, seastar::follow_symlink::yes
).then([FNAME, mode, &path](auto stat) mutable {
return seastar::open_file_dma(path, mode).then([=](auto file) {
- DEBUG("open of device {} successful, size {}",
+ DEBUG("open of device {} successful, size 0x{:x}",
path,
stat.size);
return std::make_pair(file, stat);
@@ -100,11 +100,12 @@ static zbd_sm_metadata_t make_metadata(
WARN("Ignoring configuration values for device and segment size");
INFO(
- "device size: {}, available size: {}, block size: {}, allocated size: {},"
- " total zones {}, zone size: {}, zone capacity: {},"
- " total segments: {}, zones per segment: {}, segment size: {}"
+ "device size: 0x{:x}, available size: 0x{:x},"
+ " block size: 0x{:x}, allocated size: 0x{:x},"
+ " total zones {}, zone size: 0x{:x}, zone capacity: 0x{:x},"
+ " total segments: {}, zones per segment: {}, segment size: 0x{:x}"
" conv zones: {}, swr zones: {}, per shard segments: {}"
- " per shard available size: {}",
+ " per shard available size: 0x{:x}",
total_size,
available_size,
data.block_size,
@@ -126,8 +127,8 @@ static zbd_sm_metadata_t make_metadata(
shard_infos[i].segments = per_shard_segments;
shard_infos[i].first_segment_offset = zone_size * skipped_zones
+ i * segment_size * per_shard_segments;
- INFO("First segment offset for shard {} is: {}",
- i, shard_infos[i].first_segment_offset);
+ INFO("First segment offset for shard {} is: 0x{:x}",
+ i, shard_infos[i].first_segment_offset);
}
zbd_sm_metadata_t ret = zbd_sm_metadata_t{
@@ -248,7 +249,7 @@ static write_ertr::future<> do_write(
bufferptr &bptr)
{
LOG_PREFIX(ZBDSegmentManager::do_write);
- DEBUG("offset {} len {}",
+ DEBUG("offset 0x{:x} len 0x{:x}",
offset,
bptr.length());
return device.dma_write(
@@ -277,7 +278,7 @@ static write_ertr::future<> do_writev(
size_t block_size)
{
LOG_PREFIX(ZBDSegmentManager::do_writev);
- DEBUG("{} offset {} len {}",
+ DEBUG("{} offset 0x{:x} len 0x{:x}",
device_id_printer_t{device_id}, offset, bl.length());
// writev requires each buffer to be aligned to the disks' block
// size, we need to rebuild here
@@ -295,23 +296,23 @@ static write_ertr::future<> do_writev(
auto off = offset + p.offset;
auto len = p.length;
auto& iov = p.iov;
- DEBUG("{} poffset={}~{} dma_write ...",
+ DEBUG("{} poffset=0x{:x}~0x{:x} dma_write ...",
device_id_printer_t{device_id},
off, len);
return device.dma_write(off, std::move(iov)
).handle_exception(
[FNAME, device_id, off, len](auto e) -> write_ertr::future<size_t>
{
- ERROR("{} poffset={}~{} dma_write got error -- {}",
+ ERROR("{} poffset=0x{:x}~0x{:x} dma_write got error -- {}",
device_id_printer_t{device_id}, off, len, e);
return crimson::ct_error::input_output_error::make();
}).then([FNAME, device_id, off, len](size_t written) -> write_ertr::future<> {
if (written != len) {
- ERROR("{} poffset={}~{} dma_write len={} inconsistent",
+ ERROR("{} poffset=0x{:x}~0x{:x} dma_write len=0x{:x} inconsistent",
device_id_printer_t{device_id}, off, len, written);
return crimson::ct_error::input_output_error::make();
}
- DEBUG("{} poffset={}~{} dma_write done",
+ DEBUG("{} poffset=0x{:x}~0x{:x} dma_write done",
device_id_printer_t{device_id},
off, len);
return write_ertr::now();
@@ -329,12 +330,12 @@ write_metadata(seastar::file &device, zbd_sm_metadata_t sb)
bufferptr(ceph::buffer::create_page_aligned(sb.block_size)),
[=, &device](auto &bp) {
LOG_PREFIX(ZBDSegmentManager::write_metadata);
- DEBUG("block_size {}", sb.block_size);
+ DEBUG("block_size 0x{:x}", sb.block_size);
bufferlist bl;
encode(sb, bl);
auto iter = bl.begin();
assert(bl.length() < sb.block_size);
- DEBUG("buffer length {}", bl.length());
+ DEBUG("buffer length 0x{:x}", bl.length());
iter.copy(bl.length(), bp.c_str());
DEBUG("doing writeout");
return do_write(device, 0, bp);
@@ -349,7 +350,7 @@ static read_ertr::future<> do_read(
{
LOG_PREFIX(ZBDSegmentManager::do_read);
assert(len <= bptr.length());
- DEBUG("offset {} len {}",
+ DEBUG("offset 0x{:x} len 0x{:x}",
offset,
len);
return device.dma_read(
@@ -659,7 +660,7 @@ SegmentManager::read_ertr::future<> ZBDSegmentManager::read(
}
if (seg_addr.get_segment_off() + len > metadata.segment_capacity) {
- ERROR("invalid read offset {}, len {}",
+ ERROR("invalid read offset {}, len 0x{:x}",
addr,
len);
return crimson::ct_error::invarg::make();
@@ -703,7 +704,7 @@ Segment::write_ertr::future<> ZBDSegmentManager::segment_write(
assert(addr.get_device_id() == get_device_id());
assert((bl.length() % metadata.block_size) == 0);
auto& seg_addr = addr.as_seg_paddr();
- DEBUG("write to segment {} at offset {}, physical offset {}, len {}",
+ DEBUG("write to segment {} at offset 0x{:x}, physical offset 0x{:x}, len 0x{:x}",
seg_addr.get_segment_id(),
seg_addr.get_segment_off(),
get_offset(addr),
@@ -756,7 +757,7 @@ Segment::write_ertr::future<> ZBDSegment::write(
LOG_PREFIX(ZBDSegment::write);
if (offset != write_pointer || offset % manager.metadata.block_size != 0) {
ERROR("Segment offset and zone write pointer mismatch. "
- "segment {} segment-offset {} write pointer {}",
+ "segment {} segment-offset 0x{:x} write pointer 0x{:x}",
id, offset, write_pointer);
return crimson::ct_error::invarg::make();
}
@@ -772,7 +773,7 @@ Segment::write_ertr::future<> ZBDSegment::write_padding_bytes(
size_t padding_bytes)
{
LOG_PREFIX(ZBDSegment::write_padding_bytes);
- DEBUG("Writing {} padding bytes to segment {} at wp {}",
+ DEBUG("Writing 0x{:x} padding bytes to segment {} at wp 0x{:x}",
padding_bytes, id, write_pointer);
return crimson::repeat([FNAME, padding_bytes, this] () mutable {
@@ -804,7 +805,7 @@ Segment::write_ertr::future<> ZBDSegment::advance_wp(
{
LOG_PREFIX(ZBDSegment::advance_wp);
- DEBUG("Advancing write pointer from {} to {}", write_pointer, offset);
+ DEBUG("Advancing write pointer from 0x{:x} to 0x{:x}", write_pointer, offset);
if (offset < write_pointer) {
return crimson::ct_error::invarg::make();
}
diff --git a/src/crimson/os/seastore/segment_manager_group.cc b/src/crimson/os/seastore/segment_manager_group.cc
index 332b794b70e..f4822c9a18c 100644
--- a/src/crimson/os/seastore/segment_manager_group.cc
+++ b/src/crimson/os/seastore/segment_manager_group.cc
@@ -26,13 +26,13 @@ SegmentManagerGroup::read_segment_tail(segment_id_t segment)
}
).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_tail_ret {
LOG_PREFIX(SegmentManagerGroup::read_segment_tail);
- DEBUG("segment {} bptr size {}", segment, bptr.length());
+ DEBUG("segment {} bptr size 0x{:x}", segment, bptr.length());
segment_tail_t tail;
bufferlist bl;
bl.push_back(bptr);
- DEBUG("segment {} block crc {}",
+ DEBUG("segment {} block crc 0x{:x}",
segment,
bl.begin().crc32c(segment_manager.get_block_size(), 0));
@@ -66,13 +66,13 @@ SegmentManagerGroup::read_segment_header(segment_id_t segment)
}
).safe_then([=, &segment_manager](bufferptr bptr) -> read_segment_header_ret {
LOG_PREFIX(SegmentManagerGroup::read_segment_header);
- DEBUG("segment {} bptr size {}", segment, bptr.length());
+ DEBUG("segment {} bptr size 0x{:x}", segment, bptr.length());
segment_header_t header;
bufferlist bl;
bl.push_back(bptr);
- DEBUG("segment {} block crc {}",
+ DEBUG("segment {} block crc 0x{:x}",
segment,
bl.begin().crc32c(segment_manager.get_block_size(), 0));
@@ -111,7 +111,7 @@ SegmentManagerGroup::read(paddr_t start, size_t len)
LOG_PREFIX(SegmentManagerGroup::read);
assert(has_device(start.get_device_id()));
auto& segment_manager = *segment_managers[start.get_device_id()];
- TRACE("reading data {}~{}", start, len);
+ TRACE("reading data {}~0x{:x}", start, len);
return segment_manager.read(
start,
len
diff --git a/src/crimson/os/seastore/transaction.h b/src/crimson/os/seastore/transaction.h
index 5d8ad00ba22..cd8c333c69f 100644
--- a/src/crimson/os/seastore/transaction.h
+++ b/src/crimson/os/seastore/transaction.h
@@ -8,16 +8,17 @@
#include <boost/intrusive/list.hpp>
#include "crimson/common/log.h"
+#include "crimson/os/seastore/backref_entry.h"
+#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/logging.h"
#include "crimson/os/seastore/ordering_handle.h"
-#include "crimson/os/seastore/seastore_types.h"
-#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/root_block.h"
+#include "crimson/os/seastore/seastore_types.h"
+#include "crimson/os/seastore/transaction_interruptor.h"
namespace crimson::os::seastore {
class SeaStore;
-class Transaction;
struct io_stat_t {
uint64_t num = 0;
@@ -408,12 +409,14 @@ public:
src_t src,
journal_seq_t initiated_after,
on_destruct_func_t&& f,
- transaction_id_t trans_id
+ transaction_id_t trans_id,
+ cache_hint_t cache_hint
) : weak(weak),
handle(std::move(handle)),
on_destruct(std::move(f)),
src(src),
- trans_id(trans_id)
+ trans_id(trans_id),
+ cache_hint(cache_hint)
{}
void invalidate_clear_write_set() {
@@ -460,6 +463,7 @@ public:
ool_write_stats = {};
rewrite_stats = {};
conflicted = false;
+ assert(backref_entries.empty());
if (!has_reset) {
has_reset = true;
}
@@ -571,10 +575,23 @@ public:
return pre_alloc_list;
}
+ cache_hint_t get_cache_hint() const {
+ return cache_hint;
+ }
+
private:
friend class Cache;
friend Ref make_test_transaction();
+ void set_backref_entries(backref_entry_refs_t&& entries) {
+ assert(backref_entries.empty());
+ backref_entries = std::move(entries);
+ }
+
+ backref_entry_refs_t move_backref_entries() {
+ return std::move(backref_entries);
+ }
+
/**
* If set, *this may not be used to perform writes and will not provide
* consistentency allowing operations using to avoid maintaining a read_set.
@@ -669,6 +686,10 @@ private:
transaction_id_t trans_id = TRANS_ID_NULL;
seastar::lw_shared_ptr<rbm_pending_ool_t> pending_ool;
+
+ backref_entry_refs_t backref_entries;
+
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH;
};
using TransactionRef = Transaction::Ref;
@@ -681,67 +702,11 @@ inline TransactionRef make_test_transaction() {
Transaction::src_t::MUTATE,
JOURNAL_SEQ_NULL,
[](Transaction&) {},
- ++next_id
+ ++next_id,
+ CACHE_HINT_TOUCH
);
}
-struct TransactionConflictCondition {
- class transaction_conflict final : public std::exception {
- public:
- const char* what() const noexcept final {
- return "transaction conflict detected";
- }
- };
-
-public:
- TransactionConflictCondition(Transaction &t) : t(t) {}
-
- template <typename Fut>
- std::optional<Fut> may_interrupt() {
- if (t.conflicted) {
- return seastar::futurize<Fut>::make_exception_future(
- transaction_conflict());
- } else {
- return std::optional<Fut>();
- }
- }
-
- template <typename T>
- static constexpr bool is_interruption_v =
- std::is_same_v<T, transaction_conflict>;
-
-
- static bool is_interruption(std::exception_ptr& eptr) {
- return *eptr.__cxa_exception_type() == typeid(transaction_conflict);
- }
-
-private:
- Transaction &t;
-};
-
-using trans_intr = crimson::interruptible::interruptor<
- TransactionConflictCondition
- >;
-
-template <typename E>
-using trans_iertr =
- crimson::interruptible::interruptible_errorator<
- TransactionConflictCondition,
- E
- >;
-
-template <typename F, typename... Args>
-auto with_trans_intr(Transaction &t, F &&f, Args&&... args) {
- return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>(
- std::move(f),
- TransactionConflictCondition(t),
- t,
- std::forward<Args>(args)...);
-}
-
-template <typename T>
-using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>;
-
}
#if FMT_VERSION >= 90000
diff --git a/src/crimson/os/seastore/transaction_interruptor.cc b/src/crimson/os/seastore/transaction_interruptor.cc
new file mode 100644
index 00000000000..d22f760f2db
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_interruptor.cc
@@ -0,0 +1,15 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "crimson/os/seastore/transaction_interruptor.h"
+
+#include "crimson/os/seastore/transaction.h"
+
+namespace crimson::os::seastore {
+
+bool TransactionConflictCondition::is_conflicted() const
+{
+ return t.conflicted;
+}
+
+}
diff --git a/src/crimson/os/seastore/transaction_interruptor.h b/src/crimson/os/seastore/transaction_interruptor.h
new file mode 100644
index 00000000000..d0522c23c19
--- /dev/null
+++ b/src/crimson/os/seastore/transaction_interruptor.h
@@ -0,0 +1,77 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#pragma once
+
+#include <exception>
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+#include "crimson/common/errorator.h"
+#include "crimson/common/interruptible_future.h"
+
+namespace crimson::os::seastore {
+
+class Transaction;
+
+struct TransactionConflictCondition {
+ class transaction_conflict final : public std::exception {
+ public:
+ const char* what() const noexcept final {
+ return "transaction conflict detected";
+ }
+ };
+
+public:
+ TransactionConflictCondition(Transaction &t) : t(t) {}
+
+ template <typename Fut>
+ std::optional<Fut> may_interrupt() {
+ if (is_conflicted()) {
+ return seastar::futurize<Fut>::make_exception_future(
+ transaction_conflict());
+ } else {
+ return std::optional<Fut>();
+ }
+ }
+
+ template <typename T>
+ static constexpr bool is_interruption_v =
+ std::is_same_v<T, transaction_conflict>;
+
+
+ static bool is_interruption(std::exception_ptr& eptr) {
+ return *eptr.__cxa_exception_type() == typeid(transaction_conflict);
+ }
+
+private:
+ bool is_conflicted() const;
+
+ Transaction &t;
+};
+
+using trans_intr = crimson::interruptible::interruptor<
+ TransactionConflictCondition
+ >;
+
+template <typename E>
+using trans_iertr =
+ crimson::interruptible::interruptible_errorator<
+ TransactionConflictCondition,
+ E
+ >;
+
+template <typename F, typename... Args>
+auto with_trans_intr(Transaction &t, F &&f, Args&&... args) {
+ return trans_intr::with_interruption_to_error<crimson::ct_error::eagain>(
+ std::move(f),
+ TransactionConflictCondition(t),
+ t,
+ std::forward<Args>(args)...);
+}
+
+template <typename T>
+using with_trans_ertr = typename T::base_ertr::template extend<crimson::ct_error::eagain>;
+
+} // namespace crimson::os::seastore
diff --git a/src/crimson/os/seastore/transaction_manager.cc b/src/crimson/os/seastore/transaction_manager.cc
index f4e3b0858f2..807d88b2cbc 100644
--- a/src/crimson/os/seastore/transaction_manager.cc
+++ b/src/crimson/os/seastore/transaction_manager.cc
@@ -66,6 +66,7 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
return with_transaction_intr(
Transaction::src_t::MUTATE,
"mkfs_tm",
+ CACHE_HINT_TOUCH,
[this, FNAME](auto& t)
{
cache->init();
@@ -74,6 +75,8 @@ TransactionManager::mkfs_ertr::future<> TransactionManager::mkfs()
return lba_manager->mkfs(t);
}).si_then([this, &t] {
return backref_manager->mkfs(t);
+ }).si_then([this, &t] {
+ return init_root_meta(t);
}).si_then([this, FNAME, &t] {
INFOT("submitting mkfs transaction", t);
return submit_transaction_direct(t);
@@ -129,6 +132,7 @@ TransactionManager::mount()
journal->get_trimmer().set_journal_head(start_seq);
return with_transaction_weak(
"mount",
+ CACHE_HINT_TOUCH,
[this](auto &t)
{
return cache->init_cached_extents(t, [this](auto &t, auto &e) {
@@ -219,7 +223,7 @@ TransactionManager::ref_ret TransactionManager::inc_ref(
TRACET("{}", t, offset);
return lba_manager->incref_extent(t, offset
).si_then([FNAME, offset, &t](auto result) {
- DEBUGT("extent refcount is incremented to {} -- {}~{}, {}",
+ DEBUGT("extent refcount is incremented to {} -- {}~0x{:x}, {}",
t, result.refcount, offset, result.length, result.addr);
return result.refcount;
});
@@ -459,8 +463,12 @@ TransactionManager::do_submit_transaction(
}
SUBTRACET(seastore_t, "submitting record", tref);
- return journal->submit_record(std::move(record), tref.get_handle()
- ).safe_then([this, FNAME, &tref](auto submit_result) mutable {
+ return journal->submit_record(
+ std::move(record),
+ tref.get_handle(),
+ tref.get_src(),
+ [this, FNAME, &tref](record_locator_t submit_result)
+ {
SUBDEBUGT(seastore_t, "committed with {}", tref, submit_result);
auto start_seq = submit_result.write_result.start_seq;
journal->get_trimmer().set_journal_head(start_seq);
@@ -471,10 +479,8 @@ TransactionManager::do_submit_transaction(
journal->get_trimmer().update_journal_tails(
cache->get_oldest_dirty_from().value_or(start_seq),
cache->get_oldest_backref_dirty_from().value_or(start_seq));
- return journal->finish_commit(tref.get_src()
- ).then([&tref] {
- return tref.get_handle().complete();
- });
+ }).safe_then([&tref] {
+ return tref.get_handle().complete();
}).handle_error(
submit_transaction_iertr::pass_further{},
crimson::ct_error::assert_all{"Hit error submitting to journal"}
@@ -506,7 +512,7 @@ TransactionManager::get_next_dirty_extents(
size_t max_bytes)
{
LOG_PREFIX(TransactionManager::get_next_dirty_extents);
- DEBUGT("max_bytes={}B, seq={}", t, max_bytes, seq);
+ DEBUGT("max_bytes=0x{:x}B, seq={}", t, max_bytes, seq);
return cache->get_next_dirty_extents(t, seq, max_bytes);
}
@@ -521,101 +527,111 @@ TransactionManager::rewrite_logical_extent(
ceph_abort();
}
- auto lextent = extent->cast<LogicalCachedExtent>();
- cache->retire_extent(t, extent);
- if (get_extent_category(lextent->get_type()) == data_category_t::METADATA) {
- auto nlextent = cache->alloc_new_extent_by_type(
+ if (get_extent_category(extent->get_type()) == data_category_t::METADATA) {
+ assert(extent->is_fully_loaded());
+ cache->retire_extent(t, extent);
+ auto nextent = cache->alloc_new_extent_by_type(
t,
- lextent->get_type(),
- lextent->get_length(),
- lextent->get_user_hint(),
+ extent->get_type(),
+ extent->get_length(),
+ extent->get_user_hint(),
// get target rewrite generation
- lextent->get_rewrite_generation())->cast<LogicalCachedExtent>();
- nlextent->rewrite(t, *lextent, 0);
+ extent->get_rewrite_generation())->cast<LogicalCachedExtent>();
+ nextent->rewrite(t, *extent, 0);
- DEBUGT("rewriting meta -- {} to {}", t, *lextent, *nlextent);
+ DEBUGT("rewriting meta -- {} to {}", t, *extent, *nextent);
#ifndef NDEBUG
- if (get_checksum_needed(lextent->get_paddr())) {
- assert(lextent->get_last_committed_crc() == lextent->calc_crc32c());
+ if (get_checksum_needed(extent->get_paddr())) {
+ assert(extent->get_last_committed_crc() == extent->calc_crc32c());
} else {
- assert(lextent->get_last_committed_crc() == CRC_NULL);
+ assert(extent->get_last_committed_crc() == CRC_NULL);
}
#endif
- nlextent->set_last_committed_crc(lextent->get_last_committed_crc());
+ nextent->set_last_committed_crc(extent->get_last_committed_crc());
/* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
* extents since we're going to do it again once we either do the ool write
* or allocate a relative inline addr. TODO: refactor AsyncCleaner to
* avoid this complication. */
return lba_manager->update_mapping(
t,
- lextent->get_laddr(),
- lextent->get_length(),
- lextent->get_paddr(),
- nlextent->get_length(),
- nlextent->get_paddr(),
- nlextent->get_last_committed_crc(),
- nlextent.get()).discard_result();
+ extent->get_laddr(),
+ extent->get_length(),
+ extent->get_paddr(),
+ nextent->get_length(),
+ nextent->get_paddr(),
+ nextent->get_last_committed_crc(),
+ nextent.get()
+ ).discard_result();
} else {
- assert(get_extent_category(lextent->get_type()) == data_category_t::DATA);
- auto extents = cache->alloc_new_data_extents_by_type(
- t,
- lextent->get_type(),
- lextent->get_length(),
- lextent->get_user_hint(),
- // get target rewrite generation
- lextent->get_rewrite_generation());
- return seastar::do_with(
- std::move(extents),
- 0,
- lextent->get_length(),
- extent_ref_count_t(0),
- [this, FNAME, lextent, &t]
- (auto &extents, auto &off, auto &left, auto &refcount) {
- return trans_intr::do_for_each(
- extents,
- [lextent, this, FNAME, &t, &off, &left, &refcount](auto &nextent) {
- bool first_extent = (off == 0);
- ceph_assert(left >= nextent->get_length());
- auto nlextent = nextent->template cast<LogicalCachedExtent>();
- nlextent->rewrite(t, *lextent, off);
- DEBUGT("rewriting data -- {} to {}", t, *lextent, *nlextent);
-
- /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
- * extents since we're going to do it again once we either do the ool write
- * or allocate a relative inline addr. TODO: refactor AsyncCleaner to
- * avoid this complication. */
- auto fut = base_iertr::now();
- if (first_extent) {
- fut = lba_manager->update_mapping(
- t,
- (lextent->get_laddr() + off).checked_to_laddr(),
- lextent->get_length(),
- lextent->get_paddr(),
- nlextent->get_length(),
- nlextent->get_paddr(),
- nlextent->get_last_committed_crc(),
- nlextent.get()
- ).si_then([&refcount](auto c) {
- refcount = c;
- });
- } else {
- ceph_assert(refcount != 0);
- fut = lba_manager->alloc_extent(
- t,
- (lextent->get_laddr() + off).checked_to_laddr(),
- *nlextent,
- refcount
- ).si_then([lextent, nlextent, off](auto mapping) {
- ceph_assert(mapping->get_key() == lextent->get_laddr() + off);
- ceph_assert(mapping->get_val() == nlextent->get_paddr());
+ assert(get_extent_category(extent->get_type()) == data_category_t::DATA);
+ auto length = extent->get_length();
+ return cache->read_extent_maybe_partial(
+ t, std::move(extent), 0, length
+ ).si_then([this, FNAME, &t](auto extent) {
+ assert(extent->is_fully_loaded());
+ cache->retire_extent(t, extent);
+ auto extents = cache->alloc_new_data_extents_by_type(
+ t,
+ extent->get_type(),
+ extent->get_length(),
+ extent->get_user_hint(),
+ // get target rewrite generation
+ extent->get_rewrite_generation());
+ return seastar::do_with(
+ std::move(extents),
+ 0,
+ extent->get_length(),
+ extent_ref_count_t(0),
+ [this, FNAME, extent, &t]
+ (auto &extents, auto &off, auto &left, auto &refcount)
+ {
+ return trans_intr::do_for_each(
+ extents,
+ [extent, this, FNAME, &t, &off, &left, &refcount](auto &_nextent)
+ {
+ auto nextent = _nextent->template cast<LogicalCachedExtent>();
+ bool first_extent = (off == 0);
+ ceph_assert(left >= nextent->get_length());
+ nextent->rewrite(t, *extent, off);
+ DEBUGT("rewriting data -- {} to {}", t, *extent, *nextent);
+
+ /* This update_mapping is, strictly speaking, unnecessary for delayed_alloc
+ * extents since we're going to do it again once we either do the ool write
+ * or allocate a relative inline addr. TODO: refactor AsyncCleaner to
+ * avoid this complication. */
+ auto fut = base_iertr::now();
+ if (first_extent) {
+ fut = lba_manager->update_mapping(
+ t,
+ (extent->get_laddr() + off).checked_to_laddr(),
+ extent->get_length(),
+ extent->get_paddr(),
+ nextent->get_length(),
+ nextent->get_paddr(),
+ nextent->get_last_committed_crc(),
+ nextent.get()
+ ).si_then([&refcount](auto c) {
+ refcount = c;
+ });
+ } else {
+ ceph_assert(refcount != 0);
+ fut = lba_manager->alloc_extent(
+ t,
+ (extent->get_laddr() + off).checked_to_laddr(),
+ *nextent,
+ refcount
+ ).si_then([extent, nextent, off](auto mapping) {
+ ceph_assert(mapping->get_key() == extent->get_laddr() + off);
+ ceph_assert(mapping->get_val() == nextent->get_paddr());
+ return seastar::now();
+ });
+ }
+ return fut.si_then([&off, &left, nextent] {
+ off += nextent->get_length();
+ left -= nextent->get_length();
return seastar::now();
});
- }
- return fut.si_then([&off, &left, nlextent] {
- off += nlextent->get_length();
- left -= nlextent->get_length();
- return seastar::now();
});
});
});
@@ -714,7 +730,7 @@ TransactionManager::get_extents_if_live(
ceph_assert(paddr.get_addr_type() == paddr_types_t::SEGMENT);
return cache->get_extent_if_cached(t, paddr, type
- ).si_then([=, this, &t](auto extent)
+ ).si_then([this, FNAME, type, paddr, laddr, len, &t](auto extent)
-> get_extents_if_live_ret {
if (extent && extent->get_length() == len) {
DEBUGT("{} {}~0x{:x} {} is cached and alive -- {}",
@@ -731,19 +747,24 @@ TransactionManager::get_extents_if_live(
t,
laddr,
len
- ).si_then([=, this, &t](lba_pin_list_t pin_list) {
+ ).si_then([this, FNAME, type, paddr, laddr, len, &t](lba_pin_list_t pin_list) {
return seastar::do_with(
std::list<CachedExtentRef>(),
- [=, this, &t, pin_list=std::move(pin_list)](
- std::list<CachedExtentRef> &list) mutable
+ std::move(pin_list),
+ [this, FNAME, type, paddr, laddr, len, &t]
+ (std::list<CachedExtentRef> &extent_list, auto& pin_list)
{
auto paddr_seg_id = paddr.as_seg_paddr().get_segment_id();
return trans_intr::parallel_for_each(
pin_list,
- [=, this, &list, &t](
- LBAMappingRef &pin) -> Cache::get_extent_iertr::future<>
+ [this, FNAME, type, paddr_seg_id, &extent_list, &t](
+ LBAMappingRef& pin) -> Cache::get_extent_iertr::future<>
{
+ DEBUGT("got pin, try read in parallel ... -- {}", t, *pin);
auto pin_paddr = pin->get_val();
+ if (pin_paddr.get_addr_type() != paddr_types_t::SEGMENT) {
+ return seastar::now();
+ }
auto &pin_seg_paddr = pin_paddr.as_seg_paddr();
auto pin_paddr_seg_id = pin_seg_paddr.get_segment_id();
// auto pin_len = pin->get_length();
@@ -767,16 +788,16 @@ TransactionManager::get_extents_if_live(
// ceph_assert(pin_seg_paddr >= paddr &&
// pin_seg_paddr.add_offset(pin_len) <= paddr.add_offset(len));
return read_pin_by_type(t, std::move(pin), type
- ).si_then([&list](auto ret) {
- list.emplace_back(std::move(ret));
+ ).si_then([&extent_list](auto ret) {
+ extent_list.emplace_back(std::move(ret));
return seastar::now();
});
- }).si_then([&list, &t, FNAME, type, laddr, len, paddr] {
+ }).si_then([&extent_list, &t, FNAME, type, laddr, len, paddr] {
DEBUGT("{} {}~0x{:x} {} is alive as {} extents",
- t, type, laddr, len, paddr, list.size());
+ t, type, laddr, len, paddr, extent_list.size());
return get_extents_if_live_ret(
interruptible::ready_future_marker{},
- std::move(list));
+ std::move(extent_list));
});
});
}).handle_error_interruptible(crimson::ct_error::enoent::handle([] {
diff --git a/src/crimson/os/seastore/transaction_manager.h b/src/crimson/os/seastore/transaction_manager.h
index c7a94a9ef11..e574460894a 100644
--- a/src/crimson/os/seastore/transaction_manager.h
+++ b/src/crimson/os/seastore/transaction_manager.h
@@ -23,6 +23,7 @@
#include "crimson/os/seastore/logging.h"
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/cache.h"
+#include "crimson/os/seastore/root_meta.h"
#include "crimson/os/seastore/lba_manager.h"
#include "crimson/os/seastore/backref_manager.h"
#include "crimson/os/seastore/journal.h"
@@ -136,14 +137,66 @@ public:
}
/**
+ * maybe_indirect_extent_t
+ *
+ * Contains necessary information in case the extent is loaded from an
+ * indirect pin.
+ */
+ struct indirect_info_t { // describes the window an indirect pin exposes inside the extent it was loaded from
+ extent_len_t intermediate_offset = 0; // added to caller offsets before indexing into the loaded extent
+ extent_len_t length = 0; // length of the indirect window; caller ranges are asserted to fit within it
+ };
+ template <typename T>
+ struct maybe_indirect_extent_t { // an extent reference plus the optional indirect window it was read through
+ TCachedExtentRef<T> extent; // the extent whose buffer actually holds the data
+ std::optional<indirect_info_t> maybe_indirect_info; // engaged only when the extent was reached via an indirect pin
+ bool is_clone = false; // read_pin fills this from pin->is_clone()
+
+ bool is_indirect() const {
+ return maybe_indirect_info.has_value();
+ }
+
+ ceph::bufferlist get_bl() const { // returns the whole mapped range as a bufferlist
+ if (is_indirect()) {
+ return do_get_indirect_range(0, maybe_indirect_info->length); // indirect: only the window, not the whole extent
+ } else {
+ assert(extent->is_fully_loaded()); // direct: the entire buffer is appended, so it must be present
+ bufferlist bl;
+ bl.append(extent->get_bptr());
+ return bl;
+ }
+ }
+
+ ceph::bufferlist get_range( // for an indirect mapping, offset is relative to the indirect window
+ extent_len_t offset, extent_len_t length) const {
+ if (is_indirect()) {
+ return do_get_indirect_range(offset, length);
+ } else {
+ return extent->get_range(offset, length); // direct: offset is passed to the extent unchanged
+ }
+ }
+ private:
+ ceph::bufferlist do_get_indirect_range( // translates a window-relative range into an extent-relative one
+ extent_len_t offset, extent_len_t length) const {
+ assert(is_indirect());
+ assert(maybe_indirect_info->intermediate_offset + offset + length <=
+ extent->get_length()); // translated range must stay inside the extent
+ assert(offset + length <= maybe_indirect_info->length); // and inside the indirect window
+ return extent->get_range(
+ maybe_indirect_info->intermediate_offset + offset,
+ length);
+ }
+ };
+
+ /**
* read_extent
*
* Read extent of type T at offset~length
*/
using read_extent_iertr = get_pin_iertr;
template <typename T>
- using read_extent_ret = read_extent_iertr::future<
- TCachedExtentRef<T>>;
+ using read_extent_ret =
+ read_extent_iertr::future<maybe_indirect_extent_t<T>>;
template <typename T>
read_extent_ret<T> read_extent(
Transaction &t,
@@ -191,12 +244,30 @@ public:
}
template <typename T>
- base_iertr::future<TCachedExtentRef<T>> read_pin(
+ base_iertr::future<maybe_indirect_extent_t<T>> read_pin(
Transaction &t,
- LBAMappingRef pin)
+ LBAMappingRef pin,
+ extent_len_t partial_off,
+ extent_len_t partial_len)
{
+ static_assert(is_logical_type(T::TYPE));
+ assert(is_aligned(partial_off, get_block_size()));
+ assert(is_aligned(partial_len, get_block_size()));
+
+ extent_len_t direct_partial_off = partial_off;
+ bool is_clone = pin->is_clone();
+ std::optional<indirect_info_t> maybe_indirect_info;
+ if (pin->is_indirect()) {
+ auto intermediate_offset = pin->get_intermediate_offset();
+ direct_partial_off = intermediate_offset + partial_off;
+ maybe_indirect_info = indirect_info_t{
+ intermediate_offset, pin->get_length()};
+ }
+
LOG_PREFIX(TransactionManager::read_pin);
- SUBDEBUGT(seastore_tm, "{} {} ...", t, T::TYPE, *pin);
+ SUBDEBUGT(seastore_tm, "{} {} 0x{:x}~0x{:x} direct_off=0x{:x} ...",
+ t, T::TYPE, *pin, partial_off, partial_len, direct_partial_off);
+
auto fut = base_iertr::make_ready_future<LBAMappingRef>();
if (!pin->is_parent_viewable()) {
if (pin->is_parent_valid()) {
@@ -213,21 +284,42 @@ public:
pin->maybe_fix_pos();
fut = base_iertr::make_ready_future<LBAMappingRef>(std::move(pin));
}
- return fut.si_then([&t, this](auto npin) mutable {
+ return fut.si_then([&t, this, direct_partial_off, partial_len](auto npin) {
// checking the lba child must be atomic with creating
// and linking the absent child
auto ret = get_extent_if_linked<T>(t, std::move(npin));
if (ret.index() == 1) {
- return std::move(std::get<1>(ret));
+ return std::get<1>(ret
+ ).si_then([direct_partial_off, partial_len, this, &t](auto extent) {
+ return cache->read_extent_maybe_partial(
+ t, std::move(extent), direct_partial_off, partial_len);
+ });
} else {
- return this->pin_to_extent<T>(t, std::move(std::get<0>(ret)));
+ return this->pin_to_extent<T>(
+ t, std::move(std::get<0>(ret)), direct_partial_off, partial_len);
}
- }).si_then([FNAME, &t](TCachedExtentRef<T> ext) {
- SUBDEBUGT(seastore_tm, "got {}", t, *ext);
- return ext;
+ }).si_then([FNAME, maybe_indirect_info, is_clone, &t](TCachedExtentRef<T> ext) {
+ if (maybe_indirect_info.has_value()) {
+ SUBDEBUGT(seastore_tm, "got indirect +0x{:x}~0x{:x} is_clone={} {}",
+ t, maybe_indirect_info->intermediate_offset,
+ maybe_indirect_info->length, is_clone, *ext);
+ } else {
+ SUBDEBUGT(seastore_tm, "got direct is_clone={} {}",
+ t, is_clone, *ext);
+ }
+ return maybe_indirect_extent_t<T>{ext, maybe_indirect_info, is_clone};
});
}
+ template <typename T>
+ base_iertr::future<maybe_indirect_extent_t<T>> read_pin( // convenience overload: reads the range 0~pin length
+ Transaction &t,
+ LBAMappingRef pin)
+ {
+ auto& pin_ref = *pin; // refer to the mapping object itself so its length can be read in the same call that moves pin
+ return read_pin<T>(t, std::move(pin), 0, pin_ref.get_length()); // partial_off=0, partial_len=pin length
+ }
+
/// Obtain mutable copy of extent
LogicalCachedExtentRef get_mutable_extent(Transaction &t, LogicalCachedExtentRef ref) {
LOG_PREFIX(TransactionManager::get_mutable_extent);
@@ -303,10 +395,6 @@ public:
len,
placement_hint,
INIT_GENERATION);
- if (!ext) {
- SUBERRORT(seastore_tm, "insufficient space!", t);
- return crimson::ct_error::enospc::make();
- }
return lba_manager->alloc_extent(
t,
laddr_hint,
@@ -342,10 +430,6 @@ public:
len,
placement_hint,
INIT_GENERATION);
- if (exts.empty()) {
- SUBERRORT(seastore_tm, "insufficient space!", t);
- return crimson::ct_error::enospc::make();
- }
return lba_manager->alloc_extents(
t,
laddr_hint,
@@ -362,7 +446,8 @@ public:
}
template <typename T>
- read_extent_ret<T> get_mutable_extent_by_laddr(
+ get_pin_iertr::future<TCachedExtentRef<T>>
+ get_mutable_extent_by_laddr(
Transaction &t,
laddr_t laddr,
extent_len_t len) {
@@ -374,8 +459,11 @@ public:
ceph_assert(!pin->is_clone());
ceph_assert(pin->get_length() == len);
return this->read_pin<T>(t, std::move(pin));
- }).si_then([this, &t, FNAME](auto extent) {
- auto ext = get_mutable_extent(t, extent)->template cast<T>();
+ }).si_then([this, &t, FNAME](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ auto ext = get_mutable_extent(
+ t, maybe_indirect_extent.extent)->template cast<T>();
SUBDEBUGT(seastore_tm, "got mutable {}", t, *ext);
return read_extent_iertr::make_ready_future<TCachedExtentRef<T>>(
std::move(ext));
@@ -438,6 +526,7 @@ public:
// The according extent might be stable or pending.
auto fut = base_iertr::now();
if (!pin->is_indirect()) {
+ ceph_assert(!pin->is_clone());
if (!pin->is_parent_viewable()) {
if (pin->is_parent_valid()) {
pin = pin->refresh_with_pending_parent();
@@ -458,7 +547,12 @@ public:
fut = fut.si_then([this, &t, &pin] {
if (full_extent_integrity_check) {
- return read_pin<T>(t, pin->duplicate());
+ return read_pin<T>(t, pin->duplicate()
+ ).si_then([](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ return maybe_indirect_extent.extent;
+ });
} else {
auto ret = get_extent_if_linked<T>(t, pin->duplicate());
if (ret.index() == 1) {
@@ -475,6 +569,7 @@ public:
? (ext && ext->is_fully_loaded())
: true);
std::optional<ceph::bufferptr> original_bptr;
+ // TODO: preserve the bufferspace if partially loaded
if (ext && ext->is_fully_loaded()) {
ceph_assert(!ext->is_mutable());
ceph_assert(ext->get_length() >= original_len);
@@ -646,8 +741,9 @@ public:
TransactionRef create_transaction(
Transaction::src_t src,
const char* name,
+ cache_hint_t cache_hint = CACHE_HINT_TOUCH, // NOTE(review): inserted before is_weak, so callers passing is_weak positionally must be updated — confirm
bool is_weak=false) final {
- return cache->create_transaction(src, name, is_weak);
+ return cache->create_transaction(src, name, cache_hint, is_weak); // pure forwarder: all four arguments go to the cache unchanged
}
using ExtentCallbackInterface::submit_transaction_direct_ret;
@@ -690,9 +786,14 @@ public:
const std::string &key) {
return cache->get_root(
t
- ).si_then([&key, &t](auto root) {
+ ).si_then([&t, this](auto root) {
+ return read_extent<RootMetaBlock>(t, root->root.meta);
+ }).si_then([key, &t](auto maybe_indirect_extent) {
LOG_PREFIX(TransactionManager::read_root_meta);
- auto meta = root->root.get_meta();
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ auto& mblock = maybe_indirect_extent.extent;
+ auto meta = mblock->get_meta();
auto iter = meta.find(key);
if (iter == meta.end()) {
SUBDEBUGT(seastore_tm, "{} -> nullopt", t, key);
@@ -701,7 +802,35 @@ public:
SUBDEBUGT(seastore_tm, "{} -> {}", t, key, iter->second);
return seastar::make_ready_future<read_root_meta_bare>(iter->second);
}
- });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error!"}
+ );
+ }
+
+ /**
+ * init_root_meta
+ *
+ * create the root meta block
+ */
+ using init_root_meta_iertr = base_iertr;
+ using init_root_meta_ret = init_root_meta_iertr::future<>;
+ init_root_meta_ret init_root_meta(Transaction &t) {
+ return alloc_non_data_extent<RootMetaBlock>( // allocate a fresh RootMetaBlock of RootMetaBlock::SIZE bytes
+ t, L_ADDR_MIN, RootMetaBlock::SIZE
+ ).si_then([this, &t](auto meta) {
+ meta->set_meta(RootMetaBlock::meta_t{}); // start from a default-constructed meta_t
+ return cache->get_root(t
+ ).si_then([this, &t, meta](auto root) {
+ auto mroot = cache->duplicate_for_write( // the root is modified through a writable duplicate in this transaction
+ t, root)->template cast<RootBlock>();
+ mroot->root.meta = meta->get_laddr(); // record where the meta block lives so it can be found via the root
+ return seastar::now();
+ });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{}, // only input_output_error is passed on to the caller
+ crimson::ct_error::assert_all{"unexpected error!"} // every other error type hits the assert_all handler
+ );
}
/**
@@ -719,15 +848,24 @@ public:
SUBDEBUGT(seastore_tm, "seastore_tm, {} -> {} ...", t, key, value);
return cache->get_root(
t
- ).si_then([this, &t, &key, &value](RootBlockRef root) {
- root = cache->duplicate_for_write(t, root)->cast<RootBlock>();
-
- auto meta = root->root.get_meta();
+ ).si_then([this, &t](RootBlockRef root) {
+ return read_extent<RootMetaBlock>(t, root->root.meta);
+ }).si_then([this, key, value, &t](auto maybe_indirect_extent) {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ auto& mblock = maybe_indirect_extent.extent;
+ mblock = get_mutable_extent(t, mblock
+ )->template cast<RootMetaBlock>();
+
+ auto meta = mblock->get_meta();
meta[key] = value;
- root->root.set_meta(meta);
+ mblock->set_meta(meta);
return seastar::now();
- });
+ }).handle_error_interruptible(
+ crimson::ct_error::input_output_error::pass_further{},
+ crimson::ct_error::assert_all{"unexpected error!"}
+ );
}
/**
@@ -817,7 +955,7 @@ private:
shard_stats_t& shard_stats;
template <typename T>
- std::variant<LBAMappingRef, base_iertr::future<TCachedExtentRef<T>>>
+ std::variant<LBAMappingRef, get_child_ifut<T>>
get_extent_if_linked(
Transaction &t,
LBAMappingRef pin)
@@ -827,7 +965,8 @@ private:
// and linking the absent child
auto v = pin->get_logical_extent(t);
if (v.has_child()) {
- return v.get_child_fut().safe_then([pin=std::move(pin)](auto extent) {
+ return v.get_child_fut(
+ ).si_then([pin=std::move(pin)](auto extent) {
#ifndef NDEBUG
auto lextent = extent->template cast<LogicalCachedExtent>();
auto pin_laddr = pin->get_key();
@@ -849,11 +988,17 @@ private:
extent_types_t type)
{
ceph_assert(!pin->parent_modified());
+ assert(!pin->is_indirect());
+ // Note: pin might be a clone
auto v = pin->get_logical_extent(t);
// checking the lba child must be atomic with creating
// and linking the absent child
if (v.has_child()) {
- return std::move(v.get_child_fut());
+ return std::move(v.get_child_fut()
+ ).si_then([type](auto ext) {
+ ceph_assert(ext->get_type() == type);
+ return ext;
+ });
} else {
return pin_to_extent_by_type(t, std::move(pin), type);
}
@@ -877,6 +1022,7 @@ private:
* pin_to_extent
*
* Get extent mapped at pin.
+ * partially load buffer from direct_partial_off~partial_len if not present.
*/
using pin_to_extent_iertr = base_iertr;
template <typename T>
@@ -885,18 +1031,28 @@ private:
template <typename T>
pin_to_extent_ret<T> pin_to_extent(
Transaction &t,
- LBAMappingRef pin) {
- LOG_PREFIX(TransactionManager::pin_to_extent);
- SUBTRACET(seastore_tm, "getting absent extent from pin {} ...", t, *pin);
+ LBAMappingRef pin,
+ extent_len_t direct_partial_off,
+ extent_len_t partial_len) {
static_assert(is_logical_type(T::TYPE));
using ret = pin_to_extent_ret<T>;
auto &pref = *pin;
+ auto direct_length = pref.is_indirect() ?
+ pref.get_intermediate_length() :
+ pref.get_length();
+ if (full_extent_integrity_check) {
+ direct_partial_off = 0;
+ partial_len = direct_length;
+ }
+ LOG_PREFIX(TransactionManager::pin_to_extent);
+ SUBTRACET(seastore_tm, "getting absent extent from pin {}, 0x{:x}~0x{:x} ...",
+ t, *pin, direct_partial_off, partial_len);
return cache->get_absent_extent<T>(
t,
pref.get_val(),
- pref.is_indirect() ?
- pref.get_intermediate_length() :
- pref.get_length(),
+ direct_length,
+ direct_partial_off,
+ partial_len,
[&pref]
(T &extent) mutable {
assert(!extent.has_laddr());
@@ -907,30 +1063,33 @@ private:
extent.maybe_set_intermediate_laddr(pref);
}
).si_then([FNAME, &t, pin=std::move(pin), this](auto ref) mutable -> ret {
- auto crc = ref->calc_crc32c();
- SUBTRACET(
- seastore_tm,
- "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}",
- t,
- *ref,
- pin->get_checksum(),
- crc);
- assert(ref->is_fully_loaded());
- bool inconsistent = false;
- if (full_extent_integrity_check) {
- inconsistent = (pin->get_checksum() != crc);
- } else { // !full_extent_integrity_check: remapped extent may be skipped
- inconsistent = !(pin->get_checksum() == 0 ||
- pin->get_checksum() == crc);
- }
- if (unlikely(inconsistent)) {
- SUBERRORT(seastore_tm,
- "extent checksum inconsistent, recorded: {}, actual: {}, {}",
+ if (ref->is_fully_loaded()) {
+ auto crc = ref->calc_crc32c();
+ SUBTRACET(
+ seastore_tm,
+ "got extent -- {}, chksum in the lba tree: 0x{:x}, actual chksum: 0x{:x}",
t,
+ *ref,
pin->get_checksum(),
- crc,
- *ref);
- ceph_abort();
+ crc);
+ bool inconsistent = false;
+ if (full_extent_integrity_check) {
+ inconsistent = (pin->get_checksum() != crc);
+ } else { // !full_extent_integrity_check: remapped extent may be skipped
+ inconsistent = !(pin->get_checksum() == 0 ||
+ pin->get_checksum() == crc);
+ }
+ if (unlikely(inconsistent)) {
+ SUBERRORT(seastore_tm,
+ "extent checksum inconsistent, recorded: 0x{:x}, actual: 0x{:x}, {}",
+ t,
+ pin->get_checksum(),
+ crc,
+ *ref);
+ ceph_abort();
+ }
+ } else {
+ assert(!full_extent_integrity_check);
}
return pin_to_extent_ret<T>(
interruptible::ready_future_marker{},
@@ -955,14 +1114,21 @@ private:
t, *pin, type);
assert(is_logical_type(type));
auto &pref = *pin;
+ laddr_t direct_key;
+ extent_len_t direct_length;
+ if (pref.is_indirect()) {
+ direct_key = pref.get_intermediate_base();
+ direct_length = pref.get_intermediate_length();
+ } else {
+ direct_key = pref.get_key();
+ direct_length = pref.get_length();
+ }
return cache->get_absent_extent_by_type(
t,
type,
pref.get_val(),
- pref.get_key(),
- pref.is_indirect() ?
- pref.get_intermediate_length() :
- pref.get_length(),
+ direct_key,
+ direct_length,
[&pref](CachedExtent &extent) mutable {
auto &lextent = static_cast<LogicalCachedExtent&>(extent);
assert(!lextent.has_laddr());
@@ -977,7 +1143,7 @@ private:
auto crc = ref->calc_crc32c();
SUBTRACET(
seastore_tm,
- "got extent -- {}, chksum in the lba tree: {}, actual chksum: {}",
+ "got extent -- {}, chksum in the lba tree: 0x{:x}, actual chksum: 0x{:x}",
t,
*ref,
pin->get_checksum(),
@@ -992,7 +1158,7 @@ private:
}
if (unlikely(inconsistent)) {
SUBERRORT(seastore_tm,
- "extent checksum inconsistent, recorded: {}, actual: {}, {}",
+ "extent checksum inconsistent, recorded: 0x{:x}, actual: 0x{:x}, {}",
t,
pin->get_checksum(),
crc,
diff --git a/src/crimson/osd/backfill_facades.h b/src/crimson/osd/backfill_facades.h
index 522a93a1ddc..ce649303d4f 100644
--- a/src/crimson/osd/backfill_facades.h
+++ b/src/crimson/osd/backfill_facades.h
@@ -36,6 +36,10 @@ struct PeeringFacade final : BackfillState::PeeringFacade {
return peering_state.get_info().log_tail;
}
+ const PGLog& get_pg_log() const override { // read-only access to the PG log held by peering_state
+ return peering_state.get_pg_log();
+ }
+
void scan_log_after(eversion_t v, scan_log_func_t f) const override {
peering_state.get_pg_log().get_log().scan_log_after(v, std::move(f));
}
@@ -73,7 +77,14 @@ struct PGFacade final : BackfillState::PGFacade {
return pg.projected_last_update;
}
+ const PGLog::IndexedLog& get_projected_log() const override { // read-only access to the wrapped PG's projected_log member
+ return pg.projected_log;
+ }
+
PGFacade(PG& pg) : pg(pg) {}
+ std::ostream &print(std::ostream &out) const override { // streams the wrapped PG via its operator<< and returns the stream
+ return out << pg;
+ }
};
} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.cc b/src/crimson/osd/backfill_state.cc
index 018e58b68f8..f957f072c93 100644
--- a/src/crimson/osd/backfill_state.cc
+++ b/src/crimson/osd/backfill_state.cc
@@ -8,11 +8,7 @@
#include "crimson/osd/backfill_state.h"
#include "osd/osd_types_fmt.h"
-namespace {
- seastar::logger& logger() {
- return crimson::get_logger(ceph_subsys_osd);
- }
-}
+SET_SUBSYS(osd);
namespace crimson::osd {
@@ -27,22 +23,23 @@ BackfillState::BackfillState(
progress_tracker(
std::make_unique<BackfillState::ProgressTracker>(backfill_machine))
{
- logger().debug("{}:{}", __func__, __LINE__);
+ LOG_PREFIX(BackfillState::BackfillState);
+ DEBUGDPP("", *backfill_machine.pg);
backfill_machine.initiate();
}
template <class S>
BackfillState::StateHelper<S>::StateHelper()
{
- logger().debug("enter {}",
- boost::typeindex::type_id<S>().pretty_name());
+ LOG_PREFIX(BackfillState::StateHelper);
+ DEBUGDPP("enter {}", pg(), boost::typeindex::type_id<S>().pretty_name()); // logs the pretty-printed name of state type S, with pg() as the log prefix object
}
template <class S>
BackfillState::StateHelper<S>::~StateHelper()
{
- logger().debug("exit {}",
- boost::typeindex::type_id<S>().pretty_name());
+ LOG_PREFIX(BackfillState::StateHelper);
+ DEBUG("exit {}", boost::typeindex::type_id<S>().pretty_name()); // NOTE(review): unlike the constructor this logs without pg() — presumably deliberate in a destructor; confirm
}
BackfillState::~BackfillState() = default;
@@ -63,13 +60,16 @@ BackfillState::BackfillMachine::~BackfillMachine() = default;
BackfillState::Initial::Initial(my_context ctx)
: my_base(ctx)
{
+ LOG_PREFIX(BackfillState::Initial::Initial);
backfill_state().last_backfill_started = peering_state().earliest_backfill();
- logger().debug("{}: bft={} from {}",
- __func__, peering_state().get_backfill_targets(),
- backfill_state().last_backfill_started);
+ DEBUGDPP("{}: bft={} from {}",
+ pg(),
+ __func__,
+ peering_state().get_backfill_targets(),
+ backfill_state().last_backfill_started);
for (const auto& bt : peering_state().get_backfill_targets()) {
- logger().debug("{}: target shard {} from {}",
- __func__, bt, peering_state().get_peer_last_backfill(bt));
+ DEBUGDPP("{}: target shard {} from {}",
+ pg(), __func__, bt, peering_state().get_peer_last_backfill(bt));
}
ceph_assert(peering_state().get_backfill_targets().size());
ceph_assert(!backfill_state().last_backfill_started.is_max());
@@ -80,7 +80,8 @@ BackfillState::Initial::Initial(my_context ctx)
boost::statechart::result
BackfillState::Initial::react(const BackfillState::Triggered& evt)
{
- logger().debug("{}: backfill triggered", __func__);
+ LOG_PREFIX(BackfillState::Initial::react::Triggered);
+ DEBUGDPP("", pg());
ceph_assert(backfill_state().last_backfill_started == \
peering_state().earliest_backfill());
ceph_assert(peering_state().is_backfilling());
@@ -93,26 +94,10 @@ BackfillState::Initial::react(const BackfillState::Triggered& evt)
if (Enqueuing::all_enqueued(peering_state(),
backfill_state().backfill_info,
backfill_state().peer_backfill_info)) {
- logger().debug("{}: switching to Done state", __func__);
+ DEBUGDPP("switching to Done state", pg());
return transit<BackfillState::Done>();
} else {
- logger().debug("{}: switching to Enqueuing state", __func__);
- return transit<BackfillState::Enqueuing>();
- }
-}
-
-boost::statechart::result
-BackfillState::Cancelled::react(const BackfillState::Triggered& evt)
-{
- logger().debug("{}: backfill re-triggered", __func__);
- ceph_assert(peering_state().is_backfilling());
- if (Enqueuing::all_enqueued(peering_state(),
- backfill_state().backfill_info,
- backfill_state().peer_backfill_info)) {
- logger().debug("{}: switching to Done state", __func__);
- return transit<BackfillState::Done>();
- } else {
- logger().debug("{}: switching to Enqueuing state", __func__);
+ DEBUGDPP("switching to Enqueuing state", pg());
return transit<BackfillState::Enqueuing>();
}
}
@@ -120,12 +105,12 @@ BackfillState::Cancelled::react(const BackfillState::Triggered& evt)
// -- Enqueuing
void BackfillState::Enqueuing::maybe_update_range()
{
+ LOG_PREFIX(BackfillState::Enqueuing::maybe_update_range);
if (auto& primary_bi = backfill_state().backfill_info;
primary_bi.version >= pg().get_projected_last_update()) {
- logger().info("{}: bi is current", __func__);
+ INFODPP("bi is current", pg());
ceph_assert(primary_bi.version == pg().get_projected_last_update());
} else if (primary_bi.version >= peering_state().get_log_tail()) {
-#if 0
if (peering_state().get_pg_log().get_log().empty() &&
pg().get_projected_log().empty()) {
/* Because we don't move log_tail on split, the log might be
@@ -137,30 +122,32 @@ void BackfillState::Enqueuing::maybe_update_range()
ceph_assert(primary_bi.version == eversion_t());
return;
}
-#endif
- logger().debug("{}: bi is old, ({}) can be updated with log to {}",
- __func__,
- primary_bi.version,
- pg().get_projected_last_update());
- logger().debug("{}: scanning pg log first", __func__);
- peering_state().scan_log_after(primary_bi.version,
+ DEBUGDPP("{}: bi is old, ({}) can be updated with log to {}",
+ pg(),
+ primary_bi.version,
+ pg().get_projected_last_update());
+ auto func =
[&](const pg_log_entry_t& e) {
- logger().debug("maybe_update_range(lambda): updating from version {}",
- e.version);
+ DEBUGDPP("maybe_update_range(lambda): updating from version {}",
+ pg(), e.version);
if (e.soid >= primary_bi.begin && e.soid < primary_bi.end) {
if (e.is_update()) {
- logger().debug("maybe_update_range(lambda): {} updated to ver {}",
- e.soid, e.version);
+ DEBUGDPP("maybe_update_range(lambda): {} updated to ver {}",
+ pg(), e.soid, e.version);
primary_bi.objects.erase(e.soid);
primary_bi.objects.insert(std::make_pair(e.soid,
e.version));
} else if (e.is_delete()) {
- logger().debug("maybe_update_range(lambda): {} removed",
- e.soid);
+ DEBUGDPP("maybe_update_range(lambda): {} removed",
+ pg(), e.soid);
primary_bi.objects.erase(e.soid);
}
}
- });
+ };
+ DEBUGDPP("{}: scanning pg log first", pg());
+ peering_state().scan_log_after(primary_bi.version, func);
+ DEBUGDPP("{}: scanning projected log", pg());
+ pg().get_projected_log().scan_log_after(primary_bi.version, func);
primary_bi.version = pg().get_projected_last_update();
} else {
ceph_abort_msg(
@@ -243,6 +230,7 @@ void BackfillState::Enqueuing::trim_backfilled_object_from_intervals(
BackfillState::Enqueuing::result_t
BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
{
+ LOG_PREFIX(BackfillState::Enqueuing::remove_on_peers);
// set `new_last_backfill_started` to `check`
result_t result { {}, check };
for (const auto& bt : peering_state().get_backfill_targets()) {
@@ -254,8 +242,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
backfill_listener().enqueue_drop(bt, pbi.begin, version);
}
}
- logger().debug("{}: BACKFILL removing {} from peers {}",
- __func__, check, result.pbi_targets);
+ DEBUGDPP("BACKFILL removing {} from peers {}",
+ pg(), check, result.pbi_targets);
ceph_assert(!result.pbi_targets.empty());
return result;
}
@@ -263,7 +251,8 @@ BackfillState::Enqueuing::remove_on_peers(const hobject_t& check)
BackfillState::Enqueuing::result_t
BackfillState::Enqueuing::update_on_peers(const hobject_t& check)
{
- logger().debug("{}: check={}", __func__, check);
+ LOG_PREFIX(BackfillState::Enqueuing::update_on_peers);
+ DEBUGDPP("check={}", pg(), check);
const auto& primary_bi = backfill_state().backfill_info;
result_t result { {}, primary_bi.begin };
std::map<hobject_t, std::pair<eversion_t, std::vector<pg_shard_t>>> backfills;
@@ -324,6 +313,7 @@ bool BackfillState::Enqueuing::Enqueuing::all_emptied(
BackfillState::Enqueuing::Enqueuing(my_context ctx)
: my_base(ctx)
{
+ LOG_PREFIX(BackfillState::Enqueuing::Enqueuing);
auto& primary_bi = backfill_state().backfill_info;
// update our local interval to cope with recent changes
@@ -333,8 +323,7 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
// that backfill will be spinning here over and over. For the sake
// of performance and complexity we don't synchronize with entire PG.
// similar can happen in classical OSD.
- logger().warn("{}: bi is old, rescanning of local backfill_info",
- __func__);
+ WARNDPP("bi is old, rescanning of local backfill_info", pg());
post_event(RequestPrimaryScanning{});
return;
} else {
@@ -346,13 +335,14 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
primary_bi)) {
// need to grab one another chunk of the object namespace and restart
// the queueing.
- logger().debug("{}: reached end for current local chunk", __func__);
+ DEBUGDPP("reached end for current local chunk", pg());
post_event(RequestPrimaryScanning{});
return;
}
do {
if (!backfill_listener().budget_available()) {
+ DEBUGDPP("throttle failed, turning to Waiting", pg());
post_event(RequestWaiting{});
return;
} else if (should_rescan_replicas(backfill_state().peer_backfill_info,
@@ -378,28 +368,38 @@ BackfillState::Enqueuing::Enqueuing(my_context ctx)
trim_backfilled_object_from_intervals(std::move(result),
backfill_state().last_backfill_started,
backfill_state().peer_backfill_info);
- } else {
+ backfill_listener().maybe_flush();
+ } else if (!primary_bi.empty()) {
auto result = update_on_peers(check);
trim_backfilled_object_from_intervals(std::move(result),
backfill_state().last_backfill_started,
backfill_state().peer_backfill_info);
- if (!primary_bi.empty()) {
- primary_bi.pop_front();
- }
+ primary_bi.pop_front();
+ backfill_listener().maybe_flush();
+ } else {
+ break;
}
- backfill_listener().maybe_flush();
} while (!all_emptied(primary_bi, backfill_state().peer_backfill_info));
- if (backfill_state().progress_tracker->tracked_objects_completed()
- && Enqueuing::all_enqueued(peering_state(),
- backfill_state().backfill_info,
- backfill_state().peer_backfill_info)) {
- backfill_state().last_backfill_started = hobject_t::get_max();
- backfill_listener().update_peers_last_backfill(hobject_t::get_max());
+ if (should_rescan_primary(backfill_state().peer_backfill_info,
+ primary_bi)) {
+ // need to grab another chunk of the object namespace and restart
+ // the queueing.
+ DEBUGDPP("reached end for current local chunk", pg());
+ post_event(RequestPrimaryScanning{});
+ return;
+ } else {
+ if (backfill_state().progress_tracker->tracked_objects_completed()
+ && Enqueuing::all_enqueued(peering_state(),
+ backfill_state().backfill_info,
+ backfill_state().peer_backfill_info)) {
+ backfill_state().last_backfill_started = hobject_t::get_max();
+ backfill_listener().update_peers_last_backfill(hobject_t::get_max());
+ }
+ DEBUGDPP("reached end for both local and all peers "
+ "but still has in-flight operations", pg());
+ post_event(RequestWaiting{});
}
- logger().debug("{}: reached end for both local and all peers "
- "but still has in-flight operations", __func__);
- post_event(RequestWaiting{});
}
// -- PrimaryScanning
@@ -414,16 +414,45 @@ BackfillState::PrimaryScanning::PrimaryScanning(my_context ctx)
boost::statechart::result
BackfillState::PrimaryScanning::react(PrimaryScanned evt)
{
- logger().debug("{}", __func__);
+ LOG_PREFIX(BackfillState::PrimaryScanning::react::PrimaryScanned);
+ DEBUGDPP("", pg());
backfill_state().backfill_info = std::move(evt.result);
- return transit<Enqueuing>();
+ if (!backfill_state().is_suspended()) {
+ return transit<Enqueuing>();
+ } else {
+ DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+ backfill_state().go_enqueuing_on_resume();
+ }
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(CancelBackfill evt)
+{
+ LOG_PREFIX(BackfillState::PrimaryScanning::react::SuspendBackfill);
+ DEBUGDPP("suspended within PrimaryScanning", pg());
+ backfill_state().on_suspended();
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::PrimaryScanning::react(Triggered evt)
+{
+ LOG_PREFIX(BackfillState::PrimaryScanning::react::Triggered);
+ ceph_assert(backfill_state().is_suspended());
+ if (backfill_state().on_resumed()) {
+ DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+ return transit<Enqueuing>();
+ }
+ return discard_event();
}
boost::statechart::result
BackfillState::PrimaryScanning::react(ObjectPushed evt)
{
- logger().debug("PrimaryScanning::react() on ObjectPushed; evt.object={}",
- evt.object);
+ LOG_PREFIX(BackfillState::PrimaryScanning::react::ObjectPushed);
+ DEBUGDPP("PrimaryScanning::react() on ObjectPushed; evt.object={}",
+ pg(), evt.object);
backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true);
return discard_event();
}
@@ -441,11 +470,11 @@ bool BackfillState::ReplicasScanning::replica_needs_scan(
BackfillState::ReplicasScanning::ReplicasScanning(my_context ctx)
: my_base(ctx)
{
+ LOG_PREFIX(BackfillState::ReplicasScanning::ReplicasScanning);
for (const auto& bt : peering_state().get_backfill_targets()) {
if (const auto& pbi = backfill_state().peer_backfill_info.at(bt);
replica_needs_scan(pbi, backfill_state().backfill_info)) {
- logger().debug("{}: scanning peer osd.{} from {}",
- __func__, bt, pbi.end);
+ DEBUGDPP("scanning peer osd.{} from {}", pg(), bt, pbi.end);
backfill_listener().request_replica_scan(bt, pbi.end, hobject_t{});
ceph_assert(waiting_on_backfill.find(bt) == \
@@ -467,8 +496,9 @@ BackfillState::ReplicasScanning::~ReplicasScanning()
boost::statechart::result
BackfillState::ReplicasScanning::react(ReplicaScanned evt)
{
- logger().debug("{}: got scan result from osd={}, result={}",
- __func__, evt.from, evt.result);
+ LOG_PREFIX(BackfillState::ReplicasScanning::react::ReplicaScanned);
+ DEBUGDPP("got scan result from osd={}, result={}",
+ pg(), evt.from, evt.result);
// TODO: maybe we'll be able to move waiting_on_backfill from
// the machine to the state.
ceph_assert(peering_state().is_backfill_target(evt.from));
@@ -477,12 +507,17 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
if (waiting_on_backfill.empty()) {
ceph_assert(backfill_state().peer_backfill_info.size() == \
peering_state().get_backfill_targets().size());
- return transit<Enqueuing>();
+ if (!backfill_state().is_suspended()) {
+ return transit<Enqueuing>();
+ } else {
+ DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+ backfill_state().go_enqueuing_on_resume();
+ }
}
} else {
- // we canceled backfill for a while due to a too full, and this
+ // we suspended backfill for a while due to a too-full peer, and this
// is an extra response from a non-too-full peer
- logger().debug("{}: canceled backfill (too full?)", __func__);
+ DEBUGDPP("suspended backfill (too full?)", pg());
}
return discard_event();
}
@@ -490,17 +525,30 @@ BackfillState::ReplicasScanning::react(ReplicaScanned evt)
boost::statechart::result
BackfillState::ReplicasScanning::react(CancelBackfill evt)
{
- logger().debug("{}: cancelled within ReplicasScanning",
- __func__);
- waiting_on_backfill.clear();
- return transit<Cancelled>();
+ LOG_PREFIX(BackfillState::ReplicasScanning::react::SuspendBackfill);
+ DEBUGDPP("suspended within ReplicasScanning", pg());
+ backfill_state().on_suspended();
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::ReplicasScanning::react(Triggered evt)
+{
+ LOG_PREFIX(BackfillState::ReplicasScanning::react::Triggered);
+ ceph_assert(backfill_state().is_suspended());
+ if (backfill_state().on_resumed()) {
+ DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+ return transit<Enqueuing>();
+ }
+ return discard_event();
}
boost::statechart::result
BackfillState::ReplicasScanning::react(ObjectPushed evt)
{
- logger().debug("ReplicasScanning::react() on ObjectPushed; evt.object={}",
- evt.object);
+ LOG_PREFIX(BackfillState::ReplicasScanning::react::ObjectPushed);
+ DEBUGDPP("ReplicasScanning::react() on ObjectPushed; evt.object={}",
+ pg(), evt.object);
backfill_state().progress_tracker->complete_to(evt.object, evt.stat, true);
return discard_event();
}
@@ -515,17 +563,45 @@ BackfillState::Waiting::Waiting(my_context ctx)
boost::statechart::result
BackfillState::Waiting::react(ObjectPushed evt)
{
- logger().debug("Waiting::react() on ObjectPushed; evt.object={}",
- evt.object);
+ LOG_PREFIX(BackfillState::Waiting::react::ObjectPushed);
+ DEBUGDPP("Waiting::react() on ObjectPushed; evt.object={}", pg(), evt.object);
backfill_state().progress_tracker->complete_to(evt.object, evt.stat, false);
- return transit<Enqueuing>();;
+ if (!backfill_state().is_suspended()) {
+ return transit<Enqueuing>();
+ } else {
+ DEBUGDPP("backfill suspended, not going Enqueuing", pg());
+ backfill_state().go_enqueuing_on_resume();
+ }
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(CancelBackfill evt)
+{
+ LOG_PREFIX(BackfillState::Waiting::react::SuspendBackfill);
+ DEBUGDPP("suspended within Waiting", pg());
+ backfill_state().on_suspended();
+ return discard_event();
+}
+
+boost::statechart::result
+BackfillState::Waiting::react(Triggered evt)
+{
+ LOG_PREFIX(BackfillState::Waiting::react::Triggered);
+ ceph_assert(backfill_state().is_suspended());
+ if (backfill_state().on_resumed()) {
+ DEBUGDPP("Backfill resumed, going Enqueuing", pg());
+ return transit<Enqueuing>();
+ }
+ return discard_event();
}
// -- Done
BackfillState::Done::Done(my_context ctx)
: my_base(ctx)
{
- logger().info("{}: backfill is done", __func__);
+ LOG_PREFIX(BackfillState::Done::Done);
+ INFODPP("backfill is done", pg());
backfill_listener().backfilled();
}
@@ -535,13 +611,6 @@ BackfillState::Crashed::Crashed()
ceph_abort_msg("{}: this should not happen");
}
-// -- Cancelled
-BackfillState::Cancelled::Cancelled(my_context ctx)
- : my_base(ctx)
-{
- ceph_assert(peering_state().get_backfill_targets().size());
-}
-
// ProgressTracker is an intermediary between the BackfillListener and
// BackfillMachine + its states. All requests to push or drop an object
// are directed through it. The same happens with notifications about
@@ -575,8 +644,8 @@ void BackfillState::ProgressTracker::complete_to(
const pg_stat_t& stats,
bool may_push_to_max)
{
- logger().debug("{}: obj={}",
- __func__, obj);
+ LOG_PREFIX(BackfillState::ProgressTracker::complete_to);
+ DEBUGDPP("obj={}", pg(), obj);
if (auto completion_iter = registry.find(obj);
completion_iter != std::end(registry)) {
completion_iter->second = \
@@ -609,4 +678,27 @@ void BackfillState::ProgressTracker::complete_to(
}
}
+void BackfillState::enqueue_standalone_push(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers) {
+ progress_tracker->enqueue_push(obj);
+ backfill_machine.backfill_listener.enqueue_push(obj, v, peers);
+}
+
+void BackfillState::enqueue_standalone_delete(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers)
+{
+ progress_tracker->enqueue_drop(obj);
+ for (auto bt : peers) {
+ backfill_machine.backfill_listener.enqueue_drop(bt, obj, v);
+ }
+}
+
+std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg) {
+ return pg.print(out);
+}
+
} // namespace crimson::osd
diff --git a/src/crimson/osd/backfill_state.h b/src/crimson/osd/backfill_state.h
index ddc0cbf7355..517a02ea4df 100644
--- a/src/crimson/osd/backfill_state.h
+++ b/src/crimson/osd/backfill_state.h
@@ -14,6 +14,7 @@
#include <boost/statechart/transition.hpp>
#include "osd/recovery_types.h"
+#include "osd/PGLog.h"
namespace crimson::osd {
@@ -61,6 +62,8 @@ struct BackfillState {
struct CancelBackfill : sc::event<CancelBackfill> {
};
+ struct ThrottleAcquired : sc::event<ThrottleAcquired> {
+ };
private:
// internal events
struct RequestPrimaryScanning : sc::event<RequestPrimaryScanning> {
@@ -135,34 +138,10 @@ public:
explicit Crashed();
};
- struct Cancelled : sc::state<Cancelled, BackfillMachine>,
- StateHelper<Cancelled> {
- using reactions = boost::mpl::list<
- sc::custom_reaction<Triggered>,
- sc::custom_reaction<PrimaryScanned>,
- sc::custom_reaction<ReplicaScanned>,
- sc::custom_reaction<ObjectPushed>,
- sc::transition<sc::event_base, Crashed>>;
- explicit Cancelled(my_context);
- // resume after triggering backfill by on_activate_complete().
- // transit to Enqueuing.
- sc::result react(const Triggered&);
- sc::result react(const PrimaryScanned&) {
- return discard_event();
- }
- sc::result react(const ReplicaScanned&) {
- return discard_event();
- }
- sc::result react(const ObjectPushed&) {
- return discard_event();
- }
- };
-
struct Initial : sc::state<Initial, BackfillMachine>,
StateHelper<Initial> {
using reactions = boost::mpl::list<
sc::custom_reaction<Triggered>,
- sc::transition<CancelBackfill, Cancelled>,
sc::transition<sc::event_base, Crashed>>;
explicit Initial(my_context);
// initialize after triggering backfill by on_activate_complete().
@@ -173,12 +152,9 @@ public:
struct Enqueuing : sc::state<Enqueuing, BackfillMachine>,
StateHelper<Enqueuing> {
using reactions = boost::mpl::list<
- sc::transition<CancelBackfill, Cancelled>,
sc::transition<RequestPrimaryScanning, PrimaryScanning>,
sc::transition<RequestReplicasScanning, ReplicasScanning>,
sc::transition<RequestWaiting, Waiting>,
- sc::transition<RequestDone, Done>,
- sc::transition<CancelBackfill, Cancelled>,
sc::transition<sc::event_base, Crashed>>;
explicit Enqueuing(my_context);
@@ -236,12 +212,15 @@ public:
sc::custom_reaction<ObjectPushed>,
sc::custom_reaction<PrimaryScanned>,
sc::transition<RequestDone, Done>,
- sc::transition<CancelBackfill, Cancelled>,
+ sc::custom_reaction<CancelBackfill>,
+ sc::custom_reaction<Triggered>,
sc::transition<sc::event_base, Crashed>>;
explicit PrimaryScanning(my_context);
sc::result react(ObjectPushed);
// collect scanning result and transit to Enqueuing.
sc::result react(PrimaryScanned);
+ sc::result react(CancelBackfill);
+ sc::result react(Triggered);
};
struct ReplicasScanning : sc::state<ReplicasScanning, BackfillMachine>,
@@ -250,6 +229,7 @@ public:
sc::custom_reaction<ObjectPushed>,
sc::custom_reaction<ReplicaScanned>,
sc::custom_reaction<CancelBackfill>,
+ sc::custom_reaction<Triggered>,
sc::transition<RequestDone, Done>,
sc::transition<sc::event_base, Crashed>>;
explicit ReplicasScanning(my_context);
@@ -258,6 +238,7 @@ public:
sc::result react(ObjectPushed);
sc::result react(ReplicaScanned);
sc::result react(CancelBackfill);
+ sc::result react(Triggered);
// indicate whether a particular peer should be scanned to retrieve
// BackfillInterval for new range of hobject_t namespace.
@@ -276,17 +257,25 @@ public:
using reactions = boost::mpl::list<
sc::custom_reaction<ObjectPushed>,
sc::transition<RequestDone, Done>,
- sc::transition<CancelBackfill, Cancelled>,
+ sc::custom_reaction<CancelBackfill>,
+ sc::custom_reaction<Triggered>,
+ sc::transition<ThrottleAcquired, Enqueuing>,
sc::transition<sc::event_base, Crashed>>;
explicit Waiting(my_context);
sc::result react(ObjectPushed);
+ sc::result react(CancelBackfill);
+ sc::result react(Triggered);
};
struct Done : sc::state<Done, BackfillMachine>,
StateHelper<Done> {
using reactions = boost::mpl::list<
+ sc::custom_reaction<CancelBackfill>,
sc::transition<sc::event_base, Crashed>>;
explicit Done(my_context);
+ sc::result react(CancelBackfill) {
+ return discard_event();
+ }
};
BackfillState(BackfillListener& backfill_listener,
@@ -299,6 +288,20 @@ public:
backfill_machine.process_event(*std::move(evt));
}
+ void enqueue_standalone_push(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers);
+ void enqueue_standalone_delete(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers);
+
+
+ bool is_triggered() const {
+ return backfill_machine.triggering_event() != nullptr;
+ }
+
hobject_t get_last_backfill_started() const {
return last_backfill_started;
}
@@ -311,6 +314,26 @@ public:
}
}
private:
+ struct backfill_suspend_state_t {
+ bool suspended = false;
+ bool should_go_enqueuing = false;
+ } backfill_suspend_state;
+ bool is_suspended() const {
+ return backfill_suspend_state.suspended;
+ }
+ void on_suspended() {
+ ceph_assert(!is_suspended());
+ backfill_suspend_state = {true, false};
+ }
+ bool on_resumed() {
+ auto go_enqueuing = backfill_suspend_state.should_go_enqueuing;
+ backfill_suspend_state = {false, false};
+ return go_enqueuing;
+ }
+ void go_enqueuing_on_resume() {
+ ceph_assert(is_suspended());
+ backfill_suspend_state.should_go_enqueuing = true;
+ }
hobject_t last_backfill_started;
BackfillInterval backfill_info;
std::map<pg_shard_t, BackfillInterval> peer_backfill_info;
@@ -363,6 +386,7 @@ struct BackfillState::PeeringFacade {
virtual hobject_t earliest_backfill() const = 0;
virtual const std::set<pg_shard_t>& get_backfill_targets() const = 0;
virtual const hobject_t& get_peer_last_backfill(pg_shard_t peer) const = 0;
+ virtual const PGLog& get_pg_log() const = 0;
virtual const eversion_t& get_last_update() const = 0;
virtual const eversion_t& get_log_tail() const = 0;
@@ -388,8 +412,12 @@ struct BackfillState::PeeringFacade {
// of behaviour that must be provided by a unit test's mock.
struct BackfillState::PGFacade {
virtual const eversion_t& get_projected_last_update() const = 0;
+ virtual const PGLog::IndexedLog& get_projected_log() const = 0;
+
+ virtual std::ostream &print(std::ostream &out) const = 0;
virtual ~PGFacade() {}
};
+std::ostream &operator<<(std::ostream &out, const BackfillState::PGFacade &pg);
class BackfillState::ProgressTracker {
// TODO: apply_stat,
@@ -416,6 +444,9 @@ class BackfillState::ProgressTracker {
BackfillListener& backfill_listener() {
return backfill_machine.backfill_listener;
}
+ PGFacade& pg() {
+ return *backfill_machine.pg;
+ }
public:
ProgressTracker(BackfillMachine& backfill_machine)
@@ -430,3 +461,9 @@ public:
};
} // namespace crimson::osd
+
+#if FMT_VERSION >= 90000
+template <> struct fmt::formatter<crimson::osd::BackfillState::PGFacade>
+ : fmt::ostream_formatter {};
+#endif
+
diff --git a/src/crimson/osd/ec_backend.cc b/src/crimson/osd/ec_backend.cc
index 32eaaf02b3f..007d0bf35f3 100644
--- a/src/crimson/osd/ec_backend.cc
+++ b/src/crimson/osd/ec_backend.cc
@@ -26,6 +26,7 @@ ECBackend::_read(const hobject_t& hoid,
ECBackend::rep_op_fut_t
ECBackend::submit_transaction(const std::set<pg_shard_t> &pg_shards,
const hobject_t& hoid,
+ crimson::osd::ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& osd_op_p,
epoch_t min_epoch, epoch_t max_epoch,
diff --git a/src/crimson/osd/ec_backend.h b/src/crimson/osd/ec_backend.h
index 90a7e2b1f4d..b14c78c9fc4 100644
--- a/src/crimson/osd/ec_backend.h
+++ b/src/crimson/osd/ec_backend.h
@@ -28,6 +28,7 @@ private:
rep_op_fut_t
submit_transaction(const std::set<pg_shard_t> &pg_shards,
const hobject_t& hoid,
+ crimson::osd::ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& req,
epoch_t min_epoch, epoch_t max_epoch,
diff --git a/src/crimson/osd/heartbeat.cc b/src/crimson/osd/heartbeat.cc
index 03986952b4f..5902fc8c14f 100644
--- a/src/crimson/osd/heartbeat.cc
+++ b/src/crimson/osd/heartbeat.cc
@@ -9,6 +9,7 @@
#include "messages/MOSDPing.h"
#include "messages/MOSDFailure.h"
+#include "msg/msg_types.h"
#include "crimson/common/config_proxy.h"
#include "crimson/common/formatter.h"
diff --git a/src/crimson/osd/main.cc b/src/crimson/osd/main.cc
index fa387804dcd..0bfd3e2266b 100644
--- a/src/crimson/osd/main.cc
+++ b/src/crimson/osd/main.cc
@@ -24,6 +24,7 @@
#include "crimson/common/buffer_io.h"
#include "crimson/common/config_proxy.h"
#include "crimson/common/fatal_signal.h"
+#include "crimson/common/perf_counters_collection.h"
#include "crimson/mon/MonClient.h"
#include "crimson/net/Messenger.h"
#include "crimson/osd/stop_signal.h"
@@ -201,7 +202,7 @@ int main(int argc, const char* argv[])
true);
}
auto store = crimson::os::FuturizedStore::create(
- local_conf().get_val<std::string>("osd_objectstore"),
+ local_conf().get_val<std::string>("crimson_osd_objectstore"),
local_conf().get_val<std::string>("osd_data"),
local_conf().get_config_values());
diff --git a/src/crimson/osd/main_config_bootstrap_helpers.cc b/src/crimson/osd/main_config_bootstrap_helpers.cc
index 3596929527f..e4920eb870f 100644
--- a/src/crimson/osd/main_config_bootstrap_helpers.cc
+++ b/src/crimson/osd/main_config_bootstrap_helpers.cc
@@ -17,10 +17,13 @@
#include "crimson/common/buffer_io.h"
#include "crimson/common/config_proxy.h"
#include "crimson/common/fatal_signal.h"
+#include "crimson/common/perf_counters_collection.h"
#include "crimson/mon/MonClient.h"
#include "crimson/net/Messenger.h"
#include "crimson/osd/main_config_bootstrap_helpers.h"
+#include <sys/wait.h> // for waitpid()
+
using namespace std::literals;
using crimson::common::local_conf;
using crimson::common::sharded_conf;
diff --git a/src/crimson/osd/object_context.h b/src/crimson/osd/object_context.h
index e17af91e3ad..4195e5dc597 100644
--- a/src/crimson/osd/object_context.h
+++ b/src/crimson/osd/object_context.h
@@ -9,6 +9,7 @@
#include <seastar/core/shared_future.hh>
#include <seastar/core/shared_ptr.hh>
+#include "common/fmt_common.h"
#include "common/intrusive_lru.h"
#include "osd/object_state.h"
#include "crimson/common/exception.h"
@@ -73,6 +74,8 @@ public:
using watch_key_t = std::pair<uint64_t, entity_name_t>;
std::map<watch_key_t, seastar::shared_ptr<crimson::osd::Watch>> watchers;
+ CommonOBCPipeline obc_pipeline;
+
ObjectContext(hobject_t hoid) : lock(hoid),
obs(std::move(hoid)) {}
@@ -128,30 +131,49 @@ public:
}
bool is_valid() const {
- return !invalidated_by_interval_change;
+ return !invalidated;
}
private:
- template <typename Lock, typename Func>
- auto _with_lock(Lock& lock, Func&& func) {
- return lock.lock(
- ).then([&lock, func=std::forward<Func>(func), obc=Ref(this)]() mutable {
- return seastar::futurize_invoke(
- func
- ).finally([&lock, obc=std::move(obc)] {
- /* We chain the finally block here because it's possible for lock.lock()
- * above to fail due to a call to ObjectContext::interrupt, which calls
- * tri_mutex::abort. In the event of such an error, the lock isn't
- * actually taken and calling unlock() would be incorrect. */
- lock.unlock();
- });
- });
- }
-
boost::intrusive::list_member_hook<> obc_accessing_hook;
uint64_t list_link_cnt = 0;
+
+ /**
+ * loading_started
+ *
+ * ObjectContext instances may be used for pipeline stages
+ * prior to actually being loaded.
+ *
+ * ObjectContextLoader::load_and_lock* use loading_started
+ * to determine whether to initiate loading or simply take
+ * the desired lock directly.
+ *
+ * If loading_started is not set, the task must set it and
+ * (synchronously) take an exclusive lock. That exclusive lock
+ * must be held until the loading completes, at which point the
+ * lock may be relaxed or released.
+ *
+ * If loading_started is set, it is safe to directly take
+ * the desired lock, once the lock is obtained loading may
+ * be assumed to be complete.
+ *
+ * loading_started, once set, remains set for the lifetime
+ * of the object.
+ */
+ bool loading_started = false;
+
+ /// true once set_*_state has been called, used for debugging
bool fully_loaded = false;
- bool invalidated_by_interval_change = false;
+
+ /**
+ * invalidated
+ *
+ * Set to true upon eviction from cache. This happens to all
+ * cached obc's upon interval change and to the target of
+ * a repop received on a replica to ensure that the cached
+ * state is refreshed upon subsequent replica read.
+ */
+ bool invalidated = false;
friend class ObjectContextRegistry;
friend class ObjectContextLoader;
@@ -172,122 +194,20 @@ public:
}
}
+ template <typename FormatContext>
+ auto fmt_print_ctx(FormatContext & ctx) const {
+ return fmt::format_to(
+ ctx.out(), "ObjectContext({}, oid={}, refcount={})",
+ (void*)this,
+ get_oid(),
+ get_use_count());
+ }
+
using obc_accessing_option_t = boost::intrusive::member_hook<
ObjectContext,
boost::intrusive::list_member_hook<>,
&ObjectContext::obc_accessing_hook>;
- template<RWState::State Type, typename InterruptCond = void, typename Func>
- auto with_lock(Func&& func) {
- if constexpr (!std::is_void_v<InterruptCond>) {
- auto wrapper = ::crimson::interruptible::interruptor<InterruptCond>::wrap_function(std::forward<Func>(func));
- switch (Type) {
- case RWState::RWWRITE:
- return _with_lock(lock.for_write(), std::move(wrapper));
- case RWState::RWREAD:
- return _with_lock(lock.for_read(), std::move(wrapper));
- case RWState::RWEXCL:
- return _with_lock(lock.for_excl(), std::move(wrapper));
- case RWState::RWNONE:
- return seastar::futurize_invoke(std::move(wrapper));
- default:
- assert(0 == "noop");
- }
- } else {
- switch (Type) {
- case RWState::RWWRITE:
- return _with_lock(lock.for_write(), std::forward<Func>(func));
- case RWState::RWREAD:
- return _with_lock(lock.for_read(), std::forward<Func>(func));
- case RWState::RWEXCL:
- return _with_lock(lock.for_excl(), std::forward<Func>(func));
- case RWState::RWNONE:
- return seastar::futurize_invoke(std::forward<Func>(func));
- default:
- assert(0 == "noop");
- }
- }
- }
-
- /**
- * load_then_with_lock
- *
- * Takes two functions as arguments -- load_func to be invoked
- * with an exclusive lock, and func to be invoked under the
- * lock type specified by the Type template argument.
- *
- * Caller must ensure that *this is not already locked, presumably
- * by invoking load_then_with_lock immediately after construction.
- *
- * @param [in] load_func Function to be invoked under excl lock
- * @param [in] func Function to be invoked after load_func under
- * lock of type Type.
- */
- template<RWState::State Type, typename Func, typename Func2>
- auto load_then_with_lock(Func &&load_func, Func2 &&func) {
- class lock_state_t {
- tri_mutex *lock = nullptr;
- bool excl = false;
-
- public:
- lock_state_t(tri_mutex &lock) : lock(&lock), excl(true) {
- ceph_assert(lock.try_lock_for_excl());
- }
- lock_state_t(lock_state_t &&o) : lock(o.lock), excl(o.excl) {
- o.lock = nullptr;
- o.excl = false;
- }
- lock_state_t() = delete;
- lock_state_t &operator=(lock_state_t &&o) = delete;
- lock_state_t(const lock_state_t &o) = delete;
- lock_state_t &operator=(const lock_state_t &o) = delete;
-
- void demote() {
- ceph_assert(excl);
- ceph_assert(lock);
- if constexpr (Type == RWState::RWWRITE) {
- lock->demote_to_write();
- } else if constexpr (Type == RWState::RWREAD) {
- lock->demote_to_read();
- } else if constexpr (Type == RWState::RWNONE) {
- lock->unlock_for_excl();
- }
- excl = false;
- }
-
- ~lock_state_t() {
- if (!lock)
- return;
-
- if constexpr (Type == RWState::RWEXCL) {
- lock->unlock_for_excl();
- } else {
- if (excl) {
- lock->unlock_for_excl();
- return;
- }
-
- if constexpr (Type == RWState::RWWRITE) {
- lock->unlock_for_write();
- } else if constexpr (Type == RWState::RWREAD) {
- lock->unlock_for_read();
- }
- }
- }
- };
-
- return seastar::do_with(
- lock_state_t{lock},
- [load_func=std::move(load_func), func=std::move(func)](auto &ls) mutable {
- return std::invoke(
- std::move(load_func)
- ).si_then([func=std::move(func), &ls]() mutable {
- ls.demote();
- return std::invoke(std::move(func));
- });
- });
- }
-
bool empty() const {
return !lock.is_acquired();
}
@@ -313,12 +233,14 @@ public:
void clear_range(const hobject_t &from,
const hobject_t &to) {
- obc_lru.clear_range(from, to);
+ obc_lru.clear_range(from, to, [](auto &obc) {
+ obc.invalidated = true;
+ });
}
void invalidate_on_interval_change() {
obc_lru.clear([](auto &obc) {
- obc.invalidated_by_interval_change = true;
+ obc.invalidated = true;
});
}
@@ -336,3 +258,6 @@ std::optional<hobject_t> resolve_oid(const SnapSet &ss,
const hobject_t &oid);
} // namespace crimson::osd
+
+template <>
+struct fmt::formatter<RWState::State> : fmt::ostream_formatter {};
diff --git a/src/crimson/osd/object_context_loader.cc b/src/crimson/osd/object_context_loader.cc
index 12aa40b925a..483251a23b5 100644
--- a/src/crimson/osd/object_context_loader.cc
+++ b/src/crimson/osd/object_context_loader.cc
@@ -1,3 +1,4 @@
+#include "crimson/common/coroutine.h"
#include "crimson/osd/object_context_loader.h"
#include "osd/osd_types_fmt.h"
#include "osd/object_state_fmt.h"
@@ -8,207 +9,162 @@ namespace crimson::osd {
using crimson::common::local_conf;
- template<RWState::State State>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_head_obc(const hobject_t& oid,
- with_obc_func_t&& func)
- {
- return with_locked_obc<State, true /* track */>(
- oid,
- [func=std::move(func)](auto obc) {
- // The template with_obc_func_t wrapper supports two obcs (head and clone).
- // In the 'with_head_obc' case, however, only the head is in use.
- // Pass the same head obc twice in order to
- // to support the generic with_obc sturcture.
- return std::invoke(std::move(func), obc, obc);
- });
+
+// Coroutine: obtain the head obc named by manager.target, load it if nobody
+// has started loading it yet, and leave it locked as lock_type in
+// manager.target_state. head_state and target_state end up referring to the
+// same obc.
+ObjectContextLoader::load_and_lock_fut
+ObjectContextLoader::load_and_lock_head(Manager &manager, RWState::State lock_type)
+{
+ LOG_PREFIX(ObjectContextLoader::load_and_lock_head);
+ DEBUGDPP("{} {}", dpp, manager.target, lock_type);
+ // deferred manager.release(); runs if this coroutine is torn down before
+ // reaching releaser.cancel() at the bottom
+ auto releaser = manager.get_releaser();
+ ceph_assert(manager.target.is_head());
+
+ // head_state may already be populated (see get_obc_manager(Orderer&, ...))
+ if (manager.head_state.is_empty()) {
+ auto [obc, _] = obc_registry.get_cached_obc(manager.target);
+ manager.set_state_obc(manager.head_state, obc);
+ }
+ ceph_assert(manager.target_state.is_empty());
+ manager.set_state_obc(manager.target_state, manager.head_state.obc);
+
+ if (manager.target_state.obc->loading_started) {
+ // someone else already started (or finished) the load: just queue for
+ // the requested lock
+ co_await manager.target_state.lock_to(lock_type);
+ } else {
+ // first user: take the exclusive lock synchronously (lock_excl_sync
+ // asserts it is free), mark loading_started before suspending, load,
+ // then demote to the lock the caller asked for
+ manager.target_state.lock_excl_sync();
+ manager.target_state.obc->loading_started = true;
+ co_await load_obc(manager.target_state.obc);
+ manager.target_state.demote_excl_to(lock_type);
}
+ // success: keep the obcs and locks held by the manager
+ releaser.cancel();
+}
+
+// Coroutine: load/lock the clone obc named by manager.target. The head obc
+// is loaded first if needed (and read-locked when lock_head is true), the
+// target is optionally resolved against the head's snapset, and the resulting
+// obc is left in manager.target_state.
+ObjectContextLoader::load_and_lock_fut
+ObjectContextLoader::load_and_lock_clone(
+ Manager &manager, RWState::State lock_type, bool lock_head)
+{
+ LOG_PREFIX(ObjectContextLoader::load_and_lock_clone);
+ DEBUGDPP("{} {}", dpp, manager.target, lock_type);
+ // deferred manager.release(); cancelled at the bottom on success
+ auto releaser = manager.get_releaser();
- template<RWState::State State>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_clone_obc(const hobject_t& oid,
- with_obc_func_t&& func,
- bool resolve_clone)
- {
- LOG_PREFIX(ObjectContextLoader::with_clone_obc);
- assert(!oid.is_head());
- return with_head_obc<RWState::RWREAD>(
- oid.get_head(),
- [FNAME, oid, func=std::move(func), resolve_clone, this]
- (auto head, auto) mutable -> load_obc_iertr::future<> {
- if (!head->obs.exists) {
- ERRORDPP("head doesn't exist for object {}", dpp, head->obs.oi.soid);
- return load_obc_iertr::future<>{
- crimson::ct_error::enoent::make()
- };
- }
- return this->with_clone_obc_only<State>(std::move(head),
- oid,
- std::move(func),
- resolve_clone);
- });
+ ceph_assert(!manager.target.is_head());
+ ceph_assert(manager.target_state.is_empty());
+
+ if (manager.head_state.is_empty()) {
+ auto [obc, _] = obc_registry.get_cached_obc(manager.target.get_head());
+ manager.set_state_obc(manager.head_state, obc);
}
- template<RWState::State State>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_clone_obc_only(ObjectContextRef head,
- hobject_t clone_oid,
- with_obc_func_t&& func,
- bool resolve_clone)
- {
- LOG_PREFIX(ObjectContextLoader::with_clone_obc_only);
- DEBUGDPP("{}", dpp, clone_oid);
- assert(!clone_oid.is_head());
- if (resolve_clone) {
- auto resolved_oid = resolve_oid(head->get_head_ss(), clone_oid);
- if (!resolved_oid) {
- ERRORDPP("clone {} not found", dpp, clone_oid);
- return load_obc_iertr::future<>{
- crimson::ct_error::enoent::make()
- };
- }
- if (resolved_oid->is_head()) {
- // See resolve_oid
- return std::move(func)(head, head);
- }
- clone_oid = *resolved_oid;
- }
- return with_locked_obc<State, false /* don't track */>(
- clone_oid,
- [head=std::move(head), func=std::move(func)](auto clone) {
- clone->set_clone_ssc(head->ssc);
- return std::move(func)(std::move(head), std::move(clone));
- });
+ if (!manager.head_state.obc->loading_started) {
+ // caller is responsible for pre-populating a loaded obc if lock_head is
+ // false
+ ceph_assert(lock_head);
+ manager.head_state.lock_excl_sync();
+ manager.head_state.obc->loading_started = true;
+ co_await load_obc(manager.head_state.obc);
+ manager.head_state.demote_excl_to(RWState::RWREAD);
+ } else if (lock_head) {
+ co_await manager.head_state.lock_to(RWState::RWREAD);
}
- template<RWState::State State>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc(hobject_t oid,
- with_obc_func_t&& func,
- bool resolve_clone)
- {
- if (oid.is_head()) {
- return with_head_obc<State>(oid, std::move(func));
- } else {
- return with_clone_obc<State>(oid, std::move(func), resolve_clone);
+ if (manager.options.resolve_clone) {
+ auto resolved_oid = resolve_oid(
+ manager.head_state.obc->get_head_ss(),
+ manager.target);
+ if (!resolved_oid) {
+ ERRORDPP("clone {} not found", dpp, manager.target);
+ // awaiting an errored future; presumably this ends the coroutine with
+ // enoent so the dereference of resolved_oid below is never reached on
+ // this path -- confirm against the errorator coroutine support
+ co_await load_obc_iertr::future<>(
+ crimson::ct_error::enoent::make()
+ );
}
+ // note: might be head if snap was taken after most recent write!
+ manager.target = *resolved_oid;
}
- template<RWState::State State, bool track, typename Func>
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_locked_obc(const hobject_t& oid,
- Func&& func)
- {
- LOG_PREFIX(ObjectContextLoader::with_locked_obc);
- auto [obc, existed] = obc_registry.get_cached_obc(oid);
- DEBUGDPP("object {} existed {}",
- dpp, obc->get_oid(), existed);
- if constexpr (track) {
- obc->append_to(obc_set_accessing);
+ if (manager.target.is_head()) {
+ /* Yes, we assert at the top that manager.target is not head. However, it's
+ * possible that the requested snap (the resolve_clone path above) actually
+ * maps to head (a read on an rbd snapshot more recent than the most recent
+ * write on this specific rbd block, for example).
+ *
+ * In such an event, it's hypothetically possible that lock_type isn't
+ * RWREAD, in which case we need to drop and reacquire the lock. However,
+ * this case is at present impossible. Actual client requests cannot write
+ * to a snapshot and will therefore always be RWREAD. The pathways that
+ * actually can mutate a clone do not set resolve_clone, so target will not
+ * become head here.
+ */
+ manager.set_state_obc(manager.target_state, manager.head_state.obc);
+ if (lock_type != manager.head_state.state) {
+ // This case isn't actually possible at the moment for the above reason.
+ manager.head_state.release_lock();
+ co_await manager.target_state.lock_to(lock_type);
+ } else {
+ // hand the lock already held via head_state over to target_state
+ // without touching the underlying lock
+ manager.target_state.state = manager.head_state.state;
+ manager.head_state.state = RWState::RWNONE;
}
- if (existed) {
- return obc->with_lock<State, IOInterruptCondition>(
- [func=std::move(func), obc=ObjectContextRef(obc)] {
- return std::invoke(std::move(func), obc);
- }
- ).finally([FNAME, this, obc=ObjectContextRef(obc)] {
- DEBUGDPP("released object {}, {}", dpp, obc->get_oid(), obc->obs);
- if constexpr (track) {
- obc->remove_from(obc_set_accessing);
- }
- });
+ } else {
+ auto [obc, _] = obc_registry.get_cached_obc(manager.target);
+ manager.set_state_obc(manager.target_state, obc);
+
+ // NOTE(review): both branches below lock the clone obc as RWREAD and do
+ // not consult lock_type; confirm this is intended for callers that pass
+ // another State (e.g. via with_clone_obc_only<State>).
+ if (manager.target_state.obc->loading_started) {
+ co_await manager.target_state.lock_to(RWState::RWREAD);
} else {
- return obc->load_then_with_lock<State> (
- [this, obc=ObjectContextRef(obc)] {
- return load_obc(obc);
- },
- [func=std::move(func), obc=ObjectContextRef(obc)] {
- return std::invoke(std::move(func), obc);
- }
- ).finally([FNAME, this, obc=ObjectContextRef(obc)] {
- DEBUGDPP("released object {}, {}", dpp, obc->get_oid(), obc->obs);
- if constexpr (track) {
- obc->remove_from(obc_set_accessing);
- }
- });
+ manager.target_state.lock_excl_sync();
+ manager.target_state.obc->loading_started = true;
+ co_await load_obc(manager.target_state.obc);
+ // NOTE(review): the clone's ssc is attached only on this first-load
+ // path (the removed code called set_clone_ssc on every access).
+ manager.target_state.obc->set_clone_ssc(manager.head_state.obc->ssc);
+ manager.target_state.demote_excl_to(RWState::RWREAD);
}
}
+ // success: keep the obcs and locks held by the manager
+ releaser.cancel();
+}
+
+// Entry point: dispatch to the head or clone variant depending on what
+// manager.target currently names.
+ObjectContextLoader::load_and_lock_fut
+ObjectContextLoader::load_and_lock(Manager &manager, RWState::State lock_type)
+{
+ LOG_PREFIX(ObjectContextLoader::load_and_lock);
+ DEBUGDPP("{} {}", dpp, manager.target, lock_type);
+ return manager.target.is_head()
+ ? load_and_lock_head(manager, lock_type)
+ : load_and_lock_clone(manager, lock_type);
+}
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::load_obc(ObjectContextRef obc)
- {
- LOG_PREFIX(ObjectContextLoader::load_obc);
- return backend.load_metadata(obc->get_oid())
+ObjectContextLoader::load_obc_iertr::future<>
+ObjectContextLoader::load_obc(ObjectContextRef obc)
+{
+ LOG_PREFIX(ObjectContextLoader::load_obc);
+ return backend.load_metadata(obc->get_oid())
.safe_then_interruptible(
[FNAME, this, obc=std::move(obc)](auto md)
-> load_obc_ertr::future<> {
- const hobject_t& oid = md->os.oi.soid;
- DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid);
- if (oid.is_head()) {
- if (!md->ssc) {
- ERRORDPP("oid {} missing snapsetcontext", dpp, oid);
- return crimson::ct_error::object_corrupted::make();
- }
- obc->set_head_state(std::move(md->os),
- std::move(md->ssc));
- } else {
- // we load and set the ssc only for head obc.
- // For clones, the head's ssc will be referenced later.
- // See set_clone_ssc
- obc->set_clone_state(std::move(md->os));
- }
- DEBUGDPP("loaded obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid);
- return seastar::now();
- });
- }
-
- ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::reload_obc(ObjectContext& obc) const
- {
- LOG_PREFIX(ObjectContextLoader::reload_obc);
- assert(obc.is_head());
- return backend.load_metadata(obc.get_oid())
- .safe_then_interruptible<false>(
- [FNAME, this, &obc](auto md)-> load_obc_ertr::future<> {
- DEBUGDPP("reloaded obs {} for {}", dpp, md->os.oi, obc.get_oid());
- if (!md->ssc) {
- ERRORDPP("oid {} missing snapsetcontext", dpp, obc.get_oid());
- return crimson::ct_error::object_corrupted::make();
- }
- obc.set_head_state(std::move(md->os), std::move(md->ssc));
- return load_obc_ertr::now();
- });
- }
+ const hobject_t& oid = md->os.oi.soid;
+ DEBUGDPP("loaded obs {} for {}", dpp, md->os.oi, oid);
+ if (oid.is_head()) {
+ if (!md->ssc) {
+ ERRORDPP("oid {} missing snapsetcontext", dpp, oid);
+ return crimson::ct_error::object_corrupted::make();
+ }
+ obc->set_head_state(std::move(md->os),
+ std::move(md->ssc));
+ } else {
+ // we load and set the ssc only for head obc.
+ // For clones, the head's ssc will be referenced later.
+ // See set_clone_ssc
+ obc->set_clone_state(std::move(md->os));
+ }
+ DEBUGDPP("loaded obc {} for {}", dpp, obc->obs.oi, obc->obs.oi.soid);
+ return seastar::now();
+ });
+}
- void ObjectContextLoader::notify_on_change(bool is_primary)
- {
- LOG_PREFIX(ObjectContextLoader::notify_on_change);
- DEBUGDPP("is_primary: {}", dpp, is_primary);
- for (auto& obc : obc_set_accessing) {
- DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid());
- obc.interrupt(::crimson::common::actingset_changed(is_primary));
- }
+void ObjectContextLoader::notify_on_change(bool is_primary)
+{
+ LOG_PREFIX(ObjectContextLoader::notify_on_change);
+ DEBUGDPP("is_primary: {}", dpp, is_primary);
+ for (auto& obc : obc_set_accessing) {
+ DEBUGDPP("interrupting obc: {}", dpp, obc.get_oid());
+ obc.interrupt(::crimson::common::actingset_changed(is_primary));
}
-
- // explicitly instantiate the used instantiations
- template ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc<RWState::RWNONE>(hobject_t,
- with_obc_func_t&&,
- bool resolve_clone);
-
- template ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc<RWState::RWREAD>(hobject_t,
- with_obc_func_t&&,
- bool resolve_clone);
-
- template ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc<RWState::RWWRITE>(hobject_t,
- with_obc_func_t&&,
- bool resolve_clone);
-
- template ObjectContextLoader::load_obc_iertr::future<>
- ObjectContextLoader::with_obc<RWState::RWEXCL>(hobject_t,
- with_obc_func_t&&,
- bool resolve_clone);
+}
}
diff --git a/src/crimson/osd/object_context_loader.h b/src/crimson/osd/object_context_loader.h
index 277708eca4f..49f8f1572bf 100644
--- a/src/crimson/osd/object_context_loader.h
+++ b/src/crimson/osd/object_context_loader.h
@@ -1,9 +1,14 @@
#pragma once
#include <seastar/core/future.hh>
+#include <seastar/util/defer.hh>
+#include "crimson/common/coroutine.h"
#include "crimson/common/errorator.h"
+#include "crimson/common/log.h"
#include "crimson/osd/object_context.h"
+#include "crimson/osd/osd_operation.h"
#include "crimson/osd/pg_backend.h"
+#include "osd/object_state_fmt.h"
namespace crimson::osd {
class ObjectContextLoader {
@@ -29,6 +34,208 @@ public:
::crimson::osd::IOInterruptCondition,
load_obc_ertr>;
+ /**
+ * Manager
+ *
+ * Per-request holder for the head obc and the target obc of `target`,
+ * together with the lock state this request holds on each. release()
+ * drops the held locks and removes the obcs from
+ * loader.obc_set_accessing; it also runs from the destructor.
+ * Construction is private: instances come from
+ * ObjectContextLoader::get_obc_manager().
+ */
+ class Manager {
+ ObjectContextLoader &loader;
+ hobject_t target;
+
+ Manager() = delete;
+ template <typename T>
+ Manager(ObjectContextLoader &loader, T &&t)
+ : loader(loader), target(std::forward<T>(t)) {}
+ Manager(const Manager &) = delete;
+ Manager &operator=(const Manager &o) = delete;
+
+ struct options_t {
+ /// when set, load_and_lock_clone maps target through resolve_oid()
+ bool resolve_clone = true;
+ } options;
+
+ /// One obc plus the lock mode this Manager currently holds on it.
+ struct state_t {
+ RWState::State state = RWState::RWNONE;
+ ObjectContextRef obc;
+ bool is_empty() const { return !obc; }
+
+ /// Take the exclusive lock without waiting; asserts it was available.
+ void lock_excl_sync() {
+ bool locked = obc->lock.try_lock_for_excl();
+ ceph_assert(locked);
+ state = RWState::RWEXCL;
+ }
+
+ /// Convert a held exclusive lock into lock_type (RWNONE unlocks,
+ /// RWEXCL is a no-op) and record the new mode in `state`.
+ void demote_excl_to(RWState::State lock_type) {
+ assert(state == RWState::RWEXCL);
+ switch (lock_type) {
+ case RWState::RWWRITE:
+ obc->lock.demote_to_write();
+ state = RWState::RWWRITE;
+ break;
+ case RWState::RWREAD:
+ obc->lock.demote_to_read();
+ state = RWState::RWREAD;
+ break;
+ case RWState::RWNONE:
+ obc->lock.unlock_for_excl();
+ state = RWState::RWNONE;
+ break;
+ case RWState::RWEXCL:
+ //noop
+ break;
+ default:
+ ceph_assert(0 == "impossible");
+ }
+ }
+
+ /// Starting from RWNONE, asynchronously acquire lock_type; `state` is
+ /// updated only once the lock future resolves. The continuation
+ /// captures `this`, so this state_t must stay alive and in place
+ /// until the returned future completes.
+ auto lock_to(RWState::State lock_type) {
+ assert(state == RWState::RWNONE);
+ switch (lock_type) {
+ case RWState::RWWRITE:
+ return interruptor::make_interruptible(
+ obc->lock.lock_for_write().then([this] {
+ state = RWState::RWWRITE;
+ }));
+ case RWState::RWREAD:
+ return interruptor::make_interruptible(
+ obc->lock.lock_for_read().then([this] {
+ state = RWState::RWREAD;
+ }));
+ case RWState::RWNONE:
+ // noop
+ return interruptor::now();
+ case RWState::RWEXCL:
+ return interruptor::make_interruptible(
+ obc->lock.lock_for_excl().then([this] {
+ state = RWState::RWEXCL;
+ }));
+ default:
+ ceph_assert(0 == "impossible");
+ return interruptor::now();
+ }
+ }
+
+ /// Drop whatever lock `state` says is held and reset it to RWNONE.
+ void release_lock() {
+ switch (state) {
+ case RWState::RWREAD:
+ obc->lock.unlock_for_read();
+ break;
+ case RWState::RWWRITE:
+ obc->lock.unlock_for_write();
+ break;
+ case RWState::RWEXCL:
+ obc->lock.unlock_for_excl();
+ break;
+ case RWState::RWNONE:
+ // noop
+ break;
+ default:
+ ceph_assert(0 == "invalid");
+ }
+ state = RWState::RWNONE;
+ }
+ };
+ state_t head_state;
+ state_t target_state;
+
+ friend ObjectContextLoader;
+
+ /// Attach _obc to s and add it to loader.obc_set_accessing.
+ void set_state_obc(state_t &s, ObjectContextRef _obc) {
+ s.obc = std::move(_obc);
+ s.obc->append_to(loader.obc_set_accessing);
+ }
+
+ /// Undo set_state_obc plus any lock held through s; no-op if s is empty.
+ void release_state(state_t &s) {
+ LOG_PREFIX(ObjectContextLoader::release_state);
+ if (s.is_empty()) return;
+
+ s.release_lock();
+ SUBDEBUGDPP(osd, "releasing obc {}, {}", loader.dpp, *(s.obc), s.obc->obs);
+ s.obc->remove_from(loader.obc_set_accessing);
+ s = state_t();
+ }
+ public:
+ /// Move: the new object starts with default members and swaps them
+ /// with rhs, leaving rhs with empty states (nothing to release).
+ Manager(Manager &&rhs) : loader(rhs.loader) {
+ std::swap(target, rhs.target);
+ std::swap(options, rhs.options);
+ std::swap(head_state, rhs.head_state);
+ std::swap(target_state, rhs.target_state);
+ }
+
+ /// Move-assign by destroying *this (releasing anything it holds) and
+ /// move-constructing in place from o.
+ Manager &operator=(Manager &&o) {
+ // Self-assignment would destroy *this and then move-construct from the
+ // already-destroyed object; make it a no-op instead.
+ if (this == &o) return *this;
+ this->~Manager();
+ new(this) Manager(std::move(o));
+ return *this;
+ }
+
+ /// Target obc; asserts it is present and loaded.
+ ObjectContextRef &get_obc() {
+ ceph_assert(!target_state.is_empty());
+ ceph_assert(target_state.obc->is_loaded());
+ return target_state.obc;
+ }
+
+ /// Head obc; asserts it is present and loaded.
+ ObjectContextRef &get_head_obc() {
+ ceph_assert(!head_state.is_empty());
+ ceph_assert(head_state.obc->is_loaded());
+ return head_state.obc;
+ }
+
+ /// Release head first, then target.
+ void release() {
+ release_state(head_state);
+ release_state(target_state);
+ }
+
+ /// Scope guard that calls release() unless cancelled; it captures
+ /// `this`, so it must not outlive this Manager.
+ auto get_releaser() {
+ return seastar::defer([this] {
+ release();
+ });
+ }
+
+ ~Manager() {
+ release();
+ }
+ };
+
+ /**
+ * Orderer
+ *
+ * Holds a reference to a head obc (filled in by get_obc_orderer) and
+ * exposes that obc's obc_pipeline.
+ */
+ class Orderer {
+ friend ObjectContextLoader;
+ ObjectContextRef orderer_obc;
+ public:
+ CommonOBCPipeline &obc_pp() {
+ ceph_assert(orderer_obc);
+ return orderer_obc->obc_pipeline;
+ }
+
+ ~Orderer() {
+ LOG_PREFIX(ObjectContextLoader::~Orderer);
+ // A default-constructed Orderer has no obc; nothing to log and
+ // nothing safe to dereference.
+ if (!orderer_obc) return;
+ // Two placeholders need two arguments; mirror Manager::release_state.
+ SUBDEBUG(osd, "releasing obc {}, {}", *(orderer_obc), orderer_obc->obs);
+ }
+ };
+
+ /// Build an Orderer around the cached obc of oid's head object.
+ Orderer get_obc_orderer(const hobject_t &oid) {
+ auto [head_obc, _] = obc_registry.get_cached_obc(oid.get_head());
+ Orderer orderer;
+ orderer.orderer_obc = std::move(head_obc);
+ return orderer;
+ }
+
+ /// Create a Manager targeting oid; no obc is looked up or locked here.
+ Manager get_obc_manager(const hobject_t &oid, bool resolve_clone = true) {
+ Manager manager(*this, oid);
+ manager.options.resolve_clone = resolve_clone;
+ return manager;
+ }
+
+ /// As above, but seed head_state with the obc already held by orderer.
+ Manager get_obc_manager(
+ Orderer &orderer, const hobject_t &oid, bool resolve_clone = true) {
+ Manager manager(*this, oid);
+ manager.options.resolve_clone = resolve_clone;
+ manager.set_state_obc(manager.head_state, orderer.orderer_obc);
+ return manager;
+ }
+
+ using load_and_lock_ertr = load_obc_ertr;
+ using load_and_lock_iertr = interruptible::interruptible_errorator<
+ IOInterruptCondition, load_and_lock_ertr>;
+ using load_and_lock_fut = load_and_lock_iertr::future<>;
+private:
+ load_and_lock_fut load_and_lock_head(Manager &, RWState::State);
+ load_and_lock_fut load_and_lock_clone(Manager &, RWState::State, bool lock_head=true);
+public:
+ load_and_lock_fut load_and_lock(Manager &, RWState::State);
+
using interruptor = ::crimson::interruptible::interruptor<
::crimson::osd::IOInterruptCondition>;
@@ -43,8 +250,13 @@ public:
// See SnapTrimObjSubEvent::remove_or_update - in_removed_snaps_queue usage.
template<RWState::State State>
load_obc_iertr::future<> with_obc(hobject_t oid,
- with_obc_func_t&& func,
- bool resolve_clone = true);
+ with_obc_func_t func,
+ bool resolve_clone = true) {
+ // manager lives in the coroutine frame; its destructor releases the
+ // locks and obcs once func's future has been awaited (or on early exit)
+ auto manager = get_obc_manager(oid, resolve_clone);
+ co_await load_and_lock(manager, State);
+ // func receives (head obc, target obc)
+ co_await std::invoke(
+ func, manager.get_head_obc(), manager.get_obc());
+ }
// Use this variant in the case where the head object
// obc is already locked and only the clone obc is needed.
@@ -53,10 +265,20 @@ public:
template<RWState::State State>
load_obc_iertr::future<> with_clone_obc_only(ObjectContextRef head,
hobject_t clone_oid,
- with_obc_func_t&& func,
- bool resolve_clone = true);
-
- load_obc_iertr::future<> reload_obc(ObjectContext& obc) const;
+ with_obc_func_t func,
+ bool resolve_clone = true) {
+ LOG_PREFIX(ObjectContextLoader::with_clone_obc_only);
+ SUBDEBUGDPP(osd, "{}", dpp, clone_oid);
+ auto manager = get_obc_manager(clone_oid, resolve_clone);
+ // We populate head_state here with the passed obc assuming that
+ // it has been loaded and locked appropriately. We do not populate
+ // head_state.state because we won't be taking or releasing any
+ // locks on head as part of this call.
+ // (these two statements are the same steps Manager::set_state_obc performs)
+ manager.head_state.obc = head;
+ manager.head_state.obc->append_to(obc_set_accessing);
+ // lock_head=false: load_and_lock_clone asserts if head has not started
+ // loading, and otherwise leaves head's lock untouched
+ co_await load_and_lock_clone(manager, State, false);
+ co_await std::invoke(func, head, manager.get_obc());
+ }
void notify_on_change(bool is_primary);
@@ -66,24 +288,9 @@ private:
DoutPrefixProvider& dpp;
obc_accessing_list_t obc_set_accessing;
- template<RWState::State State>
- load_obc_iertr::future<> with_clone_obc(const hobject_t& oid,
- with_obc_func_t&& func,
- bool resolve_clone);
-
- template<RWState::State State>
- load_obc_iertr::future<> with_head_obc(const hobject_t& oid,
- with_obc_func_t&& func);
-
- template<RWState::State State, bool track, typename Func>
- load_obc_iertr::future<> with_locked_obc(const hobject_t& oid,
- Func&& func);
-
- template<RWState::State State>
- load_obc_iertr::future<ObjectContextRef>
- get_or_load_obc(ObjectContextRef obc,
- bool existed);
-
load_obc_iertr::future<> load_obc(ObjectContextRef obc);
};
+
+using ObjectContextManager = ObjectContextLoader::Manager;
+
}
diff --git a/src/crimson/osd/ops_executer.cc b/src/crimson/osd/ops_executer.cc
index 9bf60140374..cbc35c21a04 100644
--- a/src/crimson/osd/ops_executer.cc
+++ b/src/crimson/osd/ops_executer.cc
@@ -15,12 +15,15 @@
#include <seastar/core/thread.hh>
+#include "crimson/common/log.h"
#include "crimson/osd/exceptions.h"
#include "crimson/osd/pg.h"
#include "crimson/osd/watch.h"
#include "osd/ClassHandler.h"
#include "osd/SnapMapper.h"
+SET_SUBSYS(osd);
+
namespace {
seastar::logger& logger() {
return crimson::get_logger(ceph_subsys_osd);
@@ -464,10 +467,7 @@ auto OpsExecuter::do_const_op(Func&& f) {
template <class Func>
auto OpsExecuter::do_write_op(Func&& f, OpsExecuter::modified_by m) {
++num_write;
- if (!osd_op_params) {
- osd_op_params.emplace();
- fill_op_params(m);
- }
+ // presumably performs the lazy osd_op_params initialisation that the
+ // removed lines did inline -- confirm against check_init_op_params
+ check_init_op_params(m);
return std::forward<Func>(f)(pg->get_backend(), obc->obs, txn);
}
OpsExecuter::call_errorator::future<> OpsExecuter::do_assert_ver(
@@ -822,25 +822,100 @@ OpsExecuter::do_execute_op(OSDOp& osd_op)
}
}
-void OpsExecuter::fill_op_params(OpsExecuter::modified_by m)
+// Coroutine: finish the op. Stats are always applied; if txn is non-empty,
+// the log entries are built (clone entry first when cloning, then the head
+// entry) and the transaction is handed to pg->submit_transaction(). Returns
+// the (submitted, all_completed) future pair; both are already-ready futures
+// when nothing was mutated. snap_mapper and osdriver are not referenced in
+// this body.
+OpsExecuter::rep_op_fut_t
+OpsExecuter::flush_changes_and_submit(
+ const std::vector<OSDOp>& ops,
+ SnapMapper& snap_mapper,
+ OSDriver& osdriver)
{
- osd_op_params->req_id = msg->get_reqid();
- osd_op_params->mtime = msg->get_mtime();
- osd_op_params->at_version = pg->get_next_version();
- osd_op_params->pg_trim_to = pg->get_pg_trim_to();
- osd_op_params->min_last_complete_ondisk = pg->get_min_last_complete_ondisk();
- osd_op_params->last_complete = pg->get_info().last_complete;
- osd_op_params->user_modify = (m == modified_by::user);
+ const bool want_mutate = !txn.empty();
+ // osd_op_params are instantiated by every wr-like operation.
+ assert(osd_op_params || !want_mutate);
+ assert(obc);
+
+ auto submitted = interruptor::now();
+ auto all_completed = interruptor::now();
+
+ if (cloning_ctx) {
+ ceph_assert(want_mutate);
+ }
+
+ apply_stats();
+ if (want_mutate) {
+ osd_op_params->at_version = pg->get_next_version();
+ osd_op_params->pg_trim_to = pg->get_pg_trim_to();
+ osd_op_params->pg_committed_to = pg->get_pg_committed_to();
+ osd_op_params->last_complete = pg->get_info().last_complete;
+
+ std::vector<pg_log_entry_t> log_entries;
+
+ // complete_cloning_ctx() stamps the clone with at_version and then bumps
+ // at_version, so the head entry built next gets the following version
+ if (cloning_ctx) {
+ log_entries.emplace_back(complete_cloning_ctx());
+ }
+
+ log_entries.emplace_back(prepare_head_update(ops, txn));
+
+ // the last (head) entry must carry the final at_version
+ if (auto log_rit = log_entries.rbegin(); log_rit != log_entries.rend()) {
+ ceph_assert(log_rit->version == osd_op_params->at_version);
+ }
+
+ /*
+ * This works around the gcc bug causing the generated code to incorrectly
+ * execute unconditionally before the predicate.
+ *
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101244
+ */
+ auto clone_obc = cloning_ctx
+ ? std::move(cloning_ctx->clone_obc)
+ : nullptr;
+ // obc, txn and *osd_op_params are moved from here and must not be relied
+ // on afterwards in this function
+ auto [_submitted, _all_completed] = co_await pg->submit_transaction(
+ std::move(obc),
+ std::move(clone_obc),
+ std::move(txn),
+ std::move(*osd_op_params),
+ std::move(log_entries)
+ );
+
+ submitted = std::move(_submitted);
+ all_completed = std::move(_all_completed);
+ }
+
+ if (op_effects.size()) [[unlikely]] {
+ // need extra ref pg due to apply_stats() which can be executed after
+ // informing snap mapper
+ // NOTE(review): the continuation also captures `this` and reads
+ // op_effects, so this OpsExecuter must outlive all_completed -- confirm
+ all_completed =
+ std::move(all_completed).then_interruptible([this, pg=this->pg] {
+ // let's do the cleaning of `op_effects` in destructor
+ return interruptor::do_for_each(op_effects,
+ [pg=std::move(pg)](auto& op_effect) {
+ return op_effect->execute(pg);
+ });
+ });
+ }
+
+ co_return std::make_tuple(
+ std::move(submitted),
+ std::move(all_completed));
}
-std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction(
- const std::vector<OSDOp>& ops)
+pg_log_entry_t OpsExecuter::prepare_head_update(
+ const std::vector<OSDOp>& ops,
+ ceph::os::Transaction &txn)
{
- // let's ensure we don't need to inform SnapMapper about this particular
- // entry.
+ LOG_PREFIX(OpsExecuter::prepare_head_update);
assert(obc->obs.oi.soid.snap >= CEPH_MAXSNAP);
- std::vector<pg_log_entry_t> log_entries;
- log_entries.emplace_back(
+
+ update_clone_overlap();
+ if (cloning_ctx) {
+ obc->ssc->snapset = std::move(cloning_ctx->new_snapset);
+ }
+ if (snapc.seq > obc->ssc->snapset.seq) {
+ // update snapset with latest snap context
+ obc->ssc->snapset.seq = snapc.seq;
+ obc->ssc->snapset.snaps.clear();
+ }
+
+ pg_log_entry_t ret{
obc->obs.exists ?
pg_log_entry_t::MODIFY : pg_log_entry_t::DELETE,
obc->obs.oi.soid,
@@ -849,15 +924,38 @@ std::vector<pg_log_entry_t> OpsExecuter::prepare_transaction(
osd_op_params->user_modify ? osd_op_params->at_version.version : 0,
osd_op_params->req_id,
osd_op_params->mtime,
- op_info.allows_returnvec() && !ops.empty() ? ops.back().rval.code : 0);
+ op_info.allows_returnvec() && !ops.empty() ? ops.back().rval.code : 0};
+
if (op_info.allows_returnvec()) {
// also the per-op values are recorded in the pg log
- log_entries.back().set_op_returns(ops);
- logger().debug("{} op_returns: {}",
- __func__, log_entries.back().op_returns);
+ ret.set_op_returns(ops);
+ DEBUGDPP("op returns: {}", *pg, ret.op_returns);
+ }
+ ret.clean_regions = std::move(osd_op_params->clean_regions);
+
+
+ if (obc->obs.exists) {
+ obc->obs.oi.prior_version = obc->obs.oi.version;
+ obc->obs.oi.version = osd_op_params->at_version;
+ if (osd_op_params->user_modify)
+ obc->obs.oi.user_version = osd_op_params->at_version.version;
+ obc->obs.oi.last_reqid = osd_op_params->req_id;
+ obc->obs.oi.mtime = osd_op_params->mtime;
+ obc->obs.oi.local_mtime = ceph_clock_now();
+
+ obc->ssc->exists = true;
+ pg->get_backend().set_metadata(
+ obc->obs.oi.soid,
+ obc->obs.oi,
+ obc->obs.oi.soid.is_head() ? &(obc->ssc->snapset) : nullptr,
+ txn);
+ } else {
+ // reset cached ObjectState without enforcing eviction
+ obc->obs.oi = object_info_t(obc->obs.oi.soid);
}
- log_entries.back().clean_regions = std::move(osd_op_params->clean_regions);
- return log_entries;
+
+ DEBUGDPP("entry: {}", *pg, ret);
+ return ret;
}
// Defined here because there is a circular dependency between OpsExecuter and PG
@@ -871,25 +969,26 @@ version_t OpsExecuter::get_last_user_version() const
return pg->get_last_user_version();
}
-std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone(
+void OpsExecuter::prepare_cloning_ctx(
const SnapContext& snapc,
const ObjectState& initial_obs,
const SnapSet& initial_snapset,
PGBackend& backend,
ceph::os::Transaction& txn)
{
+ LOG_PREFIX(OpsExecuter::prepare_cloning_ctx);
const hobject_t& soid = initial_obs.oi.soid;
logger().debug("{} {} snapset={} snapc={}",
__func__, soid,
initial_snapset, snapc);
- auto cloning_ctx = std::make_unique<CloningContext>();
+ cloning_ctx = std::make_unique<CloningContext>();
cloning_ctx->new_snapset = initial_snapset;
// clone object, the snap field is set to the seq of the SnapContext
// at its creation.
- hobject_t coid = soid;
- coid.snap = snapc.seq;
+ cloning_ctx->coid = soid;
+ cloning_ctx->coid.snap = snapc.seq;
// existing snaps are stored in descending order in snapc,
// cloned_snaps vector will hold all the snaps stored until snapset.seq
@@ -900,48 +999,63 @@ std::unique_ptr<OpsExecuter::CloningContext> OpsExecuter::execute_clone(
return std::vector<snapid_t>{std::begin(snapc.snaps), last};
}();
- auto clone_obc = prepare_clone(coid, osd_op_params->at_version);
- osd_op_params->at_version.version++;
+ // make clone here, but populate in metadata in complete_cloning_ctx
+ backend.clone_for_write(soid, cloning_ctx->coid, txn);
- // make clone
- backend.clone(clone_obc->obs.oi, initial_obs, clone_obc->obs, txn);
+ cloning_ctx->clone_obc = prepare_clone(cloning_ctx->coid, initial_obs);
delta_stats.num_objects++;
- if (clone_obc->obs.oi.is_omap()) {
+ if (cloning_ctx->clone_obc->obs.oi.is_omap()) {
delta_stats.num_objects_omap++;
}
delta_stats.num_object_clones++;
// newsnapset is obc's ssc
- cloning_ctx->new_snapset.clones.push_back(coid.snap);
- cloning_ctx->new_snapset.clone_size[coid.snap] = initial_obs.oi.size;
- cloning_ctx->new_snapset.clone_snaps[coid.snap] = cloned_snaps;
+ cloning_ctx->new_snapset.clones.push_back(cloning_ctx->coid.snap);
+ cloning_ctx->new_snapset.clone_size[cloning_ctx->coid.snap] = initial_obs.oi.size;
+ cloning_ctx->new_snapset.clone_snaps[cloning_ctx->coid.snap] = cloned_snaps;
// clone_overlap should contain an entry for each clone
// (an empty interval_set if there is no overlap)
- auto &overlap = cloning_ctx->new_snapset.clone_overlap[coid.snap];
+ auto &overlap = cloning_ctx->new_snapset.clone_overlap[cloning_ctx->coid.snap];
if (initial_obs.oi.size) {
overlap.insert(0, initial_obs.oi.size);
}
// log clone
- logger().debug("cloning v {} to {} v {} snaps={} snapset={}",
- initial_obs.oi.version, coid,
- osd_op_params->at_version, cloned_snaps, cloning_ctx->new_snapset);
+ DEBUGDPP("cloning v {} to {} v {} snaps={} snapset={}", *pg,
+ initial_obs.oi.version, cloning_ctx->coid,
+ osd_op_params->at_version, cloned_snaps, cloning_ctx->new_snapset);
+}
- cloning_ctx->log_entry = {
+// Second half of cloning (see prepare_cloning_ctx): stamp the clone obc with
+// the current at_version, advance at_version by one, queue the clone's
+// metadata into txn and return the CLONE log entry.
+pg_log_entry_t OpsExecuter::complete_cloning_ctx()
+{
+ ceph_assert(cloning_ctx);
+ const auto &coid = cloning_ctx->coid;
+ cloning_ctx->clone_obc->obs.oi.version = osd_op_params->at_version;
+
+ // the clone consumed this version; whatever is logged next uses the
+ // following one
+ osd_op_params->at_version.version++;
+
+ pg->get_backend().set_metadata(
+ cloning_ctx->coid,
+ cloning_ctx->clone_obc->obs.oi,
+ nullptr /* snapset */,
+ txn);
+
+ pg_log_entry_t ret{
pg_log_entry_t::CLONE,
coid,
- clone_obc->obs.oi.version,
- clone_obc->obs.oi.prior_version,
- clone_obc->obs.oi.user_version,
+ cloning_ctx->clone_obc->obs.oi.version,
+ cloning_ctx->clone_obc->obs.oi.prior_version,
+ cloning_ctx->clone_obc->obs.oi.user_version,
osd_reqid_t(),
- clone_obc->obs.oi.mtime, // will be replaced in `apply_to()`
+ cloning_ctx->clone_obc->obs.oi.mtime, // same value is re-assigned to ret.mtime below
0
};
- encode(cloned_snaps, cloning_ctx->log_entry.snaps);
- cloning_ctx->log_entry.clean_regions.mark_data_region_dirty(0, initial_obs.oi.size);
-
- return cloning_ctx;
+ // prepare_cloning_ctx recorded the snaps covered by this clone in
+ // new_snapset.clone_snaps[coid.snap]
+ ceph_assert(cloning_ctx->new_snapset.clone_snaps.count(coid.snap));
+ encode(cloning_ctx->new_snapset.clone_snaps[coid.snap], ret.snaps);
+ ret.clean_regions.mark_data_region_dirty(0, cloning_ctx->clone_obc->obs.oi.size);
+ ret.mtime = cloning_ctx->clone_obc->obs.oi.mtime;
+ return ret;
}
void OpsExecuter::update_clone_overlap() {
@@ -964,47 +1078,16 @@ void OpsExecuter::update_clone_overlap() {
delta_stats.num_bytes += osd_op_params->modified_ranges.size();
}
-void OpsExecuter::CloningContext::apply_to(
- std::vector<pg_log_entry_t>& log_entries,
- ObjectContext& processed_obc) &&
-{
- log_entry.mtime = processed_obc.obs.oi.mtime;
- log_entries.insert(log_entries.begin(), std::move(log_entry));
- processed_obc.ssc->snapset = std::move(new_snapset);
-}
-
-std::vector<pg_log_entry_t>
-OpsExecuter::flush_clone_metadata(
- std::vector<pg_log_entry_t>&& log_entries,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- ceph::os::Transaction& txn)
-{
- assert(!txn.empty());
- update_clone_overlap();
- if (cloning_ctx) {
- std::move(*cloning_ctx).apply_to(log_entries, *obc);
- }
- if (snapc.seq > obc->ssc->snapset.seq) {
- // update snapset with latest snap context
- obc->ssc->snapset.seq = snapc.seq;
- obc->ssc->snapset.snaps.clear();
- }
- logger().debug("{} done, initial snapset={}, new snapset={}",
- __func__, obc->obs.oi.soid, obc->ssc->snapset);
- return std::move(log_entries);
-}
-
ObjectContextRef OpsExecuter::prepare_clone(
const hobject_t& coid,
- eversion_t version)
+ const ObjectState& initial_obs)
{
ceph_assert(pg->is_primary());
ObjectState clone_obs{coid};
clone_obs.exists = true;
- clone_obs.oi.version = version;
- clone_obs.oi.prior_version = obc->obs.oi.version;
- clone_obs.oi.copy_user_bits(obc->obs.oi);
+ // clone_obs.oi.version will be populated in complete_cloning_ctx
+ clone_obs.oi.prior_version = initial_obs.oi.version;
+ clone_obs.oi.copy_user_bits(initial_obs.oi);
clone_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
auto [clone_obc, existed] = pg->obc_registry.get_cached_obc(std::move(coid));
@@ -1035,11 +1118,12 @@ OpsExecuter::OpsExecuter(Ref<PG> pg,
{
if (op_info.may_write() && should_clone(*obc, snapc)) {
do_write_op([this](auto& backend, auto& os, auto& txn) {
- cloning_ctx = execute_clone(std::as_const(snapc),
- std::as_const(obc->obs),
- std::as_const(obc->ssc->snapset),
- backend,
- txn);
+ prepare_cloning_ctx(
+ std::as_const(snapc),
+ std::as_const(obc->obs),
+ std::as_const(obc->ssc->snapset),
+ backend,
+ txn);
});
}
}
diff --git a/src/crimson/osd/ops_executer.h b/src/crimson/osd/ops_executer.h
index e770e825b32..f5554bd6919 100644
--- a/src/crimson/osd/ops_executer.h
+++ b/src/crimson/osd/ops_executer.h
@@ -195,25 +195,26 @@ private:
SnapContext snapc; // writer snap context
struct CloningContext {
+ /// id of new clone, populated in prepare_cloning_ctx
+ hobject_t coid;
+ /// new snapset, populated in prepare_cloning_ctx
SnapSet new_snapset;
- pg_log_entry_t log_entry;
-
- void apply_to(
- std::vector<pg_log_entry_t>& log_entries,
- ObjectContext& processed_obc) &&;
+ /// populated in complete_cloning_ctx
+ ObjectContextRef clone_obc;
};
std::unique_ptr<CloningContext> cloning_ctx;
-
/**
- * execute_clone
+ * prepare_cloning_ctx
*
* If snapc contains a snap which occurred logically after the last write
* seen by this object (see OpsExecuter::should_clone()), we first need
- * make a clone of the object at its current state. execute_clone primes
- * txn with that clone operation and returns an
- * OpsExecuter::CloningContext which will allow us to fill in the corresponding
- * metadata and log_entries once the operations have been processed.
+ * to make a clone of the object at its current state. prepare_cloning_ctx
+ * primes txn with that clone operation and populates cloning_ctx with
+ * an obc for the clone and a new snapset reflecting the clone.
+ *
+ * complete_cloning_ctx later uses the information from cloning_ctx to
+ * generate a log entry and object_info versions for the clone.
*
* Note that this strategy differs from classic, which instead performs this
* work at the end and reorders the transaction. See
@@ -226,13 +227,15 @@ private:
* @param backend [in,out] interface for generating mutations
* @param txn [out] transaction for the operation
*/
- std::unique_ptr<CloningContext> execute_clone(
+ void prepare_cloning_ctx(
const SnapContext& snapc,
const ObjectState& initial_obs,
const SnapSet& initial_snapset,
PGBackend& backend,
ceph::os::Transaction& txn);
+ /// complete clone, populate clone_obc, return log entry
+ pg_log_entry_t complete_cloning_ctx();
/**
* should_clone
@@ -263,12 +266,6 @@ private:
*/
void update_clone_overlap();
- std::vector<pg_log_entry_t> flush_clone_metadata(
- std::vector<pg_log_entry_t>&& log_entries,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- ceph::os::Transaction& txn);
-
private:
// this gizmo could be wrapped in std::optional for the sake of lazy
// initialization. we don't need it for ops that doesn't have effect
@@ -399,15 +396,22 @@ public:
std::tuple<interruptible_future<>, interruptible_future<>>;
using rep_op_fut_t =
interruptible_future<rep_op_fut_tuple>;
- template <typename MutFunc>
- rep_op_fut_t flush_changes_n_do_ops_effects(
+ rep_op_fut_t flush_changes_and_submit(
const std::vector<OSDOp>& ops,
SnapMapper& snap_mapper,
- OSDriver& osdriver,
- MutFunc mut_func) &&;
- std::vector<pg_log_entry_t> prepare_transaction(
- const std::vector<OSDOp>& ops);
- void fill_op_params(modified_by m);
+ OSDriver& osdriver);
+ pg_log_entry_t prepare_head_update(
+ const std::vector<OSDOp>& ops,
+ ceph::os::Transaction &txn);
+
+ void check_init_op_params(OpsExecuter::modified_by m) { // creates osd_op_params on first call; later calls are no-ops
+ if (!osd_op_params) {
+ osd_op_params.emplace();
+ osd_op_params->req_id = msg->get_reqid(); // request id and mtime are copied from the client message
+ osd_op_params->mtime = msg->get_mtime();
+ osd_op_params->user_modify = (m == modified_by::user); // true only when m is modified_by::user
+ }
+ }
ObjectContextRef get_obc() const {
return obc;
@@ -442,7 +446,7 @@ public:
ObjectContextRef prepare_clone(
const hobject_t& coid,
- eversion_t version);
+ const ObjectState& initial_obs);
void apply_stats();
};
@@ -484,67 +488,6 @@ auto OpsExecuter::with_effect_on_obc(
return std::forward<MainFunc>(main_func)(ctx_ref);
}
-template <typename MutFunc>
-OpsExecuter::rep_op_fut_t
-OpsExecuter::flush_changes_n_do_ops_effects(
- const std::vector<OSDOp>& ops,
- SnapMapper& snap_mapper,
- OSDriver& osdriver,
- MutFunc mut_func) &&
-{
- const bool want_mutate = !txn.empty();
- // osd_op_params are instantiated by every wr-like operation.
- assert(osd_op_params || !want_mutate);
- assert(obc);
-
- auto submitted = interruptor::now();
- auto all_completed = interruptor::now();
-
- if (cloning_ctx) {
- ceph_assert(want_mutate);
- }
-
- if (want_mutate) {
- auto log_entries = flush_clone_metadata(
- prepare_transaction(ops),
- snap_mapper,
- osdriver,
- txn);
-
- if (auto log_rit = log_entries.rbegin(); log_rit != log_entries.rend()) {
- ceph_assert(log_rit->version == osd_op_params->at_version);
- }
-
- auto [_submitted, _all_completed] = co_await mut_func(
- std::move(txn),
- std::move(obc),
- std::move(*osd_op_params),
- std::move(log_entries));
-
- submitted = std::move(_submitted);
- all_completed = std::move(_all_completed);
- }
-
- apply_stats();
-
- if (op_effects.size()) [[unlikely]] {
- // need extra ref pg due to apply_stats() which can be executed after
- // informing snap mapper
- all_completed =
- std::move(all_completed).then_interruptible([this, pg=this->pg] {
- // let's do the cleaning of `op_effects` in destructor
- return interruptor::do_for_each(op_effects,
- [pg=std::move(pg)](auto& op_effect) {
- return op_effect->execute(pg);
- });
- });
- }
-
- co_return std::make_tuple(
- std::move(submitted),
- std::move(all_completed));
-}
-
template <class Func>
struct OpsExecuter::RollbackHelper {
void rollback_obc_if_modified();
diff --git a/src/crimson/osd/osd.cc b/src/crimson/osd/osd.cc
index 34ad97ceb06..0f19bfd7145 100644
--- a/src/crimson/osd/osd.cc
+++ b/src/crimson/osd/osd.cc
@@ -504,6 +504,8 @@ seastar::future<> OSD::start()
}).then_unpack([this] {
return _add_me_to_crush();
}).then([this] {
+ return _add_device_class();
+ }).then([this] {
monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
monc->sub_want("mgrmap", 0, 0);
monc->sub_want("osdmap", 0, 0);
@@ -608,6 +610,38 @@ seastar::future<> OSD::_send_boot()
return monc->send_message(std::move(m));
}
+seastar::future<> OSD::_add_device_class() // chained in OSD::start() right after _add_me_to_crush()
+{
+ LOG_PREFIX(OSD::_add_device_class);
+ if (!local_conf().get_val<bool>("osd_class_update_on_start")) { // config option disables the update entirely
+ co_return;
+ }
+
+ std::string device_class = co_await store.get_default_device_class();
+ if (device_class.empty()) { // store reported no class: warn and skip, not an error
+ WARN("Device class is empty; skipping crush update.");
+ co_return;
+ }
+
+ INFO("device_class is {} ", device_class);
+
+ std::string cmd = fmt::format( // JSON command text; {{ and }} are fmt escapes for literal braces
+ R"({{"prefix": "osd crush set-device-class", "class": "{}", "ids": ["{}"]}})",
+ device_class, stringify(whoami)
+ );
+
+ auto [code, message, out] = co_await monc->run_command(std::move(cmd), {});
+ if (code) { // non-zero return code: log, then fail this future by throwing
+ // to be caught by crimson/osd/main.cc
+ WARN("fail to set device_class : {} ({})", message, code);
+ throw std::runtime_error("fail to set device_class");
+ } else {
+ INFO("device_class was set: {}", message);
+ }
+
+ co_return;
+}
+
seastar::future<> OSD::_add_me_to_crush()
{
LOG_PREFIX(OSD::_add_me_to_crush);
diff --git a/src/crimson/osd/osd.h b/src/crimson/osd/osd.h
index d7d54d5d2c3..1a84ccd6a3f 100644
--- a/src/crimson/osd/osd.h
+++ b/src/crimson/osd/osd.h
@@ -188,6 +188,7 @@ private:
seastar::future<> _preboot(version_t oldest_osdmap, version_t newest_osdmap);
seastar::future<> _send_boot();
seastar::future<> _add_me_to_crush();
+ seastar::future<> _add_device_class();
seastar::future<> osdmap_subscribe(version_t epoch, bool force_request);
diff --git a/src/crimson/osd/osd_operation.h b/src/crimson/osd/osd_operation.h
index fd8b049c0bf..394375c1129 100644
--- a/src/crimson/osd/osd_operation.h
+++ b/src/crimson/osd/osd_operation.h
@@ -50,24 +50,36 @@ struct PGPeeringPipeline {
};
struct CommonPGPipeline {
- struct WaitForActive : OrderedExclusivePhaseT<WaitForActive> {
- static constexpr auto type_name = "CommonPGPipeline:::wait_for_active";
- } wait_for_active;
- struct RecoverMissing : OrderedConcurrentPhaseT<RecoverMissing> {
- static constexpr auto type_name = "CommonPGPipeline::recover_missing";
- } recover_missing;
- struct CheckAlreadyCompleteGetObc : OrderedExclusivePhaseT<CheckAlreadyCompleteGetObc> {
- static constexpr auto type_name = "CommonPGPipeline::check_already_complete_get_obc";
- } check_already_complete_get_obc;
- struct LockOBC : OrderedConcurrentPhaseT<LockOBC> {
- static constexpr auto type_name = "CommonPGPipeline::lock_obc";
- } lock_obc;
+ struct WaitPGReady : OrderedConcurrentPhaseT<WaitPGReady> { // concurrent phase; ClientRequest waits for the osdmap epoch and PG activation while in it
+ static constexpr auto type_name = "CommonPGPipeline::wait_pg_ready"; // "::" separator, consistent with the other phase names
+ } wait_pg_ready;
+ struct GetOBC : OrderedExclusivePhaseT<GetOBC> { // exclusive phase; ClientRequest enters it after the wait_for_active blocker
+ static constexpr auto type_name = "CommonPGPipeline::get_obc"; // "::" separator, consistent with the other phase names
+ } get_obc;
+};
+
+struct PGRepopPipeline { // three phases, declared in the order process, wait_commit, send_reply
+ struct Process : OrderedExclusivePhaseT<Process> {
+ static constexpr auto type_name = "PGRepopPipeline::process";
+ } process;
+ struct WaitCommit : OrderedConcurrentPhaseT<WaitCommit> { // the only concurrent phase of the three
+ static constexpr auto type_name = "PGRepopPipeline::wait_commit"; // matches the member name, like every other phase
+ } wait_commit;
+ struct SendReply : OrderedExclusivePhaseT<SendReply> {
+ static constexpr auto type_name = "PGRepopPipeline::send_reply";
+ } send_reply;
+};
+
+struct CommonOBCPipeline {
struct Process : OrderedExclusivePhaseT<Process> {
- static constexpr auto type_name = "CommonPGPipeline::process";
+ static constexpr auto type_name = "CommonOBCPipeline::process";
} process;
struct WaitRepop : OrderedConcurrentPhaseT<WaitRepop> {
- static constexpr auto type_name = "ClientRequest::PGPipeline::wait_repop";
+ static constexpr auto type_name = "CommonOBCPipeline::wait_repop";
} wait_repop;
+ struct SendReply : OrderedExclusivePhaseT<SendReply> {
+ static constexpr auto type_name = "CommonOBCPipeline::send_reply";
+ } send_reply;
};
@@ -205,6 +217,9 @@ protected:
public:
static constexpr bool is_trackable = true;
+ virtual bool requires_pg() const { // default answer is true; virtual, so derived operation types may override it
+ return true;
+ }
};
template <class T>
@@ -326,6 +341,18 @@ public:
with_throttle_while(std::forward<Args>(args)...), *this);
}
+ // Returns std::nullopt if the throttle was acquired immediately; otherwise
+ // returns the future produced by acquire_throttle(params).
+ std::optional<seastar::future<>>
+ try_acquire_throttle_now(crimson::osd::scheduler::params_t params) {
+ if (!max_in_progress || in_progress < max_in_progress) { // max_in_progress == 0 always takes this immediate path
+ ++in_progress;
+ --pending; // NOTE(review): presumably balances a ++pending done when the op was queued; confirm against the caller
+ return std::nullopt;
+ }
+ return acquire_throttle(params);
+ }
+
private:
void dump_detail(Formatter *f) const final;
diff --git a/src/crimson/osd/osd_operation_external_tracking.h b/src/crimson/osd/osd_operation_external_tracking.h
index d2786a95e4d..6a2d7e3ccbd 100644
--- a/src/crimson/osd/osd_operation_external_tracking.h
+++ b/src/crimson/osd/osd_operation_external_tracking.h
@@ -25,24 +25,23 @@ struct LttngBackend
ConnectionPipeline::AwaitMap::BlockingEvent::Backend,
ConnectionPipeline::GetPGMapping::BlockingEvent::Backend,
PerShardPipeline::CreateOrWaitPG::BlockingEvent::Backend,
+ CommonPGPipeline::WaitPGReady::BlockingEvent::Backend,
+ CommonPGPipeline::WaitPGReady::BlockingEvent::ExitBarrierEvent::Backend,
+ CommonPGPipeline::GetOBC::BlockingEvent::Backend,
OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
PGMap::PGCreationBlockingEvent::Backend,
- ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend,
PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
- ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend,
PGActivationBlocker::BlockingEvent::Backend,
scrub::PGScrubber::BlockingEvent::Backend,
- ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend,
- ClientRequest::PGPipeline::RecoverMissing::
- BlockingEvent::ExitBarrierEvent::Backend,
- ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend,
- ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend,
- ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend,
- ClientRequest::PGPipeline::Process::BlockingEvent::Backend,
- ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend,
- ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend,
- ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend,
- ClientRequest::CompletionEvent::Backend
+ ClientRequest::CompletionEvent::Backend,
+ CommonOBCPipeline::Process::BlockingEvent::Backend,
+ CommonOBCPipeline::WaitRepop::BlockingEvent::Backend,
+ CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend,
+ CommonOBCPipeline::SendReply::BlockingEvent::Backend,
+ PGRepopPipeline::Process::BlockingEvent::Backend,
+ PGRepopPipeline::WaitCommit::BlockingEvent::Backend,
+ PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent::Backend,
+ PGRepopPipeline::SendReply::BlockingEvent::Backend
{
void handle(ClientRequest::StartEvent&,
const Operation&) override {}
@@ -72,24 +71,28 @@ struct LttngBackend
const PerShardPipeline::CreateOrWaitPG& blocker) override {
}
- void handle(PGMap::PGCreationBlockingEvent&,
- const Operation&,
- const PGMap::PGCreationBlocker&) override {
+ void handle(CommonPGPipeline::WaitPGReady::BlockingEvent& ev,
+ const Operation& op,
+ const CommonPGPipeline::WaitPGReady& blocker) override {
}
- void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev,
+ void handle(CommonPGPipeline::WaitPGReady::BlockingEvent::ExitBarrierEvent& ev,
+ const Operation& op) override {
+ }
+
+ void handle(CommonPGPipeline::GetOBC::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::AwaitMap& blocker) override {
+ const CommonPGPipeline::GetOBC& blocker) override {
}
- void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ void handle(PGMap::PGCreationBlockingEvent&,
const Operation&,
- const PG_OSDMapGate::OSDMapBlocker&) override {
+ const PGMap::PGCreationBlocker&) override {
}
- void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev,
- const Operation& op,
- const ClientRequest::PGPipeline::WaitForActive& blocker) override {
+ void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const PG_OSDMapGate::OSDMapBlocker&) override {
}
void handle(PGActivationBlocker::BlockingEvent& ev,
@@ -102,51 +105,47 @@ struct LttngBackend
const scrub::PGScrubber& blocker) override {
}
- void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev,
+ void handle(CommonOBCPipeline::Process::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::RecoverMissing& blocker) override {
+ const CommonOBCPipeline::Process& blocker) override {
}
- void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::ExitBarrierEvent& ev,
- const Operation& op) override {
- }
-
- void handle(ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent& ev,
+ void handle(CommonOBCPipeline::WaitRepop::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override {
+ const CommonOBCPipeline::WaitRepop& blocker) override {
}
-
- void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev,
- const Operation& op,
- const ClientRequest::PGPipeline::LockOBC& blocker) override {
+ void handle(CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev,
+ const Operation& op) override {
}
- void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent& ev,
- const Operation& op) override {
+ void handle(CommonOBCPipeline::SendReply::BlockingEvent& ev,
+ const Operation& op,
+ const CommonOBCPipeline::SendReply& blocker) override {
}
- void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev,
+ void handle(PGRepopPipeline::Process::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::Process& blocker) override {
+ const PGRepopPipeline::Process& blocker) override {
}
- void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev,
+ void handle(PGRepopPipeline::WaitCommit::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::WaitRepop& blocker) override {
+ const PGRepopPipeline::WaitCommit& blocker) override {
}
- void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev,
+ void handle(PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent& ev,
const Operation& op) override {
}
- void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev,
+ void handle(PGRepopPipeline::SendReply::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::SendReply& blocker) override {
+ const PGRepopPipeline::SendReply& blocker) override {
}
void handle(ClientRequest::CompletionEvent&,
const Operation&) override {}
+
};
struct HistoricBackend
@@ -155,24 +154,23 @@ struct HistoricBackend
ConnectionPipeline::AwaitMap::BlockingEvent::Backend,
ConnectionPipeline::GetPGMapping::BlockingEvent::Backend,
PerShardPipeline::CreateOrWaitPG::BlockingEvent::Backend,
+ CommonPGPipeline::WaitPGReady::BlockingEvent::Backend,
+ CommonPGPipeline::WaitPGReady::BlockingEvent::ExitBarrierEvent::Backend,
+ CommonPGPipeline::GetOBC::BlockingEvent::Backend,
OSD_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
PGMap::PGCreationBlockingEvent::Backend,
- ClientRequest::PGPipeline::AwaitMap::BlockingEvent::Backend,
PG_OSDMapGate::OSDMapBlocker::BlockingEvent::Backend,
- ClientRequest::PGPipeline::WaitForActive::BlockingEvent::Backend,
PGActivationBlocker::BlockingEvent::Backend,
scrub::PGScrubber::BlockingEvent::Backend,
- ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::Backend,
- ClientRequest::PGPipeline::RecoverMissing::
- BlockingEvent::ExitBarrierEvent::Backend,
- ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent::Backend,
- ClientRequest::PGPipeline::LockOBC::BlockingEvent::Backend,
- ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent::Backend,
- ClientRequest::PGPipeline::Process::BlockingEvent::Backend,
- ClientRequest::PGPipeline::WaitRepop::BlockingEvent::Backend,
- ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend,
- ClientRequest::PGPipeline::SendReply::BlockingEvent::Backend,
- ClientRequest::CompletionEvent::Backend
+ ClientRequest::CompletionEvent::Backend,
+ CommonOBCPipeline::Process::BlockingEvent::Backend,
+ CommonOBCPipeline::WaitRepop::BlockingEvent::Backend,
+ CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent::Backend,
+ CommonOBCPipeline::SendReply::BlockingEvent::Backend,
+ PGRepopPipeline::Process::BlockingEvent::Backend,
+ PGRepopPipeline::WaitCommit::BlockingEvent::Backend,
+ PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent::Backend,
+ PGRepopPipeline::SendReply::BlockingEvent::Backend
{
void handle(ClientRequest::StartEvent&,
const Operation&) override {}
@@ -202,24 +200,28 @@ struct HistoricBackend
const PerShardPipeline::CreateOrWaitPG& blocker) override {
}
- void handle(PGMap::PGCreationBlockingEvent&,
- const Operation&,
- const PGMap::PGCreationBlocker&) override {
+ void handle(CommonPGPipeline::WaitPGReady::BlockingEvent& ev,
+ const Operation& op,
+ const CommonPGPipeline::WaitPGReady& blocker) override {
+ }
+
+ void handle(CommonPGPipeline::WaitPGReady::BlockingEvent::ExitBarrierEvent& ev,
+ const Operation& op) override {
}
- void handle(ClientRequest::PGPipeline::AwaitMap::BlockingEvent& ev,
+ void handle(CommonPGPipeline::GetOBC::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::AwaitMap& blocker) override {
+ const CommonPGPipeline::GetOBC& blocker) override {
}
- void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ void handle(PGMap::PGCreationBlockingEvent&,
const Operation&,
- const PG_OSDMapGate::OSDMapBlocker&) override {
+ const PGMap::PGCreationBlocker&) override {
}
- void handle(ClientRequest::PGPipeline::WaitForActive::BlockingEvent& ev,
- const Operation& op,
- const ClientRequest::PGPipeline::WaitForActive& blocker) override {
+ void handle(PG_OSDMapGate::OSDMapBlocker::BlockingEvent&,
+ const Operation&,
+ const PG_OSDMapGate::OSDMapBlocker&) override {
}
void handle(PGActivationBlocker::BlockingEvent& ev,
@@ -232,55 +234,52 @@ struct HistoricBackend
const scrub::PGScrubber& blocker) override {
}
- void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent& ev,
- const Operation& op,
- const ClientRequest::PGPipeline::RecoverMissing& blocker) override {
- }
-
- void handle(ClientRequest::PGPipeline::RecoverMissing::BlockingEvent::ExitBarrierEvent& ev,
- const Operation& op) override {
+ static const ClientRequest& to_client_request(const Operation& op) { // downcasts op to ClientRequest
+#ifdef NDEBUG
+ return static_cast<const ClientRequest&>(op); // NDEBUG builds: unchecked cast
+#else
+ return dynamic_cast<const ClientRequest&>(op); // otherwise: checked cast, throws std::bad_cast if op is not a ClientRequest
+#endif
}
- void handle(ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent& ev,
+ void handle(CommonOBCPipeline::Process::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::CheckAlreadyCompleteGetObc& blocker) override {
+ const CommonOBCPipeline::Process& blocker) override {
}
- void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent& ev,
+ void handle(CommonOBCPipeline::WaitRepop::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::LockOBC& blocker) override {
+ const CommonOBCPipeline::WaitRepop& blocker) override {
}
- void handle(ClientRequest::PGPipeline::LockOBC::BlockingEvent::ExitBarrierEvent& ev,
+ void handle(CommonOBCPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev,
const Operation& op) override {
}
- void handle(ClientRequest::PGPipeline::Process::BlockingEvent& ev,
+ void handle(CommonOBCPipeline::SendReply::BlockingEvent& ev,
+ const Operation& op,
+ const CommonOBCPipeline::SendReply& blocker) override {
+ }
+
+ void handle(PGRepopPipeline::Process::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::Process& blocker) override {
+ const PGRepopPipeline::Process& blocker) override {
}
- void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent& ev,
+ void handle(PGRepopPipeline::WaitCommit::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::WaitRepop& blocker) override {
+ const PGRepopPipeline::WaitCommit& blocker) override {
}
- void handle(ClientRequest::PGPipeline::WaitRepop::BlockingEvent::ExitBarrierEvent& ev,
+ void handle(PGRepopPipeline::WaitCommit::BlockingEvent::ExitBarrierEvent& ev,
const Operation& op) override {
}
- void handle(ClientRequest::PGPipeline::SendReply::BlockingEvent& ev,
+ void handle(PGRepopPipeline::SendReply::BlockingEvent& ev,
const Operation& op,
- const ClientRequest::PGPipeline::SendReply& blocker) override {
+ const PGRepopPipeline::SendReply& blocker) override {
}
- static const ClientRequest& to_client_request(const Operation& op) {
-#ifdef NDEBUG
- return static_cast<const ClientRequest&>(op);
-#else
- return dynamic_cast<const ClientRequest&>(op);
-#endif
- }
void handle(ClientRequest::CompletionEvent&, const Operation& op) override {
if (crimson::common::local_conf()->osd_op_history_size) {
diff --git a/src/crimson/osd/osd_operations/client_request.cc b/src/crimson/osd/osd_operations/client_request.cc
index a89fb2c84bc..fcd0f318db2 100644
--- a/src/crimson/osd/osd_operations/client_request.cc
+++ b/src/crimson/osd/osd_operations/client_request.cc
@@ -14,6 +14,7 @@
#include "crimson/osd/osd_operations/client_request.h"
#include "crimson/osd/osd_connection_priv.h"
#include "osd/object_state_fmt.h"
+#include "osd/osd_perf_counters.h"
SET_SUBSYS(osd);
@@ -42,15 +43,17 @@ void ClientRequest::Orderer::clear_and_cancel(PG &pg)
{
LOG_PREFIX(ClientRequest::Orderer::clear_and_cancel);
for (auto i = list.begin(); i != list.end(); ) {
- DEBUGDPP("{}", pg, *i);
- i->complete_request();
- remove_request(*(i++));
+ auto &req = *i;
+ DEBUGDPP("{}", pg, req);
+ ++i;
+ req.complete_request(pg);
}
}
-void ClientRequest::complete_request()
+void ClientRequest::complete_request(PG &pg)
{
track_event<CompletionEvent>();
+ pg.client_request_orderer.remove_request(*this);
on_complete.set_value();
}
@@ -98,7 +101,7 @@ PerShardPipeline &ClientRequest::get_pershard_pipeline(
return shard_services.get_client_request_pipeline();
}
-ClientRequest::PGPipeline &ClientRequest::client_pp(PG &pg)
+CommonPGPipeline &ClientRequest::client_pp(PG &pg)
{
return pg.request_pg_pipeline;
}
@@ -137,12 +140,20 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
DEBUGDPP("{} start", *pgref, *this);
PG &pg = *pgref;
+
+ DEBUGDPP("{}.{}: entering wait_pg_ready stage",
+ *pgref, *this, this_instance_id);
+ // The prior stage is OrderedExclusive (PerShardPipeline::create_or_wait_pg)
+ // and wait_pg_ready is OrderedConcurrent. This transition, therefore, cannot
+ // block and using enter_stage_sync is legal and more efficient than
+ // enter_stage.
+ ihref.enter_stage_sync(client_pp(pg).wait_pg_ready, *this);
+
if (!m->get_hobj().get_key().empty()) {
// There are no users of locator. It was used to ensure that multipart-upload
// parts would end up in the same PG so that they could be clone_range'd into
// the same object via librados, but that's not how multipart upload works
// anymore and we no longer support clone_range via librados.
- get_handle().exit();
co_await reply_op_error(pgref, -ENOTSUP);
co_return;
}
@@ -152,32 +163,24 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
std::ref(get_foreign_connection()), m->get_map_epoch()
));
DEBUGDPP("{}: discarding {}", *pgref, *this, this_instance_id);
- pgref->client_request_orderer.remove_request(*this);
- complete_request();
co_return;
}
- DEBUGDPP("{}.{}: entering await_map stage",
- *pgref, *this, this_instance_id);
- co_await ihref.enter_stage<interruptor>(client_pp(pg).await_map, *this);
- DEBUGDPP("{}.{}: entered await_map stage, waiting for map",
- pg, *this, this_instance_id);
+
auto map_epoch = co_await interruptor::make_interruptible(
ihref.enter_blocker(
*this, pg.osdmap_gate, &decltype(pg.osdmap_gate)::wait_for_map,
m->get_min_epoch(), nullptr));
- DEBUGDPP("{}.{}: map epoch got {}, entering wait_for_active",
+ DEBUGDPP("{}.{}: waited for epoch {}, waiting for active",
pg, *this, this_instance_id, map_epoch);
- co_await ihref.enter_stage<interruptor>(client_pp(pg).wait_for_active, *this);
-
- DEBUGDPP("{}.{}: entered wait_for_active stage, waiting for active",
- pg, *this, this_instance_id);
co_await interruptor::make_interruptible(
ihref.enter_blocker(
*this,
pg.wait_for_active_blocker,
&decltype(pg.wait_for_active_blocker)::wait));
+ co_await ihref.enter_stage<interruptor>(client_pp(pg).get_obc, *this);
+
if (int res = op_info.set_from_op(&*m, *pg.get_osdmap());
res != 0) {
co_await reply_op_error(pgref, res);
@@ -190,15 +193,25 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
DEBUGDPP("{}.{}: dropping misdirected op",
pg, *this, this_instance_id);
co_return;
- } else if (const hobject_t& hoid = m->get_hobj();
- !pg.get_peering_state().can_serve_replica_read(hoid)) {
+ }
+
+ pg.get_perf_logger().inc(l_osd_replica_read);
+ if (pg.is_unreadable_object(m->get_hobj())) {
+ DEBUGDPP("{}.{}: {} missing on replica, bouncing to primary",
+ pg, *this, this_instance_id, m->get_hobj());
+ pg.get_perf_logger().inc(l_osd_replica_read_redirect_missing);
+ co_await reply_op_error(pgref, -EAGAIN);
+ co_return;
+ } else if (!pg.get_peering_state().can_serve_replica_read(m->get_hobj())) {
DEBUGDPP("{}.{}: unstable write on replica, bouncing to primary",
pg, *this, this_instance_id);
+ pg.get_perf_logger().inc(l_osd_replica_read_redirect_conflict);
co_await reply_op_error(pgref, -EAGAIN);
co_return;
} else {
DEBUGDPP("{}.{}: serving replica read on oid {}",
pg, *this, this_instance_id, m->get_hobj());
+ pg.get_perf_logger().inc(l_osd_replica_read_served);
}
}
@@ -228,12 +241,6 @@ ClientRequest::interruptible_future<> ClientRequest::with_pg_process_interruptib
DEBUGDPP("{}.{}: process[_pg]_op complete, completing handle",
*pgref, *this, this_instance_id);
co_await interruptor::make_interruptible(ihref.handle.complete());
-
- DEBUGDPP("{}.{}: process[_pg]_op complete,"
- "removing request from orderer",
- *pgref, *this, this_instance_id);
- pgref->client_request_orderer.remove_request(*this);
- complete_request();
}
seastar::future<> ClientRequest::with_pg_process(
@@ -249,16 +256,24 @@ seastar::future<> ClientRequest::with_pg_process(
auto instance_handle = get_instance_handle();
auto &ihref = *instance_handle;
return interruptor::with_interruption(
- [this, pgref, this_instance_id, &ihref]() mutable {
- return with_pg_process_interruptible(pgref, this_instance_id, ihref);
+ [FNAME, this, pgref, this_instance_id, &ihref]() mutable {
+ return with_pg_process_interruptible(
+ pgref, this_instance_id, ihref
+ ).then_interruptible([FNAME, this, this_instance_id, pgref] {
+ DEBUGDPP("{}.{}: with_pg_process_interruptible complete,"
+ " completing request",
+ *pgref, *this, this_instance_id);
+ complete_request(*pgref);
+ });
}, [FNAME, this, this_instance_id, pgref](std::exception_ptr eptr) {
DEBUGDPP("{}.{}: interrupted due to {}",
*pgref, *this, this_instance_id, eptr);
}, pgref, pgref->get_osdmap_epoch()).finally(
[this, FNAME, opref=std::move(opref), pgref,
- this_instance_id, instance_handle=std::move(instance_handle), &ihref] {
+ this_instance_id, instance_handle=std::move(instance_handle), &ihref]() mutable {
DEBUGDPP("{}.{}: exit", *pgref, *this, this_instance_id);
- ihref.handle.exit();
+ return ihref.handle.complete(
+ ).finally([instance_handle=std::move(instance_handle)] {});
});
}
@@ -290,29 +305,41 @@ ClientRequest::process_pg_op(
ClientRequest::interruptible_future<>
ClientRequest::recover_missing_snaps(
Ref<PG> pg,
- instance_handle_t &ihref,
- ObjectContextRef head,
std::set<snapid_t> &snaps)
{
LOG_PREFIX(ClientRequest::recover_missing_snaps);
- for (auto &snap : snaps) {
- auto coid = head->obs.oi.soid;
- coid.snap = snap;
- auto oid = resolve_oid(head->get_head_ss(), coid);
- /* Rollback targets may legitimately not exist if, for instance,
- * the object is an rbd block which happened to be sparse and
- * therefore non-existent at the time of the specified snapshot.
- * In such a case, rollback will simply delete the object. Here,
- * we skip the oid as there is no corresponding clone to recover.
- * See https://tracker.ceph.com/issues/63821 */
- if (oid) {
- auto unfound = co_await do_recover_missing(pg, *oid, m->get_reqid());
- if (unfound) {
- DEBUGDPP("{} unfound, hang it for now", *pg, *oid);
- co_await interruptor::make_interruptible(
- pg->get_recovery_backend()->add_unfound(*oid));
+
+ std::vector<hobject_t> ret;
+ auto resolve_oids = pg->obc_loader.with_obc<RWState::RWREAD>(
+ m->get_hobj().get_head(),
+ [&snaps, &ret](auto head, auto) {
+ for (auto &snap : snaps) {
+ auto coid = head->obs.oi.soid;
+ coid.snap = snap;
+ auto oid = resolve_oid(head->get_head_ss(), coid);
+ /* Rollback targets may legitimately not exist if, for instance,
+ * the object is an rbd block which happened to be sparse and
+ * therefore non-existent at the time of the specified snapshot.
+ * In such a case, rollback will simply delete the object. Here,
+ * we skip the oid as there is no corresponding clone to recover.
+ * See https://tracker.ceph.com/issues/63821 */
+ if (oid) {
+ ret.emplace_back(std::move(*oid));
}
}
+ return seastar::now();
+ }).handle_error_interruptible(
+ crimson::ct_error::assert_all("unexpected error")
+ );
+ co_await std::move(resolve_oids);
+
+ for (auto &oid : ret) {
+ auto unfound = co_await do_recover_missing(pg, oid, m->get_reqid());
+ if (unfound) {
+ DEBUGDPP("{} unfound, hang it for now", *pg, oid);
+ co_await interruptor::make_interruptible(
+ pg->get_recovery_backend()->add_unfound(oid));
+ }
}
}
@@ -321,7 +348,13 @@ ClientRequest::process_op(
instance_handle_t &ihref, Ref<PG> pg, unsigned this_instance_id)
{
LOG_PREFIX(ClientRequest::process_op);
- ihref.enter_stage_sync(client_pp(*pg).recover_missing, *this);
+ ihref.obc_orderer = pg->obc_loader.get_obc_orderer(m->get_hobj());
+ auto obc_manager = pg->obc_loader.get_obc_manager(
+ *(ihref.obc_orderer),
+ m->get_hobj());
+ co_await ihref.enter_stage<interruptor>(
+ ihref.obc_orderer->obc_pp().process, *this);
+
if (!pg->is_primary()) {
DEBUGDPP(
"Skipping recover_missings on non primary pg for soid {}",
@@ -337,28 +370,10 @@ ClientRequest::process_op(
std::set<snapid_t> snaps = snaps_need_to_recover();
if (!snaps.empty()) {
- auto with_obc = pg->obc_loader.with_obc<RWState::RWREAD>(
- m->get_hobj().get_head(),
- [&snaps, &ihref, pg, this](auto head, auto) {
- return recover_missing_snaps(pg, ihref, head, snaps);
- }).handle_error_interruptible(
- crimson::ct_error::assert_all("unexpected error")
- );
- // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98401
- co_await std::move(with_obc);
+ co_await recover_missing_snaps(pg, snaps);
}
}
- /**
- * The previous stage of recover_missing is a concurrent phase.
- * Checking for already_complete requests must done exclusively.
- * Since get_obc is also an exclusive stage, we can merge both stages into
- * a single stage and avoid stage switching overhead.
- */
- DEBUGDPP("{}.{}: entering check_already_complete_get_obc",
- *pg, *this, this_instance_id);
- co_await ihref.enter_stage<interruptor>(
- client_pp(*pg).check_already_complete_get_obc, *this);
DEBUGDPP("{}.{}: checking already_complete",
*pg, *this, this_instance_id);
auto completed = co_await pg->already_complete(m->get_reqid());
@@ -385,51 +400,29 @@ ClientRequest::process_op(
DEBUGDPP("{}.{}: past scrub blocker, getting obc",
*pg, *this, this_instance_id);
- // call with_locked_obc() in order, but wait concurrently for loading.
- ihref.enter_stage_sync(
- client_pp(*pg).lock_obc, *this);
- auto process = pg->with_locked_obc(
- m->get_hobj(), op_info,
- [FNAME, this, pg, this_instance_id, &ihref] (
- auto head, auto obc
- ) -> interruptible_future<> {
- DEBUGDPP("{}.{}: got obc {}, entering process stage",
- *pg, *this, this_instance_id, obc->obs);
- return ihref.enter_stage<interruptor>(
- client_pp(*pg).process, *this
- ).then_interruptible(
- [FNAME, this, pg, this_instance_id, obc, &ihref]() mutable {
- DEBUGDPP("{}.{}: in process stage, calling do_process",
- *pg, *this, this_instance_id);
- return do_process(
- ihref, pg, obc, this_instance_id
- );
- }
- );
- }).handle_error_interruptible(
- PG::load_obc_ertr::all_same_way(
- [FNAME, this, pg=std::move(pg), this_instance_id](
- const auto &code
- ) -> interruptible_future<> {
- DEBUGDPP("{}.{}: saw error code {}",
- *pg, *this, this_instance_id, code);
- assert(code.value() > 0);
- return reply_op_error(pg, -code.value());
- })
- );
- /* The following works around gcc bug
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98401.
- * The specific symptom I observed is the pg param being
- * destructed multiple times resulting in the refcount going
- * rapidly to 0 destoying the PG prematurely.
- *
- * This bug seems to be resolved in gcc 13.2.1.
- *
- * Assigning the intermediate result and moving it into the co_await
- * expression bypasses both bugs.
- */
- co_await std::move(process);
+ int load_err = co_await pg->obc_loader.load_and_lock(
+ obc_manager, pg->get_lock_type(op_info)
+ ).si_then([]() -> int {
+ return 0;
+ }).handle_error_interruptible(
+ PG::load_obc_ertr::all_same_way(
+ [](const auto &code) -> int {
+ return -code.value();
+ })
+ );
+ if (load_err) {
+ DEBUGDPP("{}.{}: saw error code loading obc {}",
+ *pg, *this, this_instance_id, load_err);
+ co_await reply_op_error(pg, load_err);
+ co_return;
+ }
+
+ DEBUGDPP("{}.{}: obc {} loaded and locked, calling do_process",
+ *pg, *this, this_instance_id, obc_manager.get_obc()->obs);
+ co_await do_process(
+ ihref, pg, obc_manager.get_obc(), this_instance_id
+ );
}
ClientRequest::interruptible_future<>
@@ -548,12 +541,14 @@ ClientRequest::do_process(
std::move(ox), m->ops);
co_await std::move(submitted);
}
- co_await ihref.enter_stage<interruptor>(client_pp(*pg).wait_repop, *this);
+ co_await ihref.enter_stage<interruptor>(
+ ihref.obc_orderer->obc_pp().wait_repop, *this);
co_await std::move(all_completed);
}
- co_await ihref.enter_stage<interruptor>(client_pp(*pg).send_reply, *this);
+ co_await ihref.enter_stage<interruptor>(
+ ihref.obc_orderer->obc_pp().send_reply, *this);
if (ret) {
int err = -ret->value();
diff --git a/src/crimson/osd/osd_operations/client_request.h b/src/crimson/osd/osd_operations/client_request.h
index 6ee57e9874c..91a6728fd4b 100644
--- a/src/crimson/osd/osd_operations/client_request.h
+++ b/src/crimson/osd/osd_operations/client_request.h
@@ -11,6 +11,7 @@
#include "osd/osd_op_util.h"
#include "crimson/net/Connection.h"
#include "crimson/osd/object_context.h"
+#include "crimson/osd/object_context_loader.h"
#include "crimson/osd/osdmap_gate.h"
#include "crimson/osd/osd_operation.h"
#include "crimson/osd/osd_operations/client_request_common.h"
@@ -41,21 +42,9 @@ class ClientRequest final : public PhasedOperationT<ClientRequest>,
unsigned instance_id = 0;
public:
- class PGPipeline : public CommonPGPipeline {
- public:
- struct AwaitMap : OrderedExclusivePhaseT<AwaitMap> {
- static constexpr auto type_name = "ClientRequest::PGPipeline::await_map";
- } await_map;
- struct SendReply : OrderedExclusivePhaseT<SendReply> {
- static constexpr auto type_name = "ClientRequest::PGPipeline::send_reply";
- } send_reply;
- friend class ClientRequest;
- friend class LttngBackend;
- friend class HistoricBackend;
- friend class ReqRequest;
- friend class LogMissingRequest;
- friend class LogMissingRequestReply;
- };
+ epoch_t get_epoch_sent_at() const {
+ return m->get_map_epoch();
+ }
/**
* instance_handle_t
@@ -93,20 +82,18 @@ public:
// don't leave any references on the source core, so we just bypass it by using
// intrusive_ptr instead.
using ref_t = boost::intrusive_ptr<instance_handle_t>;
+ std::optional<ObjectContextLoader::Orderer> obc_orderer;
PipelineHandle handle;
std::tuple<
- PGPipeline::AwaitMap::BlockingEvent,
+ CommonPGPipeline::WaitPGReady::BlockingEvent,
PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
- PGPipeline::WaitForActive::BlockingEvent,
PGActivationBlocker::BlockingEvent,
- PGPipeline::RecoverMissing::BlockingEvent,
+ CommonPGPipeline::GetOBC::BlockingEvent,
+ CommonOBCPipeline::Process::BlockingEvent,
scrub::PGScrubber::BlockingEvent,
- PGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent,
- PGPipeline::LockOBC::BlockingEvent,
- PGPipeline::Process::BlockingEvent,
- PGPipeline::WaitRepop::BlockingEvent,
- PGPipeline::SendReply::BlockingEvent,
+ CommonOBCPipeline::WaitRepop::BlockingEvent,
+ CommonOBCPipeline::SendReply::BlockingEvent,
CompletionEvent
> pg_tracking_events;
@@ -210,7 +197,7 @@ public:
void requeue(Ref<PG> pg);
void clear_and_cancel(PG &pg);
};
- void complete_request();
+ void complete_request(PG &pg);
static constexpr OperationTypeCode type = OperationTypeCode::client_request;
@@ -285,8 +272,6 @@ private:
interruptible_future<>
recover_missing_snaps(
Ref<PG> pg,
- instance_handle_t &ihref,
- ObjectContextRef head,
std::set<snapid_t> &snaps);
::crimson::interruptible::interruptible_future<
::crimson::osd::IOInterruptCondition> process_op(
@@ -295,7 +280,7 @@ private:
unsigned this_instance_id);
bool is_pg_op() const;
- PGPipeline &client_pp(PG &pg);
+ CommonPGPipeline &client_pp(PG &pg);
template <typename Errorator>
using interruptible_errorator =
diff --git a/src/crimson/osd/osd_operations/internal_client_request.cc b/src/crimson/osd/osd_operations/internal_client_request.cc
index 9e5867caf80..b8f7646bc74 100644
--- a/src/crimson/osd/osd_operations/internal_client_request.cc
+++ b/src/crimson/osd/osd_operations/internal_client_request.cc
@@ -4,6 +4,7 @@
#include <seastar/core/future.hh>
#include "crimson/osd/osd_operations/internal_client_request.h"
+#include "osd/object_state_fmt.h"
namespace {
seastar::logger& logger() {
@@ -51,46 +52,17 @@ CommonPGPipeline& InternalClientRequest::client_pp()
}
InternalClientRequest::interruptible_future<>
-InternalClientRequest::do_process(
- crimson::osd::ObjectContextRef obc,
- std::vector<OSDOp> &osd_ops)
-{
- LOG_PREFIX(InternalClientRequest::do_process);
- auto params = get_do_osd_ops_params();
- OpsExecuter ox(
- pg, obc, op_info, params, params.get_connection(), SnapContext{});
- co_await pg->run_executer(
- ox, obc, op_info, osd_ops
- ).handle_error_interruptible(
- crimson::ct_error::all_same_way(
- [this, FNAME](auto e) {
- ERRORDPPI("{}: got unexpected error {}", *pg, *this, e);
- ceph_assert(0 == "should not return an error");
- return interruptor::now();
- })
- );
-
- auto [submitted, completed] = co_await pg->submit_executer(
- std::move(ox), osd_ops);
-
- co_await std::move(submitted);
- co_await std::move(completed);
-}
-
-InternalClientRequest::interruptible_future<>
InternalClientRequest::with_interruption()
{
LOG_PREFIX(InternalClientRequest::with_interruption);
- co_await enter_stage<interruptor>(
- client_pp().wait_for_active
- );
+ assert(pg->is_active());
- co_await with_blocking_event<PGActivationBlocker::BlockingEvent,
- interruptor>([this] (auto&& trigger) {
- return pg->wait_for_active_blocker.wait(std::move(trigger));
- });
+ obc_orderer = pg->obc_loader.get_obc_orderer(get_target_oid());
+ auto obc_manager = pg->obc_loader.get_obc_manager(
+ *obc_orderer,
+ get_target_oid());
- co_await enter_stage<interruptor>(client_pp().recover_missing);
+ co_await enter_stage<interruptor>(obc_orderer->obc_pp().process);
bool unfound = co_await do_recover_missing(
pg, get_target_oid(), osd_reqid_t());
@@ -100,10 +72,8 @@ InternalClientRequest::with_interruption()
std::make_error_code(std::errc::operation_canceled),
fmt::format("{} is unfound, drop it!", get_target_oid()));
}
- co_await enter_stage<interruptor>(
- client_pp().check_already_complete_get_obc);
- DEBUGI("{}: getting obc lock", *this);
+ DEBUGI("{}: generating ops", *this);
auto osd_ops = create_osd_ops();
@@ -112,23 +82,38 @@ InternalClientRequest::with_interruption()
[[maybe_unused]] const int ret = op_info.set_from_op(
std::as_const(osd_ops), pg->get_pgid().pgid, *pg->get_osdmap());
assert(ret == 0);
- // call with_locked_obc() in order, but wait concurrently for loading.
- enter_stage_sync(client_pp().lock_obc);
-
- auto fut = pg->with_locked_obc(
- get_target_oid(), op_info,
- [&osd_ops, this](auto, auto obc) {
- return enter_stage<interruptor>(client_pp().process
- ).then_interruptible(
- [obc=std::move(obc), &osd_ops, this]() mutable {
- return do_process(std::move(obc), osd_ops);
- });
- }).handle_error_interruptible(
- crimson::ct_error::assert_all("unexpected error")
- );
- co_await std::move(fut);
-
- logger().debug("{}: complete", *this);
+
+ co_await pg->obc_loader.load_and_lock(
+ obc_manager, pg->get_lock_type(op_info)
+ ).handle_error_interruptible(
+ crimson::ct_error::assert_all("unexpected error")
+ );
+
+ auto params = get_do_osd_ops_params();
+ OpsExecuter ox(
+ pg, obc_manager.get_obc(), op_info, params, params.get_connection(),
+ SnapContext{});
+ co_await pg->run_executer(
+ ox, obc_manager.get_obc(), op_info, osd_ops
+ ).handle_error_interruptible(
+ crimson::ct_error::all_same_way(
+ [this, FNAME](auto e) {
+ ERRORDPPI("{}: got unexpected error {}", *pg, *this, e);
+ ceph_assert(0 == "should not return an error");
+ return interruptor::now();
+ })
+ );
+
+ auto [submitted, completed] = co_await pg->submit_executer(
+ std::move(ox), osd_ops);
+
+ co_await std::move(submitted);
+
+ co_await enter_stage<interruptor>(obc_orderer->obc_pp().wait_repop);
+
+ co_await std::move(completed);
+
+ DEBUGDPP("{}: complete", *pg, *this);
co_await interruptor::make_interruptible(handle.complete());
co_return;
}
@@ -150,7 +135,7 @@ seastar::future<> InternalClientRequest::start()
return seastar::now();
}).finally([this] {
logger().debug("{}: exit", *this);
- handle.exit();
+ return handle.complete();
});
}
diff --git a/src/crimson/osd/osd_operations/internal_client_request.h b/src/crimson/osd/osd_operations/internal_client_request.h
index 6023db0a8db..1cfde4ab080 100644
--- a/src/crimson/osd/osd_operations/internal_client_request.h
+++ b/src/crimson/osd/osd_operations/internal_client_request.h
@@ -4,6 +4,7 @@
#pragma once
#include "crimson/common/type_helpers.h"
+#include "crimson/osd/object_context_loader.h"
#include "crimson/osd/osd_operation.h"
#include "crimson/osd/osd_operations/client_request_common.h"
#include "crimson/osd/pg.h"
@@ -45,11 +46,10 @@ private:
crimson::osd::ObjectContextRef obc,
std::vector<OSDOp> &osd_ops);
- seastar::future<> do_process();
-
Ref<PG> pg;
epoch_t start_epoch;
OpInfo op_info;
+ std::optional<ObjectContextLoader::Orderer> obc_orderer;
PipelineHandle handle;
public:
@@ -57,12 +57,8 @@ public:
std::tuple<
StartEvent,
- CommonPGPipeline::WaitForActive::BlockingEvent,
- PGActivationBlocker::BlockingEvent,
- CommonPGPipeline::RecoverMissing::BlockingEvent,
- CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent,
- CommonPGPipeline::LockOBC::BlockingEvent,
- CommonPGPipeline::Process::BlockingEvent,
+ CommonOBCPipeline::Process::BlockingEvent,
+ CommonOBCPipeline::WaitRepop::BlockingEvent,
CompletionEvent
> tracking_events;
};
diff --git a/src/crimson/osd/osd_operations/logmissing_request.cc b/src/crimson/osd/osd_operations/logmissing_request.cc
index 8147c969260..274744cdd92 100644
--- a/src/crimson/osd/osd_operations/logmissing_request.cc
+++ b/src/crimson/osd/osd_operations/logmissing_request.cc
@@ -58,9 +58,9 @@ PerShardPipeline &LogMissingRequest::get_pershard_pipeline(
return shard_services.get_replicated_request_pipeline();
}
-ClientRequest::PGPipeline &LogMissingRequest::client_pp(PG &pg)
+PGRepopPipeline &LogMissingRequest::repop_pipeline(PG &pg)
{
- return pg.request_pg_pipeline;
+ return pg.repop_pipeline;
}
seastar::future<> LogMissingRequest::with_pg(
@@ -73,7 +73,7 @@ seastar::future<> LogMissingRequest::with_pg(
return interruptor::with_interruption([this, pg] {
LOG_PREFIX(LogMissingRequest::with_pg);
DEBUGI("{}: pg present", *this);
- return this->template enter_stage<interruptor>(client_pp(*pg).await_map
+ return this->template enter_stage<interruptor>(repop_pipeline(*pg).process
).then_interruptible([this, pg] {
return this->template with_blocking_event<
PG_OSDMapGate::OSDMapBlocker::BlockingEvent
diff --git a/src/crimson/osd/osd_operations/logmissing_request.h b/src/crimson/osd/osd_operations/logmissing_request.h
index 51c9d540cb5..fe4761c4ab4 100644
--- a/src/crimson/osd/osd_operations/logmissing_request.h
+++ b/src/crimson/osd/osd_operations/logmissing_request.h
@@ -36,6 +36,9 @@ public:
}
PipelineHandle &get_handle() { return handle; }
epoch_t get_epoch() const { return req->get_min_epoch(); }
+ epoch_t get_epoch_sent_at() const {
+ return req->get_map_epoch();
+ }
ConnectionPipeline &get_connection_pipeline();
@@ -77,14 +80,14 @@ public:
ConnectionPipeline::AwaitMap::BlockingEvent,
ConnectionPipeline::GetPGMapping::BlockingEvent,
PerShardPipeline::CreateOrWaitPG::BlockingEvent,
- ClientRequest::PGPipeline::AwaitMap::BlockingEvent,
+ PGRepopPipeline::Process::BlockingEvent,
PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
PGMap::PGCreationBlockingEvent,
OSD_OSDMapGate::OSDMapBlocker::BlockingEvent
> tracking_events;
private:
- ClientRequest::PGPipeline &client_pp(PG &pg);
+ PGRepopPipeline &repop_pipeline(PG &pg);
crimson::net::ConnectionRef l_conn;
crimson::net::ConnectionXcoreRef r_conn;
diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.cc b/src/crimson/osd/osd_operations/logmissing_request_reply.cc
index fb122a95cd1..5640610bd01 100644
--- a/src/crimson/osd/osd_operations/logmissing_request_reply.cc
+++ b/src/crimson/osd/osd_operations/logmissing_request_reply.cc
@@ -56,11 +56,6 @@ PerShardPipeline &LogMissingRequestReply::get_pershard_pipeline(
return shard_services.get_replicated_request_pipeline();
}
-ClientRequest::PGPipeline &LogMissingRequestReply::client_pp(PG &pg)
-{
- return pg.request_pg_pipeline;
-}
-
seastar::future<> LogMissingRequestReply::with_pg(
ShardServices &shard_services, Ref<PG> pg)
{
diff --git a/src/crimson/osd/osd_operations/logmissing_request_reply.h b/src/crimson/osd/osd_operations/logmissing_request_reply.h
index c741b41bd0f..bdb6c2ac6ac 100644
--- a/src/crimson/osd/osd_operations/logmissing_request_reply.h
+++ b/src/crimson/osd/osd_operations/logmissing_request_reply.h
@@ -36,6 +36,9 @@ public:
}
PipelineHandle &get_handle() { return handle; }
epoch_t get_epoch() const { return req->get_min_epoch(); }
+ epoch_t get_epoch_sent_at() const {
+ return req->get_map_epoch();
+ }
ConnectionPipeline &get_connection_pipeline();
@@ -82,8 +85,6 @@ public:
> tracking_events;
private:
- ClientRequest::PGPipeline &client_pp(PG &pg);
-
crimson::net::ConnectionRef l_conn;
crimson::net::ConnectionXcoreRef r_conn;
diff --git a/src/crimson/osd/osd_operations/osdop_params.h b/src/crimson/osd/osd_operations/osdop_params.h
index 102cb7fff6b..14202582100 100644
--- a/src/crimson/osd/osd_operations/osdop_params.h
+++ b/src/crimson/osd/osd_operations/osdop_params.h
@@ -12,7 +12,7 @@ struct osd_op_params_t {
utime_t mtime;
eversion_t at_version;
eversion_t pg_trim_to;
- eversion_t min_last_complete_ondisk;
+ eversion_t pg_committed_to;
eversion_t last_complete;
bool user_modify = false;
ObjectCleanRegions clean_regions;
diff --git a/src/crimson/osd/osd_operations/peering_event.cc b/src/crimson/osd/osd_operations/peering_event.cc
index a8d9fce69b6..fb5696b0a9e 100644
--- a/src/crimson/osd/osd_operations/peering_event.cc
+++ b/src/crimson/osd/osd_operations/peering_event.cc
@@ -166,7 +166,8 @@ void RemotePeeringEvent::on_pg_absent(ShardServices &shard_services)
ctx.send_notify(q.from.osd, {q.query.from, q.query.to,
q.query.epoch_sent,
map_epoch, empty,
- PastIntervals{}});
+ PastIntervals{},
+ PG_FEATURE_CRIMSON_ALL});
}
}
}
diff --git a/src/crimson/osd/osd_operations/peering_event.h b/src/crimson/osd/osd_operations/peering_event.h
index 85de5c711d6..aa6b8a95a94 100644
--- a/src/crimson/osd/osd_operations/peering_event.h
+++ b/src/crimson/osd/osd_operations/peering_event.h
@@ -44,6 +44,10 @@ protected:
float delay = 0;
PGPeeringEvent evt;
+ epoch_t get_epoch_sent_at() const {
+ return evt.get_epoch_sent();
+ }
+
const pg_shard_t get_from() const {
return from;
}
@@ -84,6 +88,10 @@ public:
evt(std::forward<Args>(args)...)
{}
+ bool requires_pg() const final {
+ return evt.requires_pg;
+ }
+
void print(std::ostream &) const final;
void dump_detail(ceph::Formatter* f) const final;
seastar::future<> with_pg(
diff --git a/src/crimson/osd/osd_operations/pg_advance_map.h b/src/crimson/osd/osd_operations/pg_advance_map.h
index 43be7319545..21702f6ff4f 100644
--- a/src/crimson/osd/osd_operations/pg_advance_map.h
+++ b/src/crimson/osd/osd_operations/pg_advance_map.h
@@ -50,6 +50,10 @@ public:
PGPeeringPipeline::Process::BlockingEvent
> tracking_events;
+ epoch_t get_epoch_sent_at() const {
+ return to;
+ }
+
private:
PGPeeringPipeline &peering_pp(PG &pg);
};
diff --git a/src/crimson/osd/osd_operations/recovery_subrequest.h b/src/crimson/osd/osd_operations/recovery_subrequest.h
index 17c2faf97ea..2fe8ff372b3 100644
--- a/src/crimson/osd/osd_operations/recovery_subrequest.h
+++ b/src/crimson/osd/osd_operations/recovery_subrequest.h
@@ -39,6 +39,9 @@ public:
}
PipelineHandle &get_handle() { return handle; }
epoch_t get_epoch() const { return m->get_min_epoch(); }
+ epoch_t get_epoch_sent_at() const {
+ return m->get_map_epoch();
+ }
ConnectionPipeline &get_connection_pipeline();
diff --git a/src/crimson/osd/osd_operations/replicated_request.cc b/src/crimson/osd/osd_operations/replicated_request.cc
index 5ca11e5dd15..ec607758c55 100644
--- a/src/crimson/osd/osd_operations/replicated_request.cc
+++ b/src/crimson/osd/osd_operations/replicated_request.cc
@@ -5,6 +5,7 @@
#include "common/Formatter.h"
+#include "crimson/common/coroutine.h"
#include "crimson/osd/osd.h"
#include "crimson/osd/osd_connection_priv.h"
#include "crimson/osd/osd_operation_external_tracking.h"
@@ -58,39 +59,57 @@ PerShardPipeline &RepRequest::get_pershard_pipeline(
return shard_services.get_replicated_request_pipeline();
}
-ClientRequest::PGPipeline &RepRequest::client_pp(PG &pg)
+PGRepopPipeline &RepRequest::repop_pipeline(PG &pg)
{
- return pg.request_pg_pipeline;
+ return pg.repop_pipeline;
+}
+
+RepRequest::interruptible_future<> RepRequest::with_pg_interruptible(
+ Ref<PG> pg)
+{
+ LOG_PREFIX(RepRequest::with_pg_interruptible);
+ DEBUGI("{}", *this);
+ co_await this->template enter_stage<interruptor>(repop_pipeline(*pg).process);
+ co_await interruptor::make_interruptible(this->template with_blocking_event<
+ PG_OSDMapGate::OSDMapBlocker::BlockingEvent
+ >([this, pg](auto &&trigger) {
+ return pg->osdmap_gate.wait_for_map(
+ std::move(trigger), req->min_epoch);
+ }));
+
+ if (pg->can_discard_replica_op(*req)) {
+ co_return;
+ }
+
+ auto [commit_fut, reply] = co_await pg->handle_rep_op(req);
+
+ // Transitions from OrderedExclusive->OrderedConcurrent cannot block
+ this->template enter_stage_sync(repop_pipeline(*pg).wait_commit);
+
+ co_await std::move(commit_fut);
+
+ co_await this->template enter_stage<interruptor>(
+ repop_pipeline(*pg).send_reply);
+
+ co_await interruptor::make_interruptible(
+ pg->shard_services.send_to_osd(
+ req->from.osd, std::move(reply), pg->get_osdmap_epoch())
+ );
}
seastar::future<> RepRequest::with_pg(
ShardServices &shard_services, Ref<PG> pg)
{
LOG_PREFIX(RepRequest::with_pg);
- DEBUGI("{}: RepRequest::with_pg", *this);
+ DEBUGI("{}", *this);
IRef ref = this;
return interruptor::with_interruption([this, pg] {
- LOG_PREFIX(RepRequest::with_pg);
- DEBUGI("{}: pg present", *this);
- return this->template enter_stage<interruptor>(client_pp(*pg).await_map
- ).then_interruptible([this, pg] {
- return this->template with_blocking_event<
- PG_OSDMapGate::OSDMapBlocker::BlockingEvent
- >([this, pg](auto &&trigger) {
- return pg->osdmap_gate.wait_for_map(
- std::move(trigger), req->min_epoch);
- });
- }).then_interruptible([this, pg] (auto) {
- return pg->handle_rep_op(req);
- }).then_interruptible([this] {
- logger().debug("{}: complete", *this);
- return handle.complete();
- });
+ return with_pg_interruptible(pg);
}, [](std::exception_ptr) {
return seastar::now();
}, pg, pg->get_osdmap_epoch()).finally([this, ref=std::move(ref)] {
logger().debug("{}: exit", *this);
- handle.exit();
+ return handle.complete();
});
}
diff --git a/src/crimson/osd/osd_operations/replicated_request.h b/src/crimson/osd/osd_operations/replicated_request.h
index ff5dea6d6db..c2494b3715f 100644
--- a/src/crimson/osd/osd_operations/replicated_request.h
+++ b/src/crimson/osd/osd_operations/replicated_request.h
@@ -36,6 +36,9 @@ public:
}
PipelineHandle &get_handle() { return handle; }
epoch_t get_epoch() const { return req->get_min_epoch(); }
+ epoch_t get_epoch_sent_at() const {
+ return req->get_map_epoch();
+ }
ConnectionPipeline &get_connection_pipeline();
@@ -68,6 +71,9 @@ public:
r_conn = make_local_shared_foreign(std::move(conn));
}
+ interruptible_future<> with_pg_interruptible(
+ Ref<PG> pg);
+
seastar::future<> with_pg(
ShardServices &shard_services, Ref<PG> pg);
@@ -77,14 +83,16 @@ public:
ConnectionPipeline::AwaitMap::BlockingEvent,
ConnectionPipeline::GetPGMapping::BlockingEvent,
PerShardPipeline::CreateOrWaitPG::BlockingEvent,
- ClientRequest::PGPipeline::AwaitMap::BlockingEvent,
+ PGRepopPipeline::Process::BlockingEvent,
+ PGRepopPipeline::WaitCommit::BlockingEvent,
+ PGRepopPipeline::SendReply::BlockingEvent,
PG_OSDMapGate::OSDMapBlocker::BlockingEvent,
PGMap::PGCreationBlockingEvent,
OSD_OSDMapGate::OSDMapBlocker::BlockingEvent
> tracking_events;
private:
- ClientRequest::PGPipeline &client_pp(PG &pg);
+ PGRepopPipeline &repop_pipeline(PG &pg);
crimson::net::ConnectionRef l_conn;
crimson::net::ConnectionXcoreRef r_conn;
diff --git a/src/crimson/osd/osd_operations/scrub_events.h b/src/crimson/osd/osd_operations/scrub_events.h
index 02a5d852bb7..8bed90e4c14 100644
--- a/src/crimson/osd/osd_operations/scrub_events.h
+++ b/src/crimson/osd/osd_operations/scrub_events.h
@@ -27,11 +27,11 @@ class RemoteScrubEventBaseT : public PhasedOperationT<T> {
crimson::net::ConnectionRef l_conn;
crimson::net::ConnectionXcoreRef r_conn;
- epoch_t epoch;
spg_t pgid;
protected:
using interruptor = InterruptibleOperation::interruptor;
+ epoch_t epoch;
template <typename U=void>
using ifut = InterruptibleOperation::interruptible_future<U>;
@@ -40,7 +40,7 @@ protected:
public:
RemoteScrubEventBaseT(
crimson::net::ConnectionRef conn, epoch_t epoch, spg_t pgid)
- : l_conn(std::move(conn)), epoch(epoch), pgid(pgid) {}
+ : l_conn(std::move(conn)), pgid(pgid), epoch(epoch) {}
PGPeeringPipeline &get_peering_pipeline(PG &pg);
@@ -117,6 +117,10 @@ public:
: RemoteScrubEventBaseT<ScrubRequested>(std::forward<Args>(base_args)...),
deep(deep) {}
+ epoch_t get_epoch_sent_at() const {
+ return epoch;
+ }
+
void print(std::ostream &out) const final {
out << "(deep=" << deep << ")";
}
@@ -141,6 +145,10 @@ public:
ceph_assert(scrub::PGScrubber::is_scrub_message(*m));
}
+ epoch_t get_epoch_sent_at() const {
+ return epoch;
+ }
+
void print(std::ostream &out) const final {
out << "(m=" << *m << ")";
}
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.cc b/src/crimson/osd/osd_operations/snaptrim_event.cc
index 9ed0b73cfb4..f8fb7aef6f2 100644
--- a/src/crimson/osd/osd_operations/snaptrim_event.cc
+++ b/src/crimson/osd/osd_operations/snaptrim_event.cc
@@ -388,58 +388,66 @@ SnapTrimObjSubEvent::remove_or_update(
SnapTrimObjSubEvent::snap_trim_obj_subevent_ret_t
SnapTrimObjSubEvent::start()
{
+ obc_orderer = pg->obc_loader.get_obc_orderer(
+ coid);
+
ceph_assert(pg->is_active_clean());
- auto exit_handle = seastar::defer([this] {
- logger().debug("{}: exit", *this);
- handle.exit();
+ auto exit_handle = seastar::defer([this, opref = IRef(this)] {
+ logger().debug("{}: exit", *opref);
+ std::ignore = handle.complete().then([opref = std::move(opref)] {});
});
co_await enter_stage<interruptor>(
- client_pp().check_already_complete_get_obc);
+ obc_orderer->obc_pp().process);
logger().debug("{}: getting obc for {}", *this, coid);
- // end of commonality
- // lock both clone's and head's obcs
- co_await pg->obc_loader.with_obc<RWState::RWWRITE>(
- coid,
- std::bind(&SnapTrimObjSubEvent::process_and_submit,
- this, std::placeholders::_1, std::placeholders::_2),
- false
+
+
+ auto obc_manager = pg->obc_loader.get_obc_manager(
+ *obc_orderer,
+ coid, false /* resolve_oid */);
+
+ co_await pg->obc_loader.load_and_lock(
+ obc_manager, RWState::RWWRITE
).handle_error_interruptible(
remove_or_update_iertr::pass_further{},
crimson::ct_error::assert_all{"unexpected error in SnapTrimObjSubEvent"}
);
- logger().debug("{}: completed", *this);
- co_await interruptor::make_interruptible(handle.complete());
-}
-
-ObjectContextLoader::load_obc_iertr::future<>
-SnapTrimObjSubEvent::process_and_submit(ObjectContextRef head_obc,
- ObjectContextRef clone_obc) {
- logger().debug("{}: got clone_obc={}", *this, clone_obc->get_oid());
-
- co_await enter_stage<interruptor>(client_pp().process);
+ logger().debug("{}: got obc={}", *this, obc_manager.get_obc()->get_oid());
- logger().debug("{}: processing clone_obc={}", *this, clone_obc->get_oid());
-
- auto txn = co_await remove_or_update(clone_obc, head_obc);
-
- auto [submitted, all_completed] = co_await pg->submit_transaction(
- std::move(clone_obc),
- std::move(txn),
- std::move(osd_op_p),
- std::move(log_entries)
- );
+ auto all_completed = interruptor::now();
+ {
+ // as with PG::submit_executer, we need to build the pg log entries
+ // and submit the transaction atomically
+ co_await interruptor::make_interruptible(pg->submit_lock.lock());
+ auto unlocker = seastar::defer([this] {
+ pg->submit_lock.unlock();
+ });
- co_await std::move(submitted);
+ logger().debug("{}: calling remove_or_update obc={}",
+ *this, obc_manager.get_obc()->get_oid());
+
+ auto txn = co_await remove_or_update(
+ obc_manager.get_obc(), obc_manager.get_head_obc());
+
+ auto submitted = interruptor::now();
+ std::tie(submitted, all_completed) = co_await pg->submit_transaction(
+ ObjectContextRef(obc_manager.get_obc()),
+ nullptr,
+ std::move(txn),
+ std::move(osd_op_p),
+ std::move(log_entries)
+ );
+ co_await std::move(submitted);
+ }
- co_await enter_stage<interruptor>(client_pp().wait_repop);
+ co_await enter_stage<interruptor>(obc_orderer->obc_pp().wait_repop);
co_await std::move(all_completed);
- co_return;
+ logger().debug("{}: completed", *this);
}
void SnapTrimObjSubEvent::print(std::ostream &lhs) const
diff --git a/src/crimson/osd/osd_operations/snaptrim_event.h b/src/crimson/osd/osd_operations/snaptrim_event.h
index 1164b3169d2..a2b4d357568 100644
--- a/src/crimson/osd/osd_operations/snaptrim_event.h
+++ b/src/crimson/osd/osd_operations/snaptrim_event.h
@@ -6,6 +6,7 @@
#include <iostream>
#include <seastar/core/future.hh>
+#include "crimson/osd/object_context_loader.h"
#include "crimson/osd/osdmap_gate.h"
#include "crimson/osd/osd_operation.h"
#include "crimson/common/subop_blocker.h"
@@ -112,10 +113,6 @@ public:
private:
object_stat_sum_t delta_stats;
- ObjectContextLoader::load_obc_iertr::future<> process_and_submit(
- ObjectContextRef head_obc,
- ObjectContextRef clone_obc);
-
snap_trim_obj_subevent_ret_t remove_clone(
ObjectContextRef obc,
ObjectContextRef head_obc,
@@ -158,6 +155,7 @@ private:
}
Ref<PG> pg;
+ std::optional<ObjectContextLoader::Orderer> obc_orderer;
PipelineHandle handle;
osd_op_params_t osd_op_p;
const hobject_t coid;
@@ -169,9 +167,8 @@ public:
std::tuple<
StartEvent,
- CommonPGPipeline::CheckAlreadyCompleteGetObc::BlockingEvent,
- CommonPGPipeline::Process::BlockingEvent,
- CommonPGPipeline::WaitRepop::BlockingEvent,
+ CommonOBCPipeline::Process::BlockingEvent,
+ CommonOBCPipeline::WaitRepop::BlockingEvent,
CompletionEvent
> tracking_events;
};
diff --git a/src/crimson/osd/pg.cc b/src/crimson/osd/pg.cc
index 744a1dbc02b..2746e730f2b 100644
--- a/src/crimson/osd/pg.cc
+++ b/src/crimson/osd/pg.cc
@@ -132,6 +132,7 @@ PG::PG(
pool,
name),
osdmap,
+ PG_FEATURE_CRIMSON_ALL,
this,
this),
scrubber(*this),
@@ -392,7 +393,13 @@ void PG::on_replica_activate()
void PG::on_activate_complete()
{
- wait_for_active_blocker.unblock();
+ /* Confusingly, on_activate_complete is invoked when the primary and replicas
+ * have recorded the current interval. At that point, the PG may either become
+ * ACTIVE or PEERED, depending on whether the acting set is eligible for client
+ * IO. Only unblock wait_for_active_blocker if we actually became ACTIVE */
+ if (peering_state.is_active()) {
+ wait_for_active_blocker.unblock();
+ }
if (peering_state.needs_recovery()) {
logger().info("{}: requesting recovery",
@@ -861,43 +868,26 @@ std::ostream& operator<<(std::ostream& os, const PG& pg)
return os;
}
-void PG::mutate_object(
- ObjectContextRef& obc,
- ceph::os::Transaction& txn,
- osd_op_params_t& osd_op_p)
+void PG::enqueue_push_for_backfill(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers)
{
- if (obc->obs.exists) {
- obc->obs.oi.prior_version = obc->obs.oi.version;
- obc->obs.oi.version = osd_op_p.at_version;
- if (osd_op_p.user_modify)
- obc->obs.oi.user_version = osd_op_p.at_version.version;
- obc->obs.oi.last_reqid = osd_op_p.req_id;
- obc->obs.oi.mtime = osd_op_p.mtime;
- obc->obs.oi.local_mtime = ceph_clock_now();
-
- // object_info_t
- {
- ceph::bufferlist osv;
- obc->obs.oi.encode_no_oid(osv, CEPH_FEATURES_ALL);
- // TODO: get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
- txn.setattr(coll_ref->get_cid(), ghobject_t{obc->obs.oi.soid}, OI_ATTR, osv);
- }
+ assert(recovery_handler);
+ assert(recovery_handler->backfill_state);
+ auto backfill_state = recovery_handler->backfill_state.get();
+ backfill_state->enqueue_standalone_push(obj, v, peers);
+}
- // snapset
- if (obc->obs.oi.soid.snap == CEPH_NOSNAP) {
- logger().debug("final snapset {} in {}",
- obc->ssc->snapset, obc->obs.oi.soid);
- ceph::bufferlist bss;
- encode(obc->ssc->snapset, bss);
- txn.setattr(coll_ref->get_cid(), ghobject_t{obc->obs.oi.soid}, SS_ATTR, bss);
- obc->ssc->exists = true;
- } else {
- logger().debug("no snapset (this is a clone)");
- }
- } else {
- // reset cached ObjectState without enforcing eviction
- obc->obs.oi = object_info_t(obc->obs.oi.soid);
- }
+void PG::enqueue_delete_for_backfill(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers)
+{
+ assert(recovery_handler);
+ assert(recovery_handler->backfill_state);
+ auto backfill_state = recovery_handler->backfill_state.get();
+ backfill_state->enqueue_standalone_delete(obj, v, peers);
}
PG::interruptible_future<
@@ -905,6 +895,7 @@ PG::interruptible_future<
PG::interruptible_future<>>>
PG::submit_transaction(
ObjectContextRef&& obc,
+ ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& osd_op_p,
std::vector<pg_log_entry_t>&& log_entries)
@@ -917,17 +908,23 @@ PG::submit_transaction(
}
epoch_t map_epoch = get_osdmap_epoch();
+ auto at_version = osd_op_p.at_version;
- peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, osd_op_p.at_version);
+ peering_state.pre_submit_op(obc->obs.oi.soid, log_entries, at_version);
peering_state.update_trim_to();
ceph_assert(!log_entries.empty());
ceph_assert(log_entries.rbegin()->version >= projected_last_update);
projected_last_update = log_entries.rbegin()->version;
+ for (const auto& entry: log_entries) {
+ projected_log.add(entry);
+ }
+
auto [submitted, all_completed] = co_await backend->submit_transaction(
peering_state.get_acting_recovery_backfill(),
obc->obs.oi.soid,
+ std::move(new_clone),
std::move(txn),
std::move(osd_op_p),
peering_state.get_last_peering_reset(),
@@ -936,8 +933,8 @@ PG::submit_transaction(
co_return std::make_tuple(
std::move(submitted),
all_completed.then_interruptible(
- [this, last_complete=peering_state.get_info().last_complete,
- at_version=osd_op_p.at_version](auto acked) {
+ [this, last_complete=peering_state.get_info().last_complete, at_version]
+ (auto acked) {
for (const auto& peer : acked) {
peering_state.update_peer_last_complete_ondisk(
peer.shard, peer.last_complete_ondisk);
@@ -1014,8 +1011,15 @@ PG::interruptible_future<eversion_t> PG::submit_error_log(
const std::error_code e,
ceph_tid_t rep_tid)
{
- logger().debug("{}: {} rep_tid: {} error: {}",
- __func__, *m, rep_tid, e);
+ // as with submit_executer, need to ensure that log numbering and submission
+ // are atomic
+ co_await interruptor::make_interruptible(submit_lock.lock());
+ auto unlocker = seastar::defer([this] {
+ submit_lock.unlock();
+ });
+ LOG_PREFIX(PG::submit_error_log);
+ DEBUGDPP("{} rep_tid: {} error: {}",
+ *this, *m, rep_tid, e);
const osd_reqid_t &reqid = m->get_reqid();
mempool::osd_pglog::list<pg_log_entry_t> log_entries;
log_entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR,
@@ -1034,49 +1038,47 @@ PG::interruptible_future<eversion_t> PG::submit_error_log(
ceph::os::Transaction t;
peering_state.merge_new_log_entries(
log_entries, t, peering_state.get_pg_trim_to(),
- peering_state.get_min_last_complete_ondisk());
-
- return seastar::do_with(log_entries, set<pg_shard_t>{},
- [this, t=std::move(t), rep_tid](auto& log_entries, auto& waiting_on) mutable {
- return interruptor::do_for_each(get_acting_recovery_backfill(),
- [this, log_entries, waiting_on, rep_tid]
- (auto& i) mutable {
- pg_shard_t peer(i);
- if (peer == pg_whoami) {
- return seastar::now();
- }
- ceph_assert(peering_state.get_peer_missing().count(peer));
- ceph_assert(peering_state.has_peer_info(peer));
- auto log_m = crimson::make_message<MOSDPGUpdateLogMissing>(
- log_entries,
- spg_t(peering_state.get_info().pgid.pgid, i.shard),
- pg_whoami.shard,
- get_osdmap_epoch(),
- get_last_peering_reset(),
- rep_tid,
- peering_state.get_pg_trim_to(),
- peering_state.get_min_last_complete_ondisk());
- waiting_on.insert(peer);
- logger().debug("submit_error_log: sending log"
- "missing_request (rep_tid: {} entries: {})"
- " to osd {}", rep_tid, log_entries, peer.osd);
- return shard_services.send_to_osd(peer.osd,
- std::move(log_m),
- get_osdmap_epoch());
- }).then_interruptible([this, waiting_on, t=std::move(t), rep_tid] () mutable {
- waiting_on.insert(pg_whoami);
- logger().debug("submit_error_log: inserting rep_tid {}", rep_tid);
- log_entry_update_waiting_on.insert(
- std::make_pair(rep_tid,
- log_update_t{std::move(waiting_on)}));
- return shard_services.get_store().do_transaction(
- get_collection_ref(), std::move(t)
- ).then([this] {
- peering_state.update_trim_to();
- return seastar::make_ready_future<eversion_t>(projected_last_update);
- });
- });
- });
+ peering_state.get_pg_committed_to());
+
+
+ set<pg_shard_t> waiting_on;
+ for (const auto &peer: get_acting_recovery_backfill()) {
+ if (peer == pg_whoami) {
+ continue;
+ }
+ ceph_assert(peering_state.get_peer_missing().count(peer));
+ ceph_assert(peering_state.has_peer_info(peer));
+ auto log_m = crimson::make_message<MOSDPGUpdateLogMissing>(
+ log_entries,
+ spg_t(peering_state.get_info().pgid.pgid, peer.shard),
+ pg_whoami.shard,
+ get_osdmap_epoch(),
+ get_last_peering_reset(),
+ rep_tid,
+ peering_state.get_pg_trim_to(),
+ peering_state.get_pg_committed_to());
+ waiting_on.insert(peer);
+
+ DEBUGDPP("sending log missing_request (rep_tid: {} entries: {}) to osd {}",
+ *this, rep_tid, log_entries, peer.osd);
+ co_await interruptor::make_interruptible(
+ shard_services.send_to_osd(
+ peer.osd,
+ std::move(log_m),
+ get_osdmap_epoch()));
+ }
+ waiting_on.insert(pg_whoami);
+ DEBUGDPP("inserting rep_tid {}", *this, rep_tid);
+ log_entry_update_waiting_on.insert(
+ std::make_pair(rep_tid,
+ log_update_t{std::move(waiting_on)}));
+ co_await interruptor::make_interruptible(
+ shard_services.get_store().do_transaction(
+ get_collection_ref(), std::move(t)
+ ));
+
+ peering_state.update_trim_to();
+ co_return projected_last_update;
}
PG::run_executer_fut PG::run_executer(
@@ -1132,25 +1134,25 @@ PG::submit_executer_fut PG::submit_executer(
OpsExecuter &&ox,
const std::vector<OSDOp>& ops) {
LOG_PREFIX(PG::submit_executer);
- // transaction must commit at this point
- return std::move(
+ DEBUGDPP("", *this);
+
+ // we need to build the pg log entries and submit the transaction
+ // atomically to ensure log ordering
+ co_await interruptor::make_interruptible(submit_lock.lock());
+ auto unlocker = seastar::defer([this] {
+ submit_lock.unlock();
+ });
+
+ auto [submitted, completed] = co_await std::move(
ox
- ).flush_changes_n_do_ops_effects(
+ ).flush_changes_and_submit(
ops,
snap_mapper,
- osdriver,
- [FNAME, this](auto&& txn,
- auto&& obc,
- auto&& osd_op_p,
- auto&& log_entries) {
- DEBUGDPP("object {} submitting txn", *this, obc->get_oid());
- mutate_object(obc, txn, osd_op_p);
- return submit_transaction(
- std::move(obc),
- std::move(txn),
- std::move(osd_op_p),
- std::move(log_entries));
- });
+ osdriver
+ );
+ co_return std::make_tuple(
+ std::move(submitted).then_interruptible([unlocker=std::move(unlocker)] {}),
+ std::move(completed));
}
PG::interruptible_future<MURef<MOSDOpReply>> PG::do_pg_ops(Ref<MOSDOp> m)
@@ -1215,31 +1217,6 @@ void PG::check_blocklisted_obc_watchers(
}
}
-PG::load_obc_iertr::future<>
-PG::with_locked_obc(const hobject_t &hobj,
- const OpInfo &op_info,
- with_obc_func_t &&f)
-{
- if (__builtin_expect(stopping, false)) {
- throw crimson::common::system_shutdown_exception();
- }
- const hobject_t oid = get_oid(hobj);
- auto wrapper = [f=std::move(f), this](auto head, auto obc) {
- check_blocklisted_obc_watchers(obc);
- return f(head, obc);
- };
- switch (get_lock_type(op_info)) {
- case RWState::RWREAD:
- return obc_loader.with_obc<RWState::RWREAD>(oid, std::move(wrapper));
- case RWState::RWWRITE:
- return obc_loader.with_obc<RWState::RWWRITE>(oid, std::move(wrapper));
- case RWState::RWEXCL:
- return obc_loader.with_obc<RWState::RWEXCL>(oid, std::move(wrapper));
- default:
- ceph_abort();
- };
-}
-
void PG::update_stats(const pg_stat_t &stat) {
peering_state.update_stats(
[&stat] (auto& history, auto& stats) {
@@ -1249,13 +1226,10 @@ void PG::update_stats(const pg_stat_t &stat) {
);
}
-PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
+PG::handle_rep_op_fut PG::handle_rep_op(Ref<MOSDRepOp> req)
{
LOG_PREFIX(PG::handle_rep_op);
DEBUGDPP("{}", *this, *req);
- if (can_discard_replica_op(*req)) {
- co_return;
- }
ceph::os::Transaction txn;
auto encoded_txn = req->get_data().cbegin();
@@ -1272,12 +1246,13 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
log_operation(std::move(log_entries),
req->pg_trim_to,
req->version,
- req->min_last_complete_ondisk,
+ req->pg_committed_to,
!txn.empty(),
txn,
false);
DEBUGDPP("{} do_transaction", *this, *req);
- co_await interruptor::make_interruptible(
+
+ auto commit_fut = interruptor::make_interruptible(
shard_services.get_store().do_transaction(coll_ref, std::move(txn))
);
@@ -1288,10 +1263,7 @@ PG::interruptible_future<> PG::handle_rep_op(Ref<MOSDRepOp> req)
req.get(), pg_whoami, 0,
map_epoch, req->get_min_epoch(), CEPH_OSD_FLAG_ONDISK);
reply->set_last_complete_ondisk(lcod);
- co_await interruptor::make_interruptible(
- shard_services.send_to_osd(req->from.osd, std::move(reply), map_epoch)
- );
- co_return;
+ co_return handle_rep_op_ret(std::move(commit_fut), std::move(reply));
}
PG::interruptible_future<> PG::update_snap_map(
@@ -1318,28 +1290,25 @@ void PG::log_operation(
std::vector<pg_log_entry_t>&& logv,
const eversion_t &trim_to,
const eversion_t &roll_forward_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
bool transaction_applied,
ObjectStore::Transaction &txn,
bool async) {
- logger().debug("{}", __func__);
+ LOG_PREFIX(PG::log_operation);
+ DEBUGDPP("", *this);
if (is_primary()) {
- ceph_assert(trim_to <= peering_state.get_last_update_ondisk());
+ ceph_assert(trim_to <= peering_state.get_pg_committed_to());
}
- /* TODO: when we add snap mapper and projected log support,
- * we'll likely want to update them here.
- *
- * See src/osd/PrimaryLogPG.h:log_operation for how classic
- * handles these cases.
- */
-#if 0
auto last = logv.rbegin();
if (is_primary() && last != logv.rend()) {
+ DEBUGDPP("on primary, trimming projected log", *this);
projected_log.skip_can_rollback_to_to_head();
- projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
+ projected_log.trim(shard_services.get_cct(), last->version,
+ nullptr, nullptr, nullptr);
}
-#endif
+
if (!is_primary()) { // && !is_ec_pg()
+ DEBUGDPP("on replica, clearing obc", *this);
replica_clear_repop_obc(logv);
}
if (!logv.empty()) {
@@ -1348,7 +1317,7 @@ void PG::log_operation(
peering_state.append_log(std::move(logv),
trim_to,
roll_forward_to,
- min_last_complete_ondisk,
+ pg_committed_to,
txn,
!txn.empty(),
false);
@@ -1356,13 +1325,13 @@ void PG::log_operation(
+/// For every entry in `logv`, clears the obc_registry range running from the
+/// entry object's boundary to its head.
void PG::replica_clear_repop_obc(
const std::vector<pg_log_entry_t> &logv) {
- logger().debug("{} clearing {} entries", __func__, logv.size());
- for (auto &&e: logv) {
- logger().debug(" {} get_object_boundary(from): {} "
- " head version(to): {}",
- e.soid,
- e.soid.get_object_boundary(),
- e.soid.get_head());
+ LOG_PREFIX(PG::replica_clear_repop_obc);
+ // DEBUGDPP takes the prefix provider (*this) directly after the format
+ // string; the values for the placeholders come after it.
+ DEBUGDPP("clearing obc for {} log entries", *this, logv.size());
+ for (auto &&e: logv) {
+ DEBUGDPP("clearing entry for {} from: {} to: {}",
+ *this,
+ e.soid,
+ e.soid.get_object_boundary(),
+ e.soid.get_head());
/* Have to blast all clones, they share a snapset */
obc_registry.clear_range(
e.soid.get_object_boundary(), e.soid.get_head());
@@ -1387,17 +1356,17 @@ PG::interruptible_future<> PG::do_update_log_missing(
ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
ObjectStore::Transaction t;
- std::optional<eversion_t> op_trim_to, op_roll_forward_to;
+ std::optional<eversion_t> op_trim_to, op_pg_committed_to;
if (m->pg_trim_to != eversion_t())
op_trim_to = m->pg_trim_to;
- if (m->pg_roll_forward_to != eversion_t())
- op_roll_forward_to = m->pg_roll_forward_to;
- logger().debug("op_trim_to = {}, op_roll_forward_to = {}",
+ if (m->pg_committed_to != eversion_t())
+ op_pg_committed_to = m->pg_committed_to;
+ logger().debug("op_trim_to = {}, op_pg_committed_to = {}",
op_trim_to.has_value() ? *op_trim_to : eversion_t(),
- op_roll_forward_to.has_value() ? *op_roll_forward_to : eversion_t());
+ op_pg_committed_to.has_value() ? *op_pg_committed_to : eversion_t());
peering_state.append_log_entries_update_missing(
- m->entries, t, op_trim_to, op_roll_forward_to);
+ m->entries, t, op_trim_to, op_pg_committed_to);
return interruptor::make_interruptible(shard_services.get_store().do_transaction(
coll_ref, std::move(t))).then_interruptible(
@@ -1615,14 +1584,21 @@ bool PG::should_send_op(
return true;
bool should_send =
(hoid.pool != (int64_t)get_info().pgid.pool() ||
- (has_backfill_state() && hoid <= get_last_backfill_started()) ||
- hoid <= peering_state.get_peer_info(peer).last_backfill);
+ // An object has been fully pushed to the backfill target if and only if
+ // either of the following conditions is met:
+ // 1. peer_info.last_backfill has passed "hoid"
+ // 2. last_backfill_started has passed "hoid" and "hoid" is not in the peer
+ // missing set
+ hoid <= peering_state.get_peer_info(peer).last_backfill ||
+ (has_backfill_state() && hoid <= get_last_backfill_started() &&
+ !is_missing_on_peer(peer, hoid)));
if (!should_send) {
ceph_assert(is_backfill_target(peer));
logger().debug("{} issue_repop shipping empty opt to osd."
"{}, object {} beyond std::max(last_backfill_started, "
"peer_info[peer].last_backfill {})",
- peer, hoid, peering_state.get_peer_info(peer).last_backfill);
+ __func__, peer, hoid,
+ peering_state.get_peer_info(peer).last_backfill);
}
return should_send;
// TODO: should consider async recovery cases in the future which are not supported
@@ -1637,8 +1613,8 @@ PG::already_complete(const osd_reqid_t& reqid)
int ret;
std::vector<pg_log_op_return_item_t> op_returns;
- if (peering_state.get_pg_log().get_log().get_request(
- reqid, &version, &user_version, &ret, &op_returns)) {
+ if (check_in_progress_op(
+ reqid, &version, &user_version, &ret, &op_returns)) {
complete_op_t dupinfo{
user_version,
version,
@@ -1703,4 +1679,19 @@ void PG::C_PG_FinishRecovery::finish(int r) {
DEBUGDPP("stale recovery finsher", pg);
}
}
+/// Looks up `reqid`, first in projected_log and then, only if that lookup
+/// fails, in the pg log held by peering_state. The out-pointers are passed
+/// through unchanged to whichever get_request() call is made.
+///
+/// @return true if either log reports the request.
+bool PG::check_in_progress_op(
+  const osd_reqid_t& reqid,
+  eversion_t *version,
+  version_t *user_version,
+  int *return_code,
+  std::vector<pg_log_op_return_item_t> *op_returns
+  ) const
+{
+  if (projected_log.get_request(
+        reqid, version, user_version, return_code, op_returns)) {
+    return true;
+  }
+  const auto &pg_log = peering_state.get_pg_log().get_log();
+  return pg_log.get_request(
+    reqid, version, user_version, return_code, op_returns);
+}
+
}
diff --git a/src/crimson/osd/pg.h b/src/crimson/osd/pg.h
index 604f49005ff..06038c0aa00 100644
--- a/src/crimson/osd/pg.h
+++ b/src/crimson/osd/pg.h
@@ -10,6 +10,7 @@
#include <seastar/core/shared_future.hh>
#include "common/dout.h"
+#include "common/ostream_temp.h"
#include "include/interval_set.h"
#include "crimson/net/Fwd.h"
#include "messages/MOSDRepOpReply.h"
@@ -45,6 +46,7 @@
class MQuery;
class OSDMap;
class PGBackend;
+class ReplicatedBackend;
class PGPeeringEvent;
class osd_op_params_t;
@@ -76,7 +78,8 @@ class PG : public boost::intrusive_ref_counter<
using ec_profile_t = std::map<std::string,std::string>;
using cached_map_t = OSDMapService::cached_map_t;
- ClientRequest::PGPipeline request_pg_pipeline;
+ CommonPGPipeline request_pg_pipeline;
+ PGRepopPipeline repop_pipeline;
PGPeeringPipeline peering_request_pg_pipeline;
ClientRequest::Orderer client_request_orderer;
@@ -129,8 +132,8 @@ public:
return peering_state.get_pg_trim_to();
}
- eversion_t get_min_last_complete_ondisk() const {
- return peering_state.get_min_last_complete_ondisk();
+ /// Returns the value reported by peering_state.get_pg_committed_to().
+ eversion_t get_pg_committed_to() const {
+ return peering_state.get_pg_committed_to();
}
const pg_info_t& get_info() const final {
@@ -376,6 +379,7 @@ public:
void check_blocklisted_watchers() final;
+ /// Drops the recovery finisher and replaces projected_log with a
+ /// default-constructed PGLog::IndexedLog.
void clear_primary_state() final {
recovery_finisher = nullptr;
+ projected_log = PGLog::IndexedLog();
}
void queue_check_readable(epoch_t last_peering_reset,
@@ -517,6 +521,9 @@ public:
// Utility
+ /// True when peering_state reports the PG as active.
+ bool is_active() const {
+ return peering_state.is_active();
+ }
bool is_active_clean() const {
return peering_state.is_active() && peering_state.is_clean();
}
@@ -589,12 +596,13 @@ public:
using with_obc_func_t =
std::function<load_obc_iertr::future<> (ObjectContextRef, ObjectContextRef)>;
- load_obc_iertr::future<> with_locked_obc(
- const hobject_t &hobj,
- const OpInfo &op_info,
- with_obc_func_t&& f);
-
- interruptible_future<> handle_rep_op(Ref<MOSDRepOp> m);
+ using handle_rep_op_ret = std::tuple<
+ interruptible_future<>, // resolves upon commit
+ MURef<MOSDRepOpReply> // reply message
+ >;
+ // outer future resolves upon submission
+ using handle_rep_op_fut = interruptible_future<handle_rep_op_ret>;
+ handle_rep_op_fut handle_rep_op(Ref<MOSDRepOp> m);
void update_stats(const pg_stat_t &stat);
interruptible_future<> update_snap_map(
const std::vector<pg_log_entry_t> &log_entries,
@@ -603,7 +611,7 @@ public:
std::vector<pg_log_entry_t>&& logv,
const eversion_t &trim_to,
const eversion_t &roll_forward_to,
- const eversion_t &min_last_complete_ondisk,
+ const eversion_t &pg_committed_to,
bool transaction_applied,
ObjectStore::Transaction &txn,
bool async = false);
@@ -663,6 +671,7 @@ private:
const OpInfo &op_info,
std::vector<OSDOp>& ops);
+ seastar::shared_mutex submit_lock;
using submit_executer_ret = std::tuple<
interruptible_future<>,
interruptible_future<>>;
@@ -675,13 +684,18 @@ private:
struct do_osd_ops_params_t;
interruptible_future<MURef<MOSDOpReply>> do_pg_ops(Ref<MOSDOp> m);
+
+public:
interruptible_future<
std::tuple<interruptible_future<>, interruptible_future<>>>
submit_transaction(
ObjectContextRef&& obc,
+ ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& oop,
std::vector<pg_log_entry_t>&& log_entries);
+
+private:
interruptible_future<> repair_object(
const hobject_t& oid,
eversion_t& v);
@@ -826,8 +840,15 @@ public:
const eversion_t version;
const int err;
};
+ PGLog::IndexedLog projected_log;
interruptible_future<std::optional<complete_op_t>>
already_complete(const osd_reqid_t& reqid);
+ bool check_in_progress_op(
+ const osd_reqid_t& reqid,
+ eversion_t *version,
+ version_t *user_version,
+ int *return_code,
+ std::vector<pg_log_op_return_item_t> *op_returns) const;
int get_recovery_op_priority() const {
int64_t pri = 0;
get_pgpool().info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
@@ -879,15 +900,20 @@ private:
friend class SnapTrimObjSubEvent;
private:
- void mutate_object(
- ObjectContextRef& obc,
- ceph::os::Transaction& txn,
- osd_op_params_t& osd_op_p);
+ void enqueue_push_for_backfill(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers);
+ void enqueue_delete_for_backfill(
+ const hobject_t &obj,
+ const eversion_t &v,
+ const std::vector<pg_shard_t> &peers);
+
bool can_discard_replica_op(const Message& m, epoch_t m_map_epoch) const;
bool can_discard_op(const MOSDOp& m) const;
void context_registry_on_change();
+ /// True when the set returned by get_local_missing() reports `soid` as
+ /// missing.
bool is_missing_object(const hobject_t& soid) const {
- return peering_state.get_pg_log().get_missing().get_items().count(soid);
+ return get_local_missing().is_missing(soid);
}
bool is_unreadable_object(const hobject_t &oid,
eversion_t* v = 0) const final {
@@ -895,6 +921,11 @@ private:
!peering_state.get_missing_loc().readable_with_acting(
oid, get_actingset(), v);
}
+ /// True when the missing set peering_state keeps for `peer` reports
+ /// `soid` as missing.
+ bool is_missing_on_peer(
+   const pg_shard_t &peer,
+   const hobject_t &soid) const {
+   const auto &peer_missing = peering_state.get_peer_missing(peer);
+   return peer_missing.is_missing(soid);
+ }
bool is_degraded_or_backfilling_object(const hobject_t& soid) const;
const std::set<pg_shard_t> &get_actingset() const {
return peering_state.get_actingset();
@@ -902,6 +933,7 @@ private:
private:
friend class IOInterruptCondition;
+ friend class ::ReplicatedBackend;
struct log_update_t {
std::set<pg_shard_t> waiting_on;
seastar::shared_promise<> all_committed;
diff --git a/src/crimson/osd/pg_backend.cc b/src/crimson/osd/pg_backend.cc
index 24a381b4cf7..79895de06de 100644
--- a/src/crimson/osd/pg_backend.cc
+++ b/src/crimson/osd/pg_backend.cc
@@ -1283,22 +1283,6 @@ PGBackend::rm_xattr(
return rm_xattr_iertr::now();
}
-void PGBackend::clone(
- /* const */object_info_t& snap_oi,
- const ObjectState& os,
- const ObjectState& d_os,
- ceph::os::Transaction& txn)
-{
- // See OpsExecuter::execute_clone documentation
- txn.clone(coll->get_cid(), ghobject_t{os.oi.soid}, ghobject_t{d_os.oi.soid});
- {
- ceph::bufferlist bv;
- snap_oi.encode_no_oid(bv, CEPH_FEATURES_ALL);
- txn.setattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, OI_ATTR, bv);
- }
- txn.rmattr(coll->get_cid(), ghobject_t{d_os.oi.soid}, SS_ATTR);
-}
-
using get_omap_ertr =
crimson::os::FuturizedStore::Shard::read_errorator::extend<
crimson::ct_error::enodata>;
@@ -1341,9 +1325,10 @@ maybe_get_omap_vals(
PGBackend::ll_read_ierrorator::future<ceph::bufferlist>
PGBackend::omap_get_header(
const crimson::os::CollectionRef& c,
- const ghobject_t& oid) const
+ const ghobject_t& oid,
+ uint32_t op_flags) const
{
- return store->omap_get_header(c, oid)
+ return store->omap_get_header(c, oid, op_flags)
.handle_error(
crimson::ct_error::enodata::handle([] {
return seastar::make_ready_future<bufferlist>();
@@ -1356,10 +1341,13 @@ PGBackend::ll_read_ierrorator::future<>
PGBackend::omap_get_header(
const ObjectState& os,
OSDOp& osd_op,
- object_stat_sum_t& delta_stats) const
+ object_stat_sum_t& delta_stats,
+ uint32_t op_flags) const
{
if (os.oi.is_omap()) {
- return omap_get_header(coll, ghobject_t{os.oi.soid}).safe_then_interruptible(
+ return omap_get_header(
+ coll, ghobject_t{os.oi.soid}, CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
+ ).safe_then_interruptible(
[&delta_stats, &osd_op] (ceph::bufferlist&& header) {
osd_op.outdata = std::move(header);
delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
@@ -1723,7 +1711,8 @@ PGBackend::fiemap(
CollectionRef c,
const ghobject_t& oid,
uint64_t off,
- uint64_t len)
+ uint64_t len,
+ uint32_t op_flags)
{
+ // NOTE(review): op_flags is accepted by this function but is not passed
+ // on to store->fiemap() below -- confirm whether it should be forwarded.
return store->fiemap(c, oid, off, len);
}
@@ -1835,3 +1824,32 @@ PGBackend::read_ierrorator::future<> PGBackend::tmapget(
read_errorator::pass_further{});
}
+/// Queues on `txn` the OI_ATTR attribute for `obj` (encoded from `oi`) and,
+/// when `ss` is supplied, the SS_ATTR attribute (encoded from `*ss`).
+///
+/// `ss` has to be non-null exactly when `obj` is a head object; the
+/// ceph_assert below enforces that.
+void PGBackend::set_metadata(
+  const hobject_t &obj,
+  object_info_t &oi,
+  const SnapSet *ss /* non-null iff head */,
+  ceph::os::Transaction& txn)
+{
+  ceph_assert((obj.is_head() && ss) || (!obj.is_head() && !ss));
+
+  ceph::bufferlist encoded_oi;
+  oi.encode_no_oid(encoded_oi, CEPH_FEATURES_ALL);
+  txn.setattr(coll->get_cid(), ghobject_t{obj}, OI_ATTR, encoded_oi);
+
+  if (!ss) {
+    // no snapset supplied, so only the object info is written
+    return;
+  }
+  ceph::bufferlist encoded_ss;
+  encode(*ss, encoded_ss);
+  txn.setattr(coll->get_cid(), ghobject_t{obj}, SS_ATTR, encoded_ss);
+}
+
+/// Queues on `txn` a clone of `from` into `to`, followed by removal of the
+/// SS_ATTR attribute from `to`.
+void PGBackend::clone_for_write(
+  const hobject_t &from,
+  const hobject_t &to,
+  ceph::os::Transaction &txn)
+{
+  // See OpsExecuter::execute_clone documentation
+  const ghobject_t source{from};
+  const ghobject_t target{to};
+  txn.clone(coll->get_cid(), source, target);
+  txn.rmattr(coll->get_cid(), target, SS_ATTR);
+}
+
diff --git a/src/crimson/osd/pg_backend.h b/src/crimson/osd/pg_backend.h
index fa1f1405ffe..9c2230375b0 100644
--- a/src/crimson/osd/pg_backend.h
+++ b/src/crimson/osd/pg_backend.h
@@ -308,11 +308,6 @@ public:
ObjectState& os,
const OSDOp& osd_op,
ceph::os::Transaction& trans);
- void clone(
- /* const */object_info_t& snap_oi,
- const ObjectState& os,
- const ObjectState& d_os,
- ceph::os::Transaction& trans);
interruptible_future<struct stat> stat(
CollectionRef c,
const ghobject_t& oid) const;
@@ -320,7 +315,8 @@ public:
CollectionRef c,
const ghobject_t& oid,
uint64_t off,
- uint64_t len);
+ uint64_t len,
+ uint32_t op_flags = 0);
write_iertr::future<> tmapput(
ObjectState& os,
@@ -380,11 +376,13 @@ public:
object_stat_sum_t& delta_stats);
ll_read_ierrorator::future<ceph::bufferlist> omap_get_header(
const crimson::os::CollectionRef& c,
- const ghobject_t& oid) const;
+ const ghobject_t& oid,
+ uint32_t op_flags = 0) const;
ll_read_ierrorator::future<> omap_get_header(
const ObjectState& os,
OSDOp& osd_op,
- object_stat_sum_t& delta_stats) const;
+ object_stat_sum_t& delta_stats,
+ uint32_t op_flags = 0) const;
interruptible_future<> omap_set_header(
ObjectState& os,
const OSDOp& osd_op,
@@ -411,9 +409,24 @@ public:
ceph::os::Transaction& trans,
osd_op_params_t& osd_op_params,
object_stat_sum_t& delta_stats);
+
+ /// sets oi and (for head) ss attrs
+ void set_metadata(
+ const hobject_t &obj,
+ object_info_t &oi,
+ const SnapSet *ss /* non-null iff head */,
+ ceph::os::Transaction& trans);
+
+ /// clone from->to and clear ss attribute on to
+ void clone_for_write(
+ const hobject_t &from,
+ const hobject_t &to,
+ ceph::os::Transaction& trans);
+
virtual rep_op_fut_t
submit_transaction(const std::set<pg_shard_t> &pg_shards,
const hobject_t& hoid,
+ crimson::osd::ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& osd_op_p,
epoch_t min_epoch, epoch_t max_epoch,
diff --git a/src/crimson/osd/pg_recovery.cc b/src/crimson/osd/pg_recovery.cc
index ec3af0d2b00..5eef584c776 100644
--- a/src/crimson/osd/pg_recovery.cc
+++ b/src/crimson/osd/pg_recovery.cc
@@ -67,8 +67,6 @@ PGRecovery::start_recovery_ops(
if (max_to_start > 0) {
max_to_start -= start_replica_recovery_ops(trigger, max_to_start, &started);
}
- using interruptor =
- crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>;
return interruptor::parallel_for_each(started,
[] (auto&& ifut) {
return std::move(ifut);
@@ -609,8 +607,21 @@ void PGRecovery::update_peers_last_backfill(
+/// Asks the shard services for a throttle slot in the
+/// background_best_effort scheduler class.
+///
+/// @return true when try_acquire_throttle_now() yields no future; false
+///         when it yields one, in which case a ThrottleAcquired event is
+///         delivered to backfill_state once that future resolves.
bool PGRecovery::budget_available() const
{
- // TODO: the limits!
- return true;
+ crimson::osd::scheduler::params_t params =
+ {1, 0, crimson::osd::scheduler::scheduler_class_t::background_best_effort};
+ auto &ss = pg->get_shard_services();
+ auto futopt = ss.try_acquire_throttle_now(std::move(params));
+ // NOTE(review): an empty optional presumably means the throttle was
+ // granted without waiting -- confirm against try_acquire_throttle_now().
+ if (!futopt) {
+ return true;
+ }
+ // The continuation is not awaited here; its future is discarded through
+ // std::ignore and the caller is told that no budget is available yet.
+ std::ignore = interruptor::make_interruptible(std::move(*futopt)
+ ).then_interruptible([this] {
+ assert(!backfill_state->is_triggered());
+ using BackfillState = crimson::osd::BackfillState;
+ backfill_state->process_event(
+ BackfillState::ThrottleAcquired{}.intrusive_from_this());
+ });
+ return false;
}
void PGRecovery::on_pg_clean()
diff --git a/src/crimson/osd/pg_recovery.h b/src/crimson/osd/pg_recovery.h
index 705b3176b97..5c7b5c5ef2b 100644
--- a/src/crimson/osd/pg_recovery.h
+++ b/src/crimson/osd/pg_recovery.h
@@ -25,6 +25,8 @@ class PGBackend;
class PGRecovery : public crimson::osd::BackfillState::BackfillListener {
public:
+ using interruptor =
+ crimson::interruptible::interruptor<crimson::osd::IOInterruptCondition>;
template <typename T = void>
using interruptible_future = RecoveryBackend::interruptible_future<T>;
PGRecovery(PGRecoveryListener* pg) : pg(pg) {}
@@ -45,6 +47,10 @@ public:
seastar::future<> stop() { return seastar::now(); }
void on_pg_clean();
+ void enqueue_push(
+ const hobject_t& obj,
+ const eversion_t& v,
+ const std::vector<pg_shard_t> &peers) final;
private:
PGRecoveryListener* pg;
size_t start_primary_recovery_ops(
@@ -108,10 +114,6 @@ private:
const hobject_t& end) final;
void request_primary_scan(
const hobject_t& begin) final;
- void enqueue_push(
- const hobject_t& obj,
- const eversion_t& v,
- const std::vector<pg_shard_t> &peers) final;
void enqueue_drop(
const pg_shard_t& target,
const hobject_t& obj,
diff --git a/src/crimson/osd/pg_shard_manager.h b/src/crimson/osd/pg_shard_manager.h
index b9879c8c9dd..f7bd7a6c08e 100644
--- a/src/crimson/osd/pg_shard_manager.h
+++ b/src/crimson/osd/pg_shard_manager.h
@@ -256,18 +256,40 @@ public:
auto &opref = *op;
return opref.template with_blocking_event<
PGMap::PGCreationBlockingEvent
- >([&target_shard_services, &opref](auto &&trigger) {
- return target_shard_services.wait_for_pg(
- std::move(trigger), opref.get_pgid());
- }).safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) {
- logger.debug("{}: have_pg", opref);
- return opref.with_pg(target_shard_services, pgref);
- }).handle_error(
- crimson::ct_error::ecanceled::handle([&logger, &opref](auto) {
- logger.debug("{}: pg creation canceled, dropping", opref);
- return seastar::now();
- })
- ).then([op=std::move(op)] {});
+ >([&target_shard_services, &opref, &logger](auto &&trigger) mutable {
+ auto pg = target_shard_services.get_pg(opref.get_pgid());
+ auto fut = ShardServices::wait_for_pg_ertr::make_ready_future<Ref<PG>>(pg);
+ if (!pg) {
+ if (opref.requires_pg()) {
+ auto osdmap = target_shard_services.get_map();
+ if (!osdmap->is_up_acting_osd_shard(
+ opref.get_pgid(), target_shard_services.local_state.whoami)) {
+ logger.debug(
+ "pg {} for {} is no longer here, discarding",
+ opref.get_pgid(), opref);
+ opref.get_handle().exit();
+ auto _fut = seastar::now();
+ if (osdmap->get_epoch() > opref.get_epoch_sent_at()) {
+ _fut = target_shard_services.send_incremental_map(
+ std::ref(opref.get_foreign_connection()),
+ opref.get_epoch_sent_at() + 1);
+ }
+ return _fut;
+ }
+ }
+ fut = target_shard_services.wait_for_pg(
+ std::move(trigger), opref.get_pgid());
+ }
+ return fut.safe_then([&logger, &target_shard_services, &opref](Ref<PG> pgref) {
+ logger.debug("{}: have_pg", opref);
+ return opref.with_pg(target_shard_services, pgref);
+ }).handle_error(
+ crimson::ct_error::ecanceled::handle([&logger, &opref](auto) {
+ logger.debug("{}: pg creation canceled, dropping", opref);
+ return seastar::now();
+ })
+ );
+ }).then([op=std::move(op)] {});
}
seastar::future<> load_pgs(crimson::os::FuturizedStore& store);
diff --git a/src/crimson/osd/replicated_backend.cc b/src/crimson/osd/replicated_backend.cc
index cbb8c883e07..6c8abecffaf 100644
--- a/src/crimson/osd/replicated_backend.cc
+++ b/src/crimson/osd/replicated_backend.cc
@@ -36,19 +36,59 @@ ReplicatedBackend::_read(const hobject_t& hoid,
return store->read(coll, ghobject_t{hoid}, off, len, flags);
}
+/// Builds the MOSDRepOp addressed to `pg_shard` for the operation on `hoid`.
+///
+/// @param pg_shard    destination shard; asserted to differ from whoami
+/// @param encoded_txn already-encoded transaction, used when `send_op` is set
+/// @param osd_op_p    source of req_id, at_version, pg_trim_to and
+///                    pg_committed_to
+/// @param log_entries encoded into the message's logbl
+/// @param send_op     true: the message carries `encoded_txn`; false: it
+///                    carries an encoded default-constructed transaction
+/// @return the populated message
+MURef<MOSDRepOp> ReplicatedBackend::new_repop_msg(
+  const pg_shard_t &pg_shard,
+  const hobject_t &hoid,
+  const bufferlist &encoded_txn,
+  const osd_op_params_t &osd_op_p,
+  epoch_t min_epoch,
+  epoch_t map_epoch,
+  const std::vector<pg_log_entry_t> &log_entries,
+  bool send_op,
+  ceph_tid_t tid)
+{
+  ceph_assert(pg_shard != whoami);
+  auto msg = crimson::make_message<MOSDRepOp>(
+    osd_op_p.req_id,
+    whoami,
+    spg_t{pgid, pg_shard.shard},
+    hoid,
+    CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
+    map_epoch,
+    min_epoch,
+    tid,
+    osd_op_p.at_version);
+
+  if (!send_op) {
+    // ship an empty transaction in place of the real one
+    ceph::os::Transaction empty_txn;
+    bufferlist empty_payload;
+    encode(empty_txn, empty_payload);
+    msg->set_data(empty_payload);
+  } else {
+    msg->set_data(encoded_txn);
+  }
+
+  encode(log_entries, msg->logbl);
+  msg->pg_trim_to = osd_op_p.pg_trim_to;
+  msg->pg_committed_to = osd_op_p.pg_committed_to;
+  msg->pg_stats = pg.get_info().stats;
+  return msg;
+}
+
ReplicatedBackend::rep_op_fut_t
-ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
- const hobject_t& hoid,
- ceph::os::Transaction&& t,
- osd_op_params_t&& opp,
- epoch_t min_epoch, epoch_t map_epoch,
- std::vector<pg_log_entry_t>&& logv)
+ReplicatedBackend::submit_transaction( // sends a MOSDRepOp to every other shard, logs the op, and queues backfill work for peers still missing hoid
+ const std::set<pg_shard_t> &pg_shards,
+ const hobject_t& hoid,
+ crimson::osd::ObjectContextRef &&new_clone, // may be null; tested before every use below
+ ceph::os::Transaction&& t,
+ osd_op_params_t&& opp,
+ epoch_t min_epoch, epoch_t map_epoch,
+ std::vector<pg_log_entry_t>&& logv)
{
LOG_PREFIX(ReplicatedBackend::submit_transaction);
DEBUGDPP("object {}", dpp, hoid);
auto log_entries = std::move(logv);
auto txn = std::move(t);
auto osd_op_p = std::move(opp);
+ auto _new_clone = std::move(new_clone); // local owner, later copied into the completion continuation
const ceph_tid_t tid = shard_services.get_tid();
auto pending_txn =
@@ -56,50 +96,57 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
bufferlist encoded_txn;
encode(txn, encoded_txn);
+ bool is_delete = false; // becomes true if any log entry of this op is a delete
for (auto &le : log_entries) {
le.mark_unrollbackable();
+ if (le.is_delete()) {
+ is_delete = true;
+ }
}
+ co_await pg.update_snap_map(log_entries, txn); // now awaited before the send loop (previously after it)
+
+ std::vector<pg_shard_t> to_push_clone; // peers still missing hoid that must also be pushed the new clone
+ std::vector<pg_shard_t> to_push_delete; // peers still missing hoid while this op deletes it
auto sends = std::make_unique<std::vector<seastar::future<>>>();
- for (auto pg_shard : pg_shards) {
- if (pg_shard != whoami) {
- auto m = crimson::make_message<MOSDRepOp>(
- osd_op_p.req_id,
- whoami,
- spg_t{pgid, pg_shard.shard},
- hoid,
- CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
- map_epoch,
- min_epoch,
- tid,
- osd_op_p.at_version);
- if (pg.should_send_op(pg_shard, hoid)) {
- m->set_data(encoded_txn);
- } else {
- ceph::os::Transaction t;
- bufferlist bl;
- encode(t, bl);
- m->set_data(bl);
+ for (auto &pg_shard : pg_shards) {
+ if (pg_shard == whoami) { // no replica message to ourselves
+ continue;
+ }
+ MURef<MOSDRepOp> m;
+ if (pg.should_send_op(pg_shard, hoid)) {
+ m = new_repop_msg(
+ pg_shard, hoid, encoded_txn, osd_op_p,
+ min_epoch, map_epoch, log_entries, true, tid);
+ } else { // send_op=false: the message carries an empty transaction instead of encoded_txn
+ m = new_repop_msg(
+ pg_shard, hoid, encoded_txn, osd_op_p,
+ min_epoch, map_epoch, log_entries, false, tid);
+ if (pg.is_missing_on_peer(pg_shard, hoid)) {
+ if (_new_clone) {
+ // The head is in the push queue but hasn't been pushed yet.
+ // We need to ensure that the newly created clone will be
+ // pushed as well, otherwise we might skip it.
+ // See: https://tracker.ceph.com/issues/68808
+ to_push_clone.push_back(pg_shard);
+ }
+ if (is_delete) { // remembered so enqueue_delete_for_backfill covers this peer once the op completes
+ to_push_delete.push_back(pg_shard);
+ }
}
- pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
- encode(log_entries, m->logbl);
- m->pg_trim_to = osd_op_p.pg_trim_to;
- m->min_last_complete_ondisk = osd_op_p.min_last_complete_ondisk;
- m->pg_stats = pg.get_info().stats;
- // TODO: set more stuff. e.g., pg_states
- sends->emplace_back(
- shard_services.send_to_osd(
- pg_shard.osd, std::move(m), map_epoch));
}
+ pending_txn->second.acked_peers.push_back({pg_shard, eversion_t{}});
+ // TODO: set more stuff. e.g., pg_states
+ sends->emplace_back(
+ shard_services.send_to_osd(
+ pg_shard.osd, std::move(m), map_epoch));
}
- co_await pg.update_snap_map(log_entries, txn);
-
pg.log_operation(
std::move(log_entries),
osd_op_p.pg_trim_to,
osd_op_p.at_version,
- osd_op_p.min_last_complete_ondisk,
+ osd_op_p.pg_committed_to, // replaces min_last_complete_ondisk
true,
txn,
false);
@@ -120,9 +167,20 @@ ReplicatedBackend::submit_transaction(const std::set<pg_shard_t>& pg_shards,
return seastar::now();
}
return peers->all_committed.get_shared_future();
- }).then_interruptible([pending_txn, this] {
+ }).then_interruptible([pending_txn, this, _new_clone, hoid, // hoid by value: this continuation runs after the coroutine has returned, so it must not depend on the caller keeping the referenced hobject_t alive
+ to_push_delete=std::move(to_push_delete),
+ to_push_clone=std::move(to_push_clone)] {
auto acked_peers = std::move(pending_txn->second.acked_peers);
pending_trans.erase(pending_txn);
+ if (_new_clone && !to_push_clone.empty()) { // queue the clone for the peers recorded in the send loop
+ pg.enqueue_push_for_backfill(
+ _new_clone->obs.oi.soid,
+ _new_clone->obs.oi.version,
+ to_push_clone);
+ }
+ if (!to_push_delete.empty()) {
+ pg.enqueue_delete_for_backfill(hoid, {}, to_push_delete);
+ }
return seastar::make_ready_future<
crimson::osd::acked_peers_t>(std::move(acked_peers));
});
diff --git a/src/crimson/osd/replicated_backend.h b/src/crimson/osd/replicated_backend.h
index fb8704d8742..d5844b23a0c 100644
--- a/src/crimson/osd/replicated_backend.h
+++ b/src/crimson/osd/replicated_backend.h
@@ -35,6 +35,7 @@ private:
rep_op_fut_t submit_transaction(
const std::set<pg_shard_t> &pg_shards,
const hobject_t& hoid,
+ crimson::osd::ObjectContextRef&& new_clone,
ceph::os::Transaction&& txn,
osd_op_params_t&& osd_op_p,
epoch_t min_epoch, epoch_t max_epoch,
@@ -60,6 +61,17 @@ private:
pending_transactions_t pending_trans;
crimson::osd::PG& pg;
+ MURef<MOSDRepOp> new_repop_msg( // builds the MOSDRepOp message for one replica shard
+ const pg_shard_t &pg_shard, // destination shard; asserted to differ from whoami
+ const hobject_t &hoid,
+ const bufferlist &encoded_txn, // used as the message data only when send_op is true
+ const osd_op_params_t &osd_op_p, // supplies req_id, at_version, pg_trim_to and pg_committed_to
+ epoch_t min_epoch,
+ epoch_t map_epoch,
+ const std::vector<pg_log_entry_t> &log_entries, // encoded into the message's logbl
+ bool send_op, // false: the message data is an encoded empty Transaction
+ ceph_tid_t tid);
+
seastar::future<> request_committed(
const osd_reqid_t& reqid, const eversion_t& at_version) final;
};
diff --git a/src/crimson/osd/replicated_recovery_backend.cc b/src/crimson/osd/replicated_recovery_backend.cc
index 76f24196b51..0d6c9d38236 100644
--- a/src/crimson/osd/replicated_recovery_backend.cc
+++ b/src/crimson/osd/replicated_recovery_backend.cc
@@ -35,6 +35,15 @@ ReplicatedRecoveryBackend::recover_object(
logger().debug("recover_object: loading obc: {}", soid);
return pg.obc_loader.with_obc<RWState::RWREAD>(soid,
[this, soid, need](auto head, auto obc) {
+ if (!obc->obs.exists) {
+ // XXX: this recovery must be triggered by backfills and the corresponding
+ // object must have been deleted by some client request after the object
+ // is enqueued for push but before the lock is acquired by the recovery.
+ //
+ // Abort the recovery in this case, a "recover_delete" must have been
+ // added for this object by the client request that deleted it.
+ return interruptor::now();
+ }
logger().debug("recover_object: loaded obc: {}", obc->obs.oi.soid);
auto& recovery_waiter = get_recovering(soid);
recovery_waiter.obc = obc;
@@ -306,7 +315,10 @@ ReplicatedRecoveryBackend::recover_delete(
}
return seastar::make_ready_future<>();
}).then_interruptible([this, soid, &stat_diff] {
- pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+ const auto &missing = pg.get_peering_state().get_pg_log().get_missing();
+ if (!missing.is_missing(soid)) {
+ pg.get_recovery_handler()->on_global_recover(soid, stat_diff, true);
+ }
return seastar::make_ready_future<>();
});
});
@@ -568,14 +580,17 @@ ReplicatedRecoveryBackend::read_metadata_for_push_op(
return seastar::make_ready_future<eversion_t>(ver);
}
return interruptor::make_interruptible(interruptor::when_all_succeed(
- backend->omap_get_header(coll, ghobject_t(oid)).handle_error_interruptible<false>(
+ backend->omap_get_header(
+ coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
+ ).handle_error_interruptible<false>(
crimson::os::FuturizedStore::Shard::read_errorator::all_same_way(
[oid] (const std::error_code& e) {
logger().debug("read_metadata_for_push_op, error {} when getting omap header: {}", e, oid);
return seastar::make_ready_future<bufferlist>();
})),
- interruptor::make_interruptible(store->get_attrs(coll, ghobject_t(oid)))
- .handle_error_interruptible<false>(
+ interruptor::make_interruptible(
+ store->get_attrs(coll, ghobject_t(oid), CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+ ).handle_error_interruptible<false>(
crimson::os::FuturizedStore::Shard::get_attrs_ertr::all_same_way(
[oid] (const std::error_code& e) {
logger().debug("read_metadata_for_push_op, error {} when getting attrs: {}", e, oid);
@@ -613,8 +628,14 @@ ReplicatedRecoveryBackend::read_object_for_push_op(
return seastar::make_ready_future<uint64_t>(offset);
}
// 1. get the extents in the interested range
- return interruptor::make_interruptible(backend->fiemap(coll, ghobject_t{oid},
- 0, copy_subset.range_end())).safe_then_interruptible(
+ return interruptor::make_interruptible(
+ backend->fiemap(
+ coll,
+ ghobject_t{oid},
+ 0,
+ copy_subset.range_end(),
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)
+ ).safe_then_interruptible(
[=, this](auto&& fiemap_included) mutable {
interval_set<uint64_t> extents;
try {
@@ -630,8 +651,12 @@ ReplicatedRecoveryBackend::read_object_for_push_op(
push_op->data_included.span_of(extents, offset, max_len);
// 3. read the truncated extents
// TODO: check if the returned extents are pruned
- return interruptor::make_interruptible(store->readv(coll, ghobject_t{oid},
- push_op->data_included, 0));
+ return interruptor::make_interruptible(
+ store->readv(
+ coll,
+ ghobject_t{oid},
+ push_op->data_included,
+ CEPH_OSD_OP_FLAG_FADVISE_DONTNEED));
}).safe_then_interruptible([push_op, range_end=copy_subset.range_end()](auto &&bl) {
push_op->data.claim_append(std::move(bl));
uint64_t recovered_to = 0;
diff --git a/src/crimson/osd/shard_services.cc b/src/crimson/osd/shard_services.cc
index a053d9d5044..e1acb34636f 100644
--- a/src/crimson/osd/shard_services.cc
+++ b/src/crimson/osd/shard_services.cc
@@ -783,6 +783,11 @@ seastar::future<> ShardServices::dispatch_context_transaction(
co_return;
}
+Ref<PG> ShardServices::get_pg(spg_t pgid) // forwards the lookup to local_state
+{
+ return local_state.get_pg(pgid);
+}
+
seastar::future<> ShardServices::dispatch_context_messages(
BufferedRecoveryMessages &&ctx)
{
@@ -802,15 +807,19 @@ seastar::future<> ShardServices::dispatch_context_messages(
seastar::future<> ShardServices::dispatch_context(
crimson::os::CollectionRef col,
- PeeringCtx &&ctx)
-{
- ceph_assert(col || ctx.transaction.empty());
- return seastar::when_all_succeed(
- dispatch_context_messages(
- BufferedRecoveryMessages{ctx}),
- col ? dispatch_context_transaction(col, ctx) : seastar::now()
- ).then_unpack([] {
- return seastar::now();
+ PeeringCtx &&pctx)
+{
+ return seastar::do_with( // do_with owns the moved-in PeeringCtx until the future returned by the lambda resolves
+ std::move(pctx),
+ [this, col](auto &ctx) { // ctx refers to the PeeringCtx held by do_with
+ ceph_assert(col || ctx.transaction.empty()); // a non-empty transaction requires a collection
+ return seastar::when_all_succeed(
+ dispatch_context_messages(
+ BufferedRecoveryMessages{ctx}),
+ col ? dispatch_context_transaction(col, ctx) : seastar::now() // no collection: nothing to submit
+ ).then_unpack([] {
+ return seastar::now();
+ });
});
}
diff --git a/src/crimson/osd/shard_services.h b/src/crimson/osd/shard_services.h
index fb86418aba2..f1ed9b8d911 100644
--- a/src/crimson/osd/shard_services.h
+++ b/src/crimson/osd/shard_services.h
@@ -10,6 +10,7 @@
#include "include/common_fwd.h"
#include "osd_operation.h"
+#include "osd/osd_types_fmt.h"
#include "msg/MessageRef.h"
#include "crimson/common/exception.h"
#include "crimson/common/shared_lru.h"
@@ -482,6 +483,8 @@ public:
return pg_to_shard_mapping.remove_pg_mapping(pgid);
}
+ Ref<PG> get_pg(spg_t pgid); // returns local_state.get_pg(pgid)
+
crimson::common::CephContext *get_cct() {
return &(local_state.cct);
}
@@ -588,6 +591,7 @@ public:
FORWARD_TO_OSD_SINGLETON(get_pool_info)
FORWARD(with_throttle_while, with_throttle_while, local_state.throttler)
+ FORWARD(try_acquire_throttle_now, try_acquire_throttle_now, local_state.throttler)
FORWARD_TO_OSD_SINGLETON(build_incremental_map_msg)
FORWARD_TO_OSD_SINGLETON(send_incremental_map)
diff --git a/src/crimson/tools/perf_crimson_msgr.cc b/src/crimson/tools/perf_crimson_msgr.cc
index e5f56361fff..5623438f821 100644
--- a/src/crimson/tools/perf_crimson_msgr.cc
+++ b/src/crimson/tools/perf_crimson_msgr.cc
@@ -1,6 +1,7 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
+#include <iomanip>
#include <map>
#include <boost/program_options.hpp>
#include <boost/iterator/counting_iterator.hpp>
diff --git a/src/crimson/tools/store_nbd/tm_driver.cc b/src/crimson/tools/store_nbd/tm_driver.cc
index 7af0d996caa..870809c5153 100644
--- a/src/crimson/tools/store_nbd/tm_driver.cc
+++ b/src/crimson/tools/store_nbd/tm_driver.cc
@@ -25,6 +25,7 @@ seastar::future<> TMDriver::write(
return tm->with_transaction_intr(
Transaction::src_t::MUTATE,
"write",
+ CACHE_HINT_TOUCH,
[this, offset, &ptr](auto& t)
{
return tm->remove(t, laddr_t::from_byte_offset(offset)
@@ -82,11 +83,14 @@ TMDriver::read_extents_ret TMDriver::read_extents(
return tm->read_pin<TestBlock>(
t,
std::move(pin)
- ).si_then([&ret](auto ref) mutable {
- ret.push_back(std::make_pair(ref->get_laddr(), ref));
+ ).si_then([&ret](auto maybe_indirect_extent) mutable {
+ assert(!maybe_indirect_extent.is_indirect());
+ assert(!maybe_indirect_extent.is_clone);
+ auto& e = maybe_indirect_extent.extent;
+ ret.push_back(std::make_pair(e->get_laddr(), e));
logger().debug(
"read_extents: got extent {}",
- *ref);
+ *e);
return seastar::now();
});
}).si_then([&ret] {
@@ -109,6 +113,7 @@ seastar::future<bufferlist> TMDriver::read(
return tm->with_transaction_intr(
Transaction::src_t::READ,
"read",
+ CACHE_HINT_TOUCH,
[=, &blret, this](auto& t)
{
return read_extents(t, laddr_t::from_byte_offset(offset), size