diff options
-rw-r--r-- | src/common/ceph_time.h | 17 | ||||
-rw-r--r-- | src/crimson/os/alienstore/alien_store.cc | 17 | ||||
-rw-r--r-- | src/kv/KeyValueDB.h | 22 | ||||
-rw-r--r-- | src/kv/RocksDBStore.cc | 92 | ||||
-rw-r--r-- | src/kv/RocksDBStore.h | 4 | ||||
-rw-r--r-- | src/os/DBObjectMap.cc | 5 | ||||
-rw-r--r-- | src/os/DBObjectMap.h | 2 | ||||
-rw-r--r-- | src/os/ObjectStore.h | 52 | ||||
-rw-r--r-- | src/os/bluestore/BlueStore.cc | 151 | ||||
-rw-r--r-- | src/os/bluestore/BlueStore.h | 18 | ||||
-rw-r--r-- | src/os/kstore/KStore.cc | 72 | ||||
-rw-r--r-- | src/os/kstore/KStore.h | 8 | ||||
-rw-r--r-- | src/os/memstore/MemStore.cc | 70 | ||||
-rw-r--r-- | src/os/memstore/MemStore.h | 15 | ||||
-rw-r--r-- | src/osd/PrimaryLogPG.cc | 49 | ||||
-rw-r--r-- | src/test/ObjectMap/KeyValueDBMemory.cc | 21 | ||||
-rw-r--r-- | src/test/objectstore/ObjectStoreImitator.h | 10 |
17 files changed, 502 insertions, 123 deletions
diff --git a/src/common/ceph_time.h b/src/common/ceph_time.h index 01feff4c063..0b05be5372e 100644 --- a/src/common/ceph_time.h +++ b/src/common/ceph_time.h @@ -342,6 +342,23 @@ public: } }; +// Please note time_guard is not thread safety -- multiple threads +// updating same diff_accumulator can corrupt it. +template <class ClockT = mono_clock> +class time_guard { + const typename ClockT::time_point start; + timespan& diff_accumulator; + +public: + time_guard(timespan& diff_accumulator) + : start(ClockT::now()), + diff_accumulator(diff_accumulator) { + } + ~time_guard() { + diff_accumulator += ClockT::now() - start; + } +}; + namespace time_detail { // So that our subtractions produce negative spans rather than // arithmetic underflow. diff --git a/src/crimson/os/alienstore/alien_store.cc b/src/crimson/os/alienstore/alien_store.cc index a9c69f4660e..db6decd84f9 100644 --- a/src/crimson/os/alienstore/alien_store.cc +++ b/src/crimson/os/alienstore/alien_store.cc @@ -435,8 +435,21 @@ auto AlienStore::omap_get_values(CollectionRef ch, return do_with_op_gate(omap_values_t{}, [=, this] (auto &values) { return tp->submit(ch->get_cid().hash_to_shard(tp->size()), [=, this, &values] { auto c = static_cast<AlienCollection*>(ch.get()); - return store->omap_get_values(c->collection, oid, start, - reinterpret_cast<map<string, bufferlist>*>(&values)); + return store->omap_iterate( + c->collection, oid, + ObjectStore::omap_iter_seek_t{ + .seek_position = start.value_or(std::string{}), + // FIXME: classical OSDs begins iteration from LOWER_BOUND + // (or UPPER_BOUND if filter_prefix > start). 
However, these + // bits are not implemented yet + .seek_type = ObjectStore::omap_iter_seek_t::UPPER_BOUND + }, + [&values] + (std::string_view key, std::string_view value) mutable { + values[std::string{key}].append(value); + // FIXME: there is limit on number of entries yet + return ObjectStore::omap_iter_ret_t::NEXT; + }); }).then([&values] (int r) -> read_errorator::future<std::tuple<bool, omap_values_t>> { if (r == -ENOENT) { diff --git a/src/kv/KeyValueDB.h b/src/kv/KeyValueDB.h index 858742d511e..d926840180e 100644 --- a/src/kv/KeyValueDB.h +++ b/src/kv/KeyValueDB.h @@ -9,6 +9,7 @@ #include <map> #include <optional> #include <string> +#include <string_view> #include <boost/scoped_ptr.hpp> #include "include/encoding.h" #include "common/Formatter.h" @@ -211,6 +212,10 @@ public: return ""; } virtual ceph::buffer::list value() = 0; + // When valid() returns true, value returned as string-view + // is guaranteed to be valid until iterator is moved to another + // position; that is until call to next() / seek_to_first() / etc. + virtual std::string_view value_as_sv() = 0; virtual int status() = 0; virtual ~SimplestIteratorImpl() {} }; @@ -220,7 +225,12 @@ public: virtual ~IteratorImpl() {} virtual int seek_to_last() = 0; virtual int prev() = 0; + // When valid() returns true, key returned as string-view + // is guaranteed to be valid until iterator is moved to another + // position; that is until call to next() / seek_to_first() / etc. 
+ virtual std::string_view key_as_sv() = 0; virtual std::pair<std::string, std::string> raw_key() = 0; + virtual std::pair<std::string_view, std::string_view> raw_key_as_sv() = 0; virtual ceph::buffer::ptr value_as_ptr() { ceph::buffer::list bl = value(); if (bl.length() == 1) { @@ -247,7 +257,9 @@ public: virtual int next() = 0; virtual int prev() = 0; virtual std::string key() = 0; + virtual std::string_view key_as_sv() = 0; virtual std::pair<std::string,std::string> raw_key() = 0; + virtual std::pair<std::string_view, std::string_view> raw_key_as_sv() = 0; virtual bool raw_key_is_prefixed(const std::string &prefix) = 0; virtual ceph::buffer::list value() = 0; virtual ceph::buffer::ptr value_as_ptr() { @@ -258,6 +270,7 @@ public: return ceph::buffer::ptr(); } } + virtual std::string_view value_as_sv() = 0; virtual int status() = 0; virtual size_t key_size() { return 0; @@ -315,15 +328,24 @@ private: std::string key() override { return generic_iter->key(); } + std::string_view key_as_sv() override { + return generic_iter->key_as_sv(); + } std::pair<std::string, std::string> raw_key() override { return generic_iter->raw_key(); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return generic_iter->raw_key_as_sv(); + } ceph::buffer::list value() override { return generic_iter->value(); } ceph::buffer::ptr value_as_ptr() override { return generic_iter->value_as_ptr(); } + std::string_view value_as_sv() override { + return generic_iter->value_as_sv(); + } int status() override { return generic_iter->status(); } diff --git a/src/kv/RocksDBStore.cc b/src/kv/RocksDBStore.cc index ca63ea06484..51d224b67c0 100644 --- a/src/kv/RocksDBStore.cc +++ b/src/kv/RocksDBStore.cc @@ -6,6 +6,7 @@ #include <memory> #include <set> #include <string> +#include <string_view> #include <errno.h> #include <unistd.h> #include <sys/types.h> @@ -47,6 +48,7 @@ using std::ostream; using std::pair; using std::set; using std::string; +using std::string_view; using 
std::unique_ptr; using std::vector; @@ -1992,7 +1994,7 @@ int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key) // Find separator inside Slice char* separator = (char*) memchr(in.data(), 0, in.size()); - if (separator == NULL) + if (separator == nullptr) return -EINVAL; prefix_len = size_t(separator - in.data()); if (prefix_len >= in.size()) @@ -2006,6 +2008,27 @@ int RocksDBStore::split_key(rocksdb::Slice in, string *prefix, string *key) return 0; } +// TODO: deduplicate the code, preferrably by removing the string variant +int RocksDBStore::split_key(rocksdb::Slice in, string_view *prefix, string_view *key) +{ + size_t prefix_len = 0; + + // Find separator inside Slice + char* separator = (char*) memchr(in.data(), 0, in.size()); + if (separator == nullptr) + return -EINVAL; + prefix_len = size_t(separator - in.data()); + if (prefix_len >= in.size()) + return -EINVAL; + + // Fetch prefix and/or key directly from Slice + if (prefix) + *prefix = string_view(in.data(), prefix_len); + if (key) + *key = string_view(separator + 1, in.size() - prefix_len - 1); + return 0; +} + void RocksDBStore::compact() { dout(2) << __func__ << " starting" << dendl; @@ -2226,7 +2249,13 @@ int RocksDBStore::RocksDBWholeSpaceIteratorImpl::prev() string RocksDBStore::RocksDBWholeSpaceIteratorImpl::key() { string out_key; - split_key(dbiter->key(), 0, &out_key); + split_key(dbiter->key(), nullptr, &out_key); + return out_key; +} +string_view RocksDBStore::RocksDBWholeSpaceIteratorImpl::key_as_sv() +{ + string_view out_key; + split_key(dbiter->key(), nullptr, &out_key); return out_key; } pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key() @@ -2235,6 +2264,12 @@ pair<string,string> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key() split_key(dbiter->key(), &prefix, &key); return make_pair(prefix, key); } +pair<string_view,string_view> RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_as_sv() +{ + string_view prefix, key; + 
split_key(dbiter->key(), &prefix, &key); + return make_pair(prefix, key); +} bool RocksDBStore::RocksDBWholeSpaceIteratorImpl::raw_key_is_prefixed(const string &prefix) { // Look for "prefix\0" right in rocksb::Slice @@ -2267,6 +2302,12 @@ bufferptr RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_ptr() return bufferptr(val.data(), val.size()); } +std::string_view RocksDBStore::RocksDBWholeSpaceIteratorImpl::value_as_sv() +{ + rocksdb::Slice val = dbiter->value(); + return std::string_view{val.data(), val.size()}; +} + int RocksDBStore::RocksDBWholeSpaceIteratorImpl::status() { return dbiter->status().ok() ? 0 : -1; @@ -2348,9 +2389,15 @@ public: string key() override { return dbiter->key().ToString(); } + string_view key_as_sv() override { + return dbiter->key().ToStringView(); + } std::pair<std::string, std::string> raw_key() override { return make_pair(prefix, key()); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return {prefix, dbiter->key().ToStringView()}; // no make_pair(): it would copy prefix and leave a dangling view + } bufferlist value() override { return to_bufferlist(dbiter->value()); } @@ -2358,6 +2405,10 @@ public: rocksdb::Slice val = dbiter->value(); return bufferptr(val.data(), val.size()); } + std::string_view value_as_sv() override { + rocksdb::Slice val = dbiter->value(); + return std::string_view{val.data(), val.size()}; + } int status() override { return dbiter->status().ok() ? 
0 : -1; } @@ -2668,6 +2719,15 @@ public: } } + std::string_view key_as_sv() override + { + if (smaller == on_main) { + return main->key_as_sv(); + } else { + return current_shard->second->key_as_sv(); + } + } + std::pair<std::string,std::string> raw_key() override { if (smaller == on_main) { @@ -2677,6 +2737,15 @@ public: } } + std::pair<std::string_view,std::string_view> raw_key_as_sv() override + { + if (smaller == on_main) { + return main->raw_key_as_sv(); + } else { + return { current_shard->first, current_shard->second->key_as_sv() }; + } + } + bool raw_key_is_prefixed(const std::string &prefix) override { if (smaller == on_main) { @@ -2695,6 +2764,15 @@ public: } } + std::string_view value_as_sv() override + { + if (smaller == on_main) { + return main->value_as_sv(); + } else { + return current_shard->second->value_as_sv(); + } + } + int status() override { //because we already had to inspect key, it must be ok @@ -3017,9 +3095,15 @@ public: string key() override { return iters[0]->key().ToString(); } + string_view key_as_sv() override { + return iters[0]->key().ToStringView(); + } std::pair<std::string, std::string> raw_key() override { return make_pair(prefix, key()); } + std::pair<std::string_view, std::string_view> raw_key_as_sv() override { + return {prefix, iters[0]->key().ToStringView()}; // no make_pair(): it would copy prefix and leave a dangling view + } bufferlist value() override { return to_bufferlist(iters[0]->value()); } @@ -3027,6 +3111,10 @@ public: rocksdb::Slice val = iters[0]->value(); return bufferptr(val.data(), val.size()); } + std::string_view value_as_sv() override { + rocksdb::Slice val = iters[0]->value(); + return std::string_view{val.data(), val.size()}; + } int status() override { return iters[0]->status().ok() ? 
0 : -1; } diff --git a/src/kv/RocksDBStore.h b/src/kv/RocksDBStore.h index 477b209854c..50b91be2bf6 100644 --- a/src/kv/RocksDBStore.h +++ b/src/kv/RocksDBStore.h @@ -386,10 +386,13 @@ public: int next() override; int prev() override; std::string key() override; + std::string_view key_as_sv() override; std::pair<std::string,std::string> raw_key() override; + std::pair<std::string_view,std::string_view> raw_key_as_sv() override; bool raw_key_is_prefixed(const std::string &prefix) override; ceph::bufferlist value() override; ceph::bufferptr value_as_ptr() override; + std::string_view value_as_sv() override; int status() override; size_t key_size() override; size_t value_size() override; @@ -419,6 +422,7 @@ public: } static int split_key(rocksdb::Slice in, std::string *prefix, std::string *key); + static int split_key(rocksdb::Slice in, std::string_view *prefix, std::string_view *key); static std::string past_prefix(const std::string &prefix); diff --git a/src/os/DBObjectMap.cc b/src/os/DBObjectMap.cc index 7da9a67be62..65627b5f818 100644 --- a/src/os/DBObjectMap.cc +++ b/src/os/DBObjectMap.cc @@ -519,6 +519,11 @@ bufferlist DBObjectMap::DBObjectMapIteratorImpl::value() return cur_iter->value(); } +std::string_view DBObjectMap::DBObjectMapIteratorImpl::value_as_sv() +{ + return cur_iter->value_as_sv(); +} + int DBObjectMap::DBObjectMapIteratorImpl::status() { return r; diff --git a/src/os/DBObjectMap.h b/src/os/DBObjectMap.h index 444f21eb815..1e1452010e7 100644 --- a/src/os/DBObjectMap.h +++ b/src/os/DBObjectMap.h @@ -393,6 +393,7 @@ private: int next() override { ceph_abort(); return 0; } std::string key() override { ceph_abort(); return ""; } ceph::buffer::list value() override { ceph_abort(); return ceph::buffer::list(); } + std::string_view value_as_sv() override { ceph_abort(); return std::string_view(); } int status() override { return 0; } }; @@ -431,6 +432,7 @@ private: int next() override; std::string key() override; ceph::buffer::list value() override; + 
std::string_view value_as_sv() override; int status() override; bool on_parent() { diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 521435b6c31..df3ae920a2f 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -29,6 +29,7 @@ #include <errno.h> #include <sys/stat.h> +#include <functional> #include <map> #include <memory> #include <vector> @@ -735,15 +736,6 @@ public: std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) = 0; -#ifdef WITH_SEASTAR - virtual int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) = 0; -#endif - /// Filters keys into out which are defined on oid virtual int omap_check_keys( CollectionHandle &c, ///< [in] Collection containing oid @@ -766,6 +758,48 @@ public: const ghobject_t &oid ///< [in] object ) = 0; + struct omap_iter_seek_t { + std::string seek_position; + enum { + // start with provided key (seek_position), if it exists + LOWER_BOUND, + // skip provided key (seek_position) even if it exists + UPPER_BOUND + } seek_type = LOWER_BOUND; + static omap_iter_seek_t min_lower_bound() { return {}; } + }; + enum class omap_iter_ret_t { + STOP, + NEXT + }; + /** + * Iterate over object map with user-provided callable + * + * Warning! The callable is executed under lock on bluestore + * operations in c. Do not use bluestore methods on c while + * iterating. (Filling in a transaction is no problem). + * + * @param c collection + * @param oid object + * @param start_from where the iterator should point to at + * the beginning + * @param visitor callable that takes OMAP key and corresponding + * value as string_views and controls iteration + * by the return. 
It is executed for every object's + * OMAP entry from `start_from` till end of the + * object's OMAP or till the iteration is stopped + * by `STOP`. Please note that if there is no such + * entry, `visitor` will be called 0 times. + * @return error code, zero on success + */ + virtual int omap_iterate( + CollectionHandle &c, + const ghobject_t &oid, + omap_iter_seek_t start_from, + std::function<omap_iter_ret_t(std::string_view, + std::string_view)> visitor + ) = 0; + virtual int flush_journal() { return -EOPNOTSUPP; } virtual int dump_journal(std::ostream& out) { return -EOPNOTSUPP; } diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index a024a0c2105..5d7c8ef07d5 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4830,7 +4830,7 @@ void BlueStore::Onode::rewrite_omap_key(const string& old, string *out) out->append(old.c_str() + out->length(), old.size() - out->length()); } -void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) +size_t BlueStore::Onode::calc_userkey_offset_in_omap_key() const { size_t pos = sizeof(uint64_t) + 1; if (!onode.is_pgmeta_omap()) { @@ -4840,9 +4840,15 @@ void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) pos += sizeof(uint64_t); } } - *user_key = key.substr(pos); + return pos; } +void BlueStore::Onode::decode_omap_key(const string& key, string *user_key) +{ + *user_key = key.substr(calc_userkey_offset_in_omap_key()); +} + + void BlueStore::Onode::finish_write(TransContext* txc, uint32_t offset, uint32_t length) { while (true) { @@ -5654,6 +5660,13 @@ bufferlist BlueStore::OmapIteratorImpl::value() return it->value(); } +std::string_view BlueStore::OmapIteratorImpl::value_as_sv() +{ + std::shared_lock l(c->lock); + ceph_assert(it->valid()); + return it->value_as_sv(); +} + // ===================================== @@ -13601,52 +13614,6 @@ int BlueStore::omap_get_values( return r; } -#ifdef WITH_SEASTAR -int BlueStore::omap_get_values( - 
CollectionHandle &c_, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<string> &start_after, ///< [in] Keys to get - map<string, bufferlist> *output ///< [out] Returned keys and values - ) -{ - Collection *c = static_cast<Collection *>(c_.get()); - dout(15) << __func__ << " " << c->get_cid() << " oid " << oid << dendl; - if (!c->exists) - return -ENOENT; - std::shared_lock l(c->lock); - int r = 0; - OnodeRef o = c->get_onode(oid, false); - if (!o || !o->exists) { - r = -ENOENT; - goto out; - } - if (!o->onode.has_omap()) { - goto out; - } - o->flush(); - { - ObjectMap::ObjectMapIterator iter = get_omap_iterator(c_, oid); - if (!iter) { - r = -ENOENT; - goto out; - } - if (start_after) { - iter->upper_bound(*start_after); - } else { - iter->seek_to_first(); - } - for (; iter->valid(); iter->next()) { - output->insert(make_pair(iter->key(), iter->value())); - } - } - -out: - dout(10) << __func__ << " " << c->get_cid() << " oid " << oid << " = " << r - << dendl; - return r; -} -#endif - int BlueStore::omap_check_keys( CollectionHandle &c_, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -13724,6 +13691,94 @@ ObjectMap::ObjectMapIterator BlueStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(logger,c, o, it)); } +int BlueStore::omap_iterate( + CollectionHandle &c_, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) +{ + Collection *c = static_cast<Collection *>(c_.get()); + dout(10) << __func__ << " " << c->get_cid() << " " << oid << dendl; + if (!c->exists) { + return -ENOENT; + } + std::shared_lock l(c->lock); + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" 
<<dendl; + return -ENOENT; + } + o->flush(); + dout(10) << __func__ << " has_omap = " << (int)o->onode.has_omap() <<dendl; + if (!o->onode.has_omap()) { + // nothing to do + return 0; + } + + KeyValueDB::Iterator it; + { + auto bounds = KeyValueDB::IteratorBounds(); + std::string lower_bound, upper_bound; + o->get_omap_key(string(), &lower_bound); + o->get_omap_tail(&upper_bound); + bounds.lower_bound = std::move(lower_bound); + bounds.upper_bound = std::move(upper_bound); + it = db->get_iterator(o->get_omap_prefix(), 0, std::move(bounds)); + } + + // seek the iterator + { + std::string key; + o->get_omap_key(start_from.seek_position, &key); + auto start = ceph::mono_clock::now(); + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it->lower_bound(key); + c->store->log_latency( + __func__, + l_bluestore_omap_lower_bound_lat, + ceph::mono_clock::now() - start, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + } else { + it->upper_bound(key); + c->store->log_latency( + __func__, + l_bluestore_omap_upper_bound_lat, + ceph::mono_clock::now() - start, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + } + } + + // iterate! 
+ std::string tail; + o->get_omap_tail(&tail); + const std::string_view::size_type userkey_offset_in_dbkey = + o->calc_userkey_offset_in_omap_key(); + ceph::timespan next_lat_acc{0}; + while (it->valid()) { + const auto& db_key = it->raw_key_as_sv().second; + if (db_key >= tail) { + break; + } + std::string_view user_key = db_key.substr(userkey_offset_in_dbkey); + omap_iter_ret_t ret = f(user_key, it->value_as_sv()); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + ceph::time_guard<ceph::mono_clock> next_guard{next_lat_acc}; // must be named: an unnamed temporary dies before next() runs + it->next(); + } else { + ceph_abort(); + } + } + c->store->log_latency( + __func__, + l_bluestore_omap_next_lat, + next_lat_acc, + c->store->cct->_conf->bluestore_log_omap_iterator_age); + return 0; +} + // ----------------- // write helpers diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 99f8d057cf0..5549f97ffea 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -1457,6 +1457,7 @@ public: } void rewrite_omap_key(const std::string& old, std::string *out); + size_t calc_userkey_offset_in_omap_key() const; void decode_omap_key(const std::string& key, std::string *user_key); void finish_write(TransContext* txc, uint32_t offset, uint32_t length); @@ -1753,6 +1754,7 @@ public: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; std::string tail_key() override { return tail; } @@ -3416,15 +3418,6 @@ public: std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) override; -#ifdef WITH_SEASTAR - int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) override; -#endif - /// Filters keys into out which are defined on 
oid int omap_check_keys( CollectionHandle &c, ///< [in] Collection containing oid @@ -3438,6 +3431,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override { fsid = u; } diff --git a/src/os/kstore/KStore.cc b/src/os/kstore/KStore.cc index 7158486ca38..a069d429155 100644 --- a/src/os/kstore/KStore.cc +++ b/src/os/kstore/KStore.cc @@ -1651,6 +1651,13 @@ bufferlist KStore::OmapIteratorImpl::value() return it->value(); } +std::string_view KStore::OmapIteratorImpl::value_as_sv() +{ + std::shared_lock l{c->lock}; + ceph_assert(it->valid()); + return it->value_as_sv(); +} + int KStore::omap_get( CollectionHandle& ch, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -1866,6 +1873,71 @@ ObjectMap::ObjectMapIterator KStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o, it)); } +int KStore::omap_iterate( + CollectionHandle &ch, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f) +{ + dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; + Collection *c = static_cast<Collection*>(ch.get()); + { + std::shared_lock l{c->lock}; + + OnodeRef o = c->get_onode(oid, false); + if (!o || !o->exists) { + dout(10) << __func__ << " " << oid << "doesn't exist" <<dendl; + return -ENOENT; + } + o->flush(); + dout(10) << __func__ << " header = " << o->onode.omap_head <<dendl; + + KeyValueDB::Iterator it = db->get_iterator(PREFIX_OMAP); + std::string tail; + std::string seek_key; + if 
(!o->onode.omap_head) { + return 0; // object has no omap: nothing to do + } + + // acquire data dependencies for seek & iterate + get_omap_key(o->onode.omap_head, start_from.seek_position, &seek_key); + get_omap_tail(o->onode.omap_head, &tail); + + // acquire the iterator + { + it = db->get_iterator(PREFIX_OMAP); + } + + // seek the iterator + { + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it->lower_bound(seek_key); + } else { + it->upper_bound(seek_key); + } + } + + // iterate! + while (it->valid()) { + std::string user_key; + if (const auto& db_key = it->raw_key().second; db_key >= tail) { + break; + } else { + decode_omap_key(db_key, &user_key); + } + omap_iter_ret_t ret = f(user_key, it->value_as_sv()); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + it->next(); + } else { + ceph_abort(); + } + } + } + return 0; +} + // ----------------- // write helpers diff --git a/src/os/kstore/KStore.h b/src/os/kstore/KStore.h index 9a9d413c66a..06115d3cab7 100644 --- a/src/os/kstore/KStore.h +++ b/src/os/kstore/KStore.h @@ -180,6 +180,7 @@ public: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; int status() override { return 0; } @@ -553,6 +554,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override { fsid = u; } diff --git a/src/os/memstore/MemStore.cc b/src/os/memstore/MemStore.cc index 89cb09361cf..f9d3bf0d8a2 100644 --- a/src/os/memstore/MemStore.cc +++ b/src/os/memstore/MemStore.cc @@ -537,30 +537,6 @@ int MemStore::omap_get_values( return 0; } -#ifdef WITH_SEASTAR -int MemStore::omap_get_values( - CollectionHandle& ch, 
///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) -{ - dout(10) << __func__ << " " << ch->cid << " " << oid << dendl; - Collection *c = static_cast<Collection*>(ch.get()); - ObjectRef o = c->get_object(oid); - if (!o) - return -ENOENT; - assert(start_after); - std::lock_guard lock{o->omap_mutex}; - for (auto it = o->omap.upper_bound(*start_after); - it != std::end(o->omap); - ++it) { - out->insert(*it); - } - return 0; -} -#endif - int MemStore::omap_check_keys( CollectionHandle& ch, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap @@ -622,6 +598,10 @@ public: std::lock_guard lock{o->omap_mutex}; return it->second; } + std::string_view value_as_sv() override { + std::lock_guard lock{o->omap_mutex}; + return std::string_view{it->second.c_str(), it->second.length()}; + } int status() override { return 0; } @@ -639,6 +619,48 @@ ObjectMap::ObjectMapIterator MemStore::get_omap_iterator( return ObjectMap::ObjectMapIterator(new OmapIteratorImpl(c, o)); } +int MemStore::omap_iterate( + CollectionHandle &ch, ///< [in] collection + const ghobject_t &oid, ///< [in] object + ObjectStore::omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f) +{ + Collection *c = static_cast<Collection*>(ch.get()); + ObjectRef o = c->get_object(oid); + if (!o) { + return -ENOENT; + } + + { + std::lock_guard lock{o->omap_mutex}; + + // obtain seek the iterator + decltype(o->omap)::iterator it; + { + if (start_from.seek_type == omap_iter_seek_t::LOWER_BOUND) { + it = o->omap.lower_bound(start_from.seek_position); + } else { + it = o->omap.upper_bound(start_from.seek_position); + } + } + + // iterate! 
+ while (it != o->omap.end()) { + // potentially rectifying memcpy but who cares for memstore? + omap_iter_ret_t ret = + f(it->first, std::string_view{it->second.c_str(), it->second.length()}); + if (ret == omap_iter_ret_t::STOP) { + break; + } else if (ret == omap_iter_ret_t::NEXT) { + ++it; + } else { + ceph_abort(); + } + } + } + return 0; +} + // --------------- // write operations diff --git a/src/os/memstore/MemStore.h b/src/os/memstore/MemStore.h index 2abe552891f..9621773598f 100644 --- a/src/os/memstore/MemStore.h +++ b/src/os/memstore/MemStore.h @@ -363,14 +363,6 @@ public: const std::set<std::string> &keys, ///< [in] Keys to get std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) override; -#ifdef WITH_SEASTAR - int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) override; -#endif using ObjectStore::omap_check_keys; /// Filters keys into out which are defined on oid @@ -387,6 +379,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override; uuid_d get_fsid() override; diff --git a/src/osd/PrimaryLogPG.cc b/src/osd/PrimaryLogPG.cc index 75484edb75d..3324ba9dc91 100644 --- a/src/osd/PrimaryLogPG.cc +++ b/src/osd/PrimaryLogPG.cc @@ -7786,27 +7786,34 @@ int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops) bool truncated = false; bufferlist bl; if (oi.is_omap()) { - ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator( - ch, ghobject_t(soid) - 
); - if (!iter) { - result = -ENOENT; - goto fail; - } - iter->upper_bound(start_after); - if (filter_prefix > start_after) iter->lower_bound(filter_prefix); - for (num = 0; - iter->valid() && - iter->key().substr(0, filter_prefix.size()) == filter_prefix; - ++num, iter->next()) { - dout(20) << "Found key " << iter->key() << dendl; - if (num >= max_return || - bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) { - truncated = true; - break; - } - encode(iter->key(), bl); - encode(iter->value(), bl); + using omap_iter_seek_t = ObjectStore::omap_iter_seek_t; + result = osd->store->omap_iterate( + ch, ghobject_t(soid), + // try to seek as many keys-at-once as possible for the sake of performance. + // note complexity should be logarithmic, so seek(n/2) + seek(n/2) is worse + // than just seek(n). + ObjectStore::omap_iter_seek_t{ + .seek_position = std::max(start_after, filter_prefix), + .seek_type = filter_prefix > start_after ? omap_iter_seek_t::LOWER_BOUND + : omap_iter_seek_t::UPPER_BOUND + }, + [&bl, &truncated, &filter_prefix, &num, max_return, + max_bytes=cct->_conf->osd_max_omap_bytes_per_request] + (std::string_view key, std::string_view value) mutable { + if (key.substr(0, filter_prefix.size()) != filter_prefix) { + return ObjectStore::omap_iter_ret_t::STOP; + } + if (num >= max_return || bl.length() >= max_bytes) { + truncated = true; + return ObjectStore::omap_iter_ret_t::STOP; + } + encode(key, bl); + encode(value, bl); + ++num; + return ObjectStore::omap_iter_ret_t::NEXT; + }); + if (result < 0) { + goto fail; } } // else return empty out_set encode(num, osd_op.outdata); diff --git a/src/test/ObjectMap/KeyValueDBMemory.cc b/src/test/ObjectMap/KeyValueDBMemory.cc index 234e963397e..cfe25930d6a 100644 --- a/src/test/ObjectMap/KeyValueDBMemory.cc +++ b/src/test/ObjectMap/KeyValueDBMemory.cc @@ -132,12 +132,26 @@ public: return ""; } + string_view key_as_sv() override { + if (valid()) + return (*it).first.second; + else + return ""; + } + 
pair<string,string> raw_key() override { if (valid()) return (*it).first; else return make_pair("", ""); } + + pair<string_view,string_view> raw_key_as_sv() override { + if (valid()) + return (*it).first; + else + return make_pair("", ""); + } bool raw_key_is_prefixed(const string &prefix) override { return prefix == (*it).first.first; @@ -150,6 +164,13 @@ public: return bufferlist(); } + std::string_view value_as_sv() override { + if (valid()) + return std::string_view{it->second.c_str(), it->second.length()}; + else + return std::string_view(); + } + int status() override { return 0; } diff --git a/src/test/objectstore/ObjectStoreImitator.h b/src/test/objectstore/ObjectStoreImitator.h index d71d7f2fe58..875f9041b83 100644 --- a/src/test/objectstore/ObjectStoreImitator.h +++ b/src/test/objectstore/ObjectStoreImitator.h @@ -347,6 +347,16 @@ public: ) override { return {}; } + + int omap_iterate(CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + /// [in] where the iterator should point to at the beginning + omap_iter_seek_t start_from, + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override { + return 0; + } + void set_fsid(uuid_d u) override {} uuid_d get_fsid() override { return {}; } uint64_t estimate_objects_overhead(uint64_t num_objects) override { |