diff options
Diffstat (limited to 'src/os/bluestore/BlueStore.h')
-rw-r--r-- | src/os/bluestore/BlueStore.h | 168 |
1 files changed, 155 insertions, 13 deletions
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 8141a7babf7..5549f97ffea 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -127,6 +127,7 @@ enum { // write op stats //**************************************** + l_bluestore_write_lat, l_bluestore_write_big, l_bluestore_write_big_bytes, l_bluestore_write_big_blobs, @@ -274,6 +275,22 @@ public: virtual ~AioContext() {} }; + static constexpr uint32_t OBJECT_MAX_SIZE = 0xffffffff; // 32 bits + struct printer { + static constexpr uint16_t PTR = 1; // pointer to Blob + static constexpr uint16_t NICK = 2; // a nickname of this Blob + static constexpr uint16_t DISK = 4; // disk allocations of Blob + static constexpr uint16_t SDISK = 8; // shortened version of disk allocations + static constexpr uint16_t USE = 16; // use tracker + static constexpr uint16_t SUSE = 32; // shortened use tracker + static constexpr uint16_t CHK = 64; // checksum, full dump + static constexpr uint16_t SCHK = 128; // only base checksum info + static constexpr uint16_t BUF = 256; // print Blob's buffers (takes cache lock) + static constexpr uint16_t SBUF = 512; // short print Blob's buffers (takes cache lock) + static constexpr uint16_t ATTRS = 1024; // print attrs in onode + static constexpr uint16_t JUSTID = 2048; // used to suppress printing length, spanning and shared blob + }; + /// cached buffer struct Buffer { MEMPOOL_CLASS_HELPERS(); @@ -291,6 +308,16 @@ public: default: return "???"; } } + // Short version of state name. + // Does not print "clean", as it is the most frequent state. 
+ static const char *get_state_name_short(int s) { + switch (s) { + case STATE_EMPTY: return ",empty"; + case STATE_CLEAN: return ""; + case STATE_WRITING: return ",writing"; + default: return "???"; + } + } enum { FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN) // NOTE: fix operator<< when you define a second flag @@ -637,7 +664,16 @@ public: void dump(ceph::Formatter* f) const; friend std::ostream& operator<<(std::ostream& out, const Blob &b); - + struct printer : public BlueStore::printer { + const Blob& blob; + uint16_t mode; + printer(const Blob& blob, uint16_t mode) + :blob(blob), mode(mode) {} + }; + friend std::ostream& operator<<(std::ostream& out, const printer &p); + printer print(uint16_t mode) const { + return printer(*this, mode); + } const bluestore_blob_use_tracker_t& get_blob_use_tracker() const { return used_in_blob; } @@ -690,6 +726,7 @@ public: o.blob_bl = blob_bl; #endif } + void add_tail(uint32_t new_blob_size, uint32_t min_release_size); void dup(const Blob& from, bool copy_used_in_blob); void copy_from(CephContext* cct, const Blob& from, uint32_t min_release_size, uint32_t start, uint32_t len); @@ -712,6 +749,11 @@ public: /// put logical references, and get back any released extents bool put_ref(Collection *coll, uint32_t offset, uint32_t length, PExtentVector *r); + uint32_t put_ref_accumulate( + Collection *coll, + uint32_t offset, + uint32_t length, + PExtentVector *released_disk); /// split the blob void split(Collection *coll, uint32_t blob_offset, Blob *o); @@ -842,6 +884,16 @@ public: blob->get_cache()->rm_extent(); } } + struct printer : public BlueStore::printer { + const Extent& ext; + uint16_t mode; + printer(const Extent& ext, uint16_t mode) + :ext(ext), mode(mode) {} + }; + friend std::ostream& operator<<(std::ostream& out, const printer &p); + printer print(uint16_t mode) const { + return printer(*this, mode); + } void dump(ceph::Formatter* f) const; @@ -1104,6 +1156,10 @@ public: extent_map_t::iterator 
seek_lextent(uint64_t offset); extent_map_t::const_iterator seek_lextent(uint64_t offset) const; + /// split extent + extent_map_t::iterator split_at(extent_map_t::iterator p, uint32_t offset); + /// if inside extent split it, if not return extent on right + extent_map_t::iterator maybe_split_at(uint32_t offset); /// add a new Extent void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) { extent_map.insert(*new Extent(lo, o, l, b)); @@ -1401,9 +1457,25 @@ public: } void rewrite_omap_key(const std::string& old, std::string *out); + size_t calc_userkey_offset_in_omap_key() const; void decode_omap_key(const std::string& key, std::string *user_key); void finish_write(TransContext* txc, uint32_t offset, uint32_t length); + + struct printer : public BlueStore::printer { + const Onode &onode; + uint16_t mode; + uint32_t from = 0; + uint32_t end = OBJECT_MAX_SIZE; + printer(const Onode &onode, uint16_t mode) : onode(onode), mode(mode) {} + printer(const Onode &onode, uint16_t mode, uint32_t from, uint32_t end) + : onode(onode), mode(mode), from(from), end(end) {} + }; + friend std::ostream &operator<<(std::ostream &out, const printer &p); + printer print(uint16_t mode) const { return printer(*this, mode); } + printer print(uint16_t mode, uint32_t from, uint32_t end) const { + return printer(*this, mode, from, end); + } }; /// A generic Cache Shard @@ -1682,6 +1754,7 @@ public: int next() override; std::string key() override; ceph::buffer::list value() override; + std::string_view value_as_sv() override; std::string tail_key() override { return tail; } @@ -1767,6 +1840,14 @@ public: values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated; return *this; } + bool operator==(const volatile_statfs& rhs) const { + return + values[STATFS_ALLOCATED] == rhs.values[STATFS_ALLOCATED] && + values[STATFS_STORED] == rhs.values[STATFS_STORED] && + values[STATFS_COMPRESSED_ORIGINAL] == rhs.values[STATFS_COMPRESSED_ORIGINAL] && + values[STATFS_COMPRESSED] == 
rhs.values[STATFS_COMPRESSED] && + values[STATFS_COMPRESSED_ALLOCATED] == rhs.values[STATFS_COMPRESSED_ALLOCATED]; + } bool is_empty() { return values[STATFS_ALLOCATED] == 0 && values[STATFS_STORED] == 0 && @@ -2017,6 +2098,20 @@ public: Throttle throttle_deferred_bytes; ///< submit to deferred complete public: + ceph::mutex lock = ceph::make_mutex("BlueStoreThrottle::max_lock"); + + std::atomic<uint64_t> transactions = 0; + + int64_t bytes_observed_max = 0; + utime_t bytes_max_ts; + uint64_t transactions_observed_max = 0; + utime_t transactions_max_ts; + + uint64_t get_current() { + return throttle_bytes.get_current(); + } + + public: BlueStoreThrottle(CephContext *cct) : throttle_bytes(cct, "bluestore_throttle_bytes", 0), throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0) @@ -2042,8 +2137,9 @@ public: KeyValueDB &db, TransContext &txc, ceph::mono_clock::time_point); - void release_kv_throttle(uint64_t cost) { + void release_kv_throttle(uint64_t cost, uint64_t txcs) { throttle_bytes.put(cost); + transactions -= txcs; } void release_deferred_throttle(uint64_t cost) { throttle_deferred_bytes.put(cost); @@ -2275,6 +2371,9 @@ public: bool apply_defer(); }; + class Writer; + friend class Writer; + // -------------------------------------------------------- // members private: @@ -2372,6 +2471,7 @@ private: std::numeric_limits<decltype(min_alloc_size)>::digits, "not enough bits for min_alloc_size"); bool elastic_shared_blobs = false; ///< use smart ExtentMap::dup to reduce shared blob count + bool use_write_v2 = false; ///< use new write path enum { // Please preserve the order since it's DB persistent @@ -2402,6 +2502,7 @@ private: uint64_t kv_ios = 0; uint64_t kv_throttle_costs = 0; + uint64_t kv_throttle_txcs = 0; // cache trim control uint64_t cache_size = 0; ///< total cache size @@ -3109,6 +3210,7 @@ public: int dump_bluefs_sizes(std::ostream& out); void trim_free_space(const std::string& type, std::ostream& outss); + static int 
zap_device(CephContext* cct, const std::string& dev); public: int statfs(struct store_statfs_t *buf, @@ -3316,15 +3418,6 @@ public: std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values ) override; -#ifdef WITH_SEASTAR - int omap_get_values( - CollectionHandle &c, ///< [in] Collection containing oid - const ghobject_t &oid, ///< [in] Object containing omap - const std::optional<std::string> &start_after, ///< [in] Keys to get - std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values - ) override; -#endif - /// Filters keys into out which are defined on oid int omap_check_keys( CollectionHandle &c, ///< [in] Collection containing oid @@ -3338,6 +3431,13 @@ public: const ghobject_t &oid ///< [in] object ) override; + int omap_iterate( + CollectionHandle &c, ///< [in] collection + const ghobject_t &oid, ///< [in] object + omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning + std::function<omap_iter_ret_t(std::string_view, std::string_view)> f + ) override; + void set_fsid(uuid_d u) override { fsid = u; } @@ -3463,6 +3563,29 @@ public: CephContext* cct, const std::string &path, const bluestore_bdev_label_t& label, uint64_t disk_position = 0); + void debug_punch_hole_2( + CollectionRef& c, + OnodeRef& o, + uint32_t offset, + uint32_t length, + PExtentVector& released, + std::vector<BlobRef>& pruned_blobs, + std::set<SharedBlobRef>& shared_changed, + volatile_statfs& statfs_delta) { + _punch_hole_2(c.get(), o, offset, length, released, + pruned_blobs, shared_changed, statfs_delta); + } + Allocator*& debug_get_alloc() { + return alloc; + } + void debug_set_block_size(uint64_t _block_size) { + block_size = _block_size; + block_mask = ~(block_size - 1); + block_size_order = std::countr_zero(block_size); + } + void debug_set_prefer_deferred_size(uint64_t s) { + prefer_deferred_size = s; + } inline void log_latency(const char* name, int idx, const ceph::timespan& lat, @@ -3555,12 
+3678,13 @@ private: // -------------------------------------------------------- // write ops - + public: struct WriteContext { bool buffered = false; ///< buffered write bool compress = false; ///< compressed write - uint64_t target_blob_size = 0; ///< target (max) blob size + uint8_t csum_type = 0; ///< checksum type for new blobs unsigned csum_order = 0; ///< target checksum chunk order + uint64_t target_blob_size = 0; ///< target (max) blob size old_extent_map_t old_extents; ///< must deref these blobs interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection @@ -3609,6 +3733,7 @@ private: buffered = other.buffered; compress = other.compress; target_blob_size = other.target_blob_size; + csum_type = other.csum_type; csum_order = other.csum_order; } void write( @@ -3638,6 +3763,16 @@ private: uint64_t loffs_end, uint64_t min_alloc_size); }; + private: + BlueStore::extent_map_t::iterator _punch_hole_2( + Collection* c, + OnodeRef& o, + uint32_t offset, + uint32_t length, + PExtentVector& released, + std::vector<BlobRef>& pruned_blobs, + std::set<SharedBlobRef>& shared_changed, + volatile_statfs& statfs_delta); void _do_write_small( TransContext *txc, CollectionRef &c, @@ -3705,6 +3840,13 @@ private: uint64_t length, ceph::buffer::list& bl, WriteContext *wctx); + int _do_write_v2( + TransContext *txc, + CollectionRef &c, + OnodeRef& o, + uint64_t offset, uint64_t length, + ceph::buffer::list& bl, + uint32_t fadvise_flags); int _touch(TransContext *txc, CollectionRef& c, |