summary | refs | log | tree | commit | diff | stats
path: root/src/os/bluestore/BlueStore.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/os/bluestore/BlueStore.h')
-rw-r--r-- src/os/bluestore/BlueStore.h 168
1 files changed, 155 insertions, 13 deletions
diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h
index 8141a7babf7..5549f97ffea 100644
--- a/src/os/bluestore/BlueStore.h
+++ b/src/os/bluestore/BlueStore.h
@@ -127,6 +127,7 @@ enum {
// write op stats
//****************************************
+ l_bluestore_write_lat,
l_bluestore_write_big,
l_bluestore_write_big_bytes,
l_bluestore_write_big_blobs,
@@ -274,6 +275,22 @@ public:
virtual ~AioContext() {}
};
+ static constexpr uint32_t OBJECT_MAX_SIZE = 0xffffffff; // 32 bits
+ struct printer { // print-option flags; each value is a distinct power of two (presumably OR-ed into a printer `mode` — confirm)
+ static constexpr uint16_t PTR = 1; // pointer to Blob
+ static constexpr uint16_t NICK = 2; // a nickname of this Blob
+ static constexpr uint16_t DISK = 4; // disk allocations of Blob
+ static constexpr uint16_t SDISK = 8; // shortened version of disk allocations
+ static constexpr uint16_t USE = 16; // use tracker
+ static constexpr uint16_t SUSE = 32; // shortened use tracker
+ static constexpr uint16_t CHK = 64; // checksum, full dump
+ static constexpr uint16_t SCHK = 128; // only base checksum info
+ static constexpr uint16_t BUF = 256; // print Blob's buffers (takes cache lock)
+ static constexpr uint16_t SBUF = 512; // short print Blob's buffers (takes cache lock)
+ static constexpr uint16_t ATTRS = 1024; // print attrs in onode
+ static constexpr uint16_t JUSTID = 2048; // used to suppress printing length, spanning and shared blob
+ };
+
/// cached buffer
struct Buffer {
MEMPOOL_CLASS_HELPERS();
@@ -291,6 +308,16 @@ public:
default: return "???";
}
}
+ // Short form of the state name; the named non-clean states carry a leading ','.
+ // STATE_CLEAN yields an empty string because it is the most frequent state.
+ static const char *get_state_name_short(int s) {
+ switch (s) {
+ case STATE_EMPTY: return ",empty";
+ case STATE_CLEAN: return "";
+ case STATE_WRITING: return ",writing";
+ default: return "???"; // unrecognized state value
+ }
+ }
enum {
FLAG_NOCACHE = 1, ///< trim when done WRITING (do not become CLEAN)
// NOTE: fix operator<< when you define a second flag
@@ -637,7 +664,16 @@ public:
void dump(ceph::Formatter* f) const;
friend std::ostream& operator<<(std::ostream& out, const Blob &b);
-
+ struct printer : public BlueStore::printer { // pairs a Blob with a print mode; streamed via the friend operator<< at the end of this block
+ const Blob& blob; // held by reference, so the printer must not outlive the Blob
+ uint16_t mode; // NOTE(review): presumably a mask of BlueStore::printer flags — confirm
+ printer(const Blob& blob, uint16_t mode)
+ :blob(blob), mode(mode) {}
+ };
+ friend std::ostream& operator<<(std::ostream& out, const printer &p); // declaration only; defined outside this view
+ printer print(uint16_t mode) const { // returns a printer that references this Blob with the given mode
+ return printer(*this, mode);
+ }
const bluestore_blob_use_tracker_t& get_blob_use_tracker() const {
return used_in_blob;
}
@@ -690,6 +726,7 @@ public:
o.blob_bl = blob_bl;
#endif
}
+ void add_tail(uint32_t new_blob_size, uint32_t min_release_size);
void dup(const Blob& from, bool copy_used_in_blob);
void copy_from(CephContext* cct, const Blob& from,
uint32_t min_release_size, uint32_t start, uint32_t len);
@@ -712,6 +749,11 @@ public:
/// put logical references, and get back any released extents
bool put_ref(Collection *coll, uint32_t offset, uint32_t length,
PExtentVector *r);
+ uint32_t put_ref_accumulate(
+ Collection *coll,
+ uint32_t offset,
+ uint32_t length,
+ PExtentVector *released_disk);
/// split the blob
void split(Collection *coll, uint32_t blob_offset, Blob *o);
@@ -842,6 +884,16 @@ public:
blob->get_cache()->rm_extent();
}
}
+ struct printer : public BlueStore::printer { // pairs an Extent with a print mode; streamed via the friend operator<< at the end of this block
+ const Extent& ext; // held by reference, so the printer must not outlive the Extent
+ uint16_t mode; // NOTE(review): presumably a mask of BlueStore::printer flags — confirm
+ printer(const Extent& ext, uint16_t mode)
+ :ext(ext), mode(mode) {}
+ };
+ friend std::ostream& operator<<(std::ostream& out, const printer &p); // declaration only; defined outside this view
+ printer print(uint16_t mode) const { // returns a printer that references this Extent with the given mode
+ return printer(*this, mode);
+ }
void dump(ceph::Formatter* f) const;
@@ -1104,6 +1156,10 @@ public:
extent_map_t::iterator seek_lextent(uint64_t offset);
extent_map_t::const_iterator seek_lextent(uint64_t offset) const;
+ /// split extent
+ extent_map_t::iterator split_at(extent_map_t::iterator p, uint32_t offset);
+ /// if offset falls inside an extent, split it there; otherwise return the extent to the right
+ extent_map_t::iterator maybe_split_at(uint32_t offset);
/// add a new Extent
void add(uint32_t lo, uint32_t o, uint32_t l, BlobRef& b) {
extent_map.insert(*new Extent(lo, o, l, b));
@@ -1401,9 +1457,25 @@ public:
}
void rewrite_omap_key(const std::string& old, std::string *out);
+ size_t calc_userkey_offset_in_omap_key() const;
void decode_omap_key(const std::string& key, std::string *user_key);
void finish_write(TransContext* txc, uint32_t offset, uint32_t length);
+
+ struct printer : public BlueStore::printer { // pairs an Onode with a print mode and a from/end pair
+ const Onode &onode; // held by reference, so the printer must not outlive the Onode
+ uint16_t mode; // NOTE(review): presumably a mask of BlueStore::printer flags — confirm
+ uint32_t from = 0; // defaults to 0; NOTE(review): presumably the start offset of the range to print — confirm
+ uint32_t end = OBJECT_MAX_SIZE; // defaults to OBJECT_MAX_SIZE; inclusive vs exclusive is not visible here
+ printer(const Onode &onode, uint16_t mode) : onode(onode), mode(mode) {} // leaves from/end at their defaults
+ printer(const Onode &onode, uint16_t mode, uint32_t from, uint32_t end)
+ : onode(onode), mode(mode), from(from), end(end) {}
+ };
+ friend std::ostream &operator<<(std::ostream &out, const printer &p); // declaration only; defined outside this view
+ printer print(uint16_t mode) const { return printer(*this, mode); } // from/end keep their defaults (0 and OBJECT_MAX_SIZE)
+ printer print(uint16_t mode, uint32_t from, uint32_t end) const { // overload with explicit from/end
+ return printer(*this, mode, from, end);
+ }
};
/// A generic Cache Shard
@@ -1682,6 +1754,7 @@ public:
int next() override;
std::string key() override;
ceph::buffer::list value() override;
+ std::string_view value_as_sv() override;
std::string tail_key() override {
return tail;
}
@@ -1767,6 +1840,14 @@ public:
values[STATFS_COMPRESSED_ALLOCATED] = st.data_compressed_allocated;
return *this;
}
+ bool operator==(const volatile_statfs& rhs) const { // true only when all five counters listed below match in both objects
+ return
+ values[STATFS_ALLOCATED] == rhs.values[STATFS_ALLOCATED] &&
+ values[STATFS_STORED] == rhs.values[STATFS_STORED] &&
+ values[STATFS_COMPRESSED_ORIGINAL] == rhs.values[STATFS_COMPRESSED_ORIGINAL] &&
+ values[STATFS_COMPRESSED] == rhs.values[STATFS_COMPRESSED] &&
+ values[STATFS_COMPRESSED_ALLOCATED] == rhs.values[STATFS_COMPRESSED_ALLOCATED];
+ }
bool is_empty() {
return values[STATFS_ALLOCATED] == 0 &&
values[STATFS_STORED] == 0 &&
@@ -2017,6 +2098,20 @@ public:
Throttle throttle_deferred_bytes; ///< submit to deferred complete
public:
+ ceph::mutex lock = ceph::make_mutex("BlueStoreThrottle::max_lock"); // NOTE(review): presumably guards the *_observed_max / *_max_ts fields below — confirm
+
+ std::atomic<uint64_t> transactions = 0; // atomic counter; release_kv_throttle() subtracts from it
+
+ int64_t bytes_observed_max = 0; // NOTE(review): presumably the largest byte count observed so far — confirm
+ utime_t bytes_max_ts; // NOTE(review): presumably the time bytes_observed_max was recorded — confirm
+ uint64_t transactions_observed_max = 0; // NOTE(review): presumably the largest `transactions` value observed — confirm
+ utime_t transactions_max_ts; // NOTE(review): presumably the time transactions_observed_max was recorded — confirm
+
+ uint64_t get_current() { // forwards to throttle_bytes.get_current(); the result is returned as uint64_t
+ return throttle_bytes.get_current();
+ }
+
+ public:
BlueStoreThrottle(CephContext *cct) :
throttle_bytes(cct, "bluestore_throttle_bytes", 0),
throttle_deferred_bytes(cct, "bluestore_throttle_deferred_bytes", 0)
@@ -2042,8 +2137,9 @@ public:
KeyValueDB &db,
TransContext &txc,
ceph::mono_clock::time_point);
- void release_kv_throttle(uint64_t cost) {
+ void release_kv_throttle(uint64_t cost, uint64_t txcs) { // calls throttle_bytes.put(cost) and subtracts txcs from `transactions`
throttle_bytes.put(cost);
+ transactions -= txcs;
}
void release_deferred_throttle(uint64_t cost) {
throttle_deferred_bytes.put(cost);
@@ -2275,6 +2371,9 @@ public:
bool apply_defer();
};
+ class Writer;
+ friend class Writer;
+
// --------------------------------------------------------
// members
private:
@@ -2372,6 +2471,7 @@ private:
std::numeric_limits<decltype(min_alloc_size)>::digits,
"not enough bits for min_alloc_size");
bool elastic_shared_blobs = false; ///< use smart ExtentMap::dup to reduce shared blob count
+ bool use_write_v2 = false; ///< use new write path
enum {
// Please preserve the order since it's DB persistent
@@ -2402,6 +2502,7 @@ private:
uint64_t kv_ios = 0;
uint64_t kv_throttle_costs = 0;
+ uint64_t kv_throttle_txcs = 0;
// cache trim control
uint64_t cache_size = 0; ///< total cache size
@@ -3109,6 +3210,7 @@ public:
int dump_bluefs_sizes(std::ostream& out);
void trim_free_space(const std::string& type, std::ostream& outss);
+ static int zap_device(CephContext* cct, const std::string& dev);
public:
int statfs(struct store_statfs_t *buf,
@@ -3316,15 +3418,6 @@ public:
std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
) override;
-#ifdef WITH_SEASTAR
- int omap_get_values(
- CollectionHandle &c, ///< [in] Collection containing oid
- const ghobject_t &oid, ///< [in] Object containing omap
- const std::optional<std::string> &start_after, ///< [in] Keys to get
- std::map<std::string, ceph::buffer::list> *out ///< [out] Returned keys and values
- ) override;
-#endif
-
/// Filters keys into out which are defined on oid
int omap_check_keys(
CollectionHandle &c, ///< [in] Collection containing oid
@@ -3338,6 +3431,13 @@ public:
const ghobject_t &oid ///< [in] object
) override;
+ int omap_iterate(
+ CollectionHandle &c, ///< [in] collection
+ const ghobject_t &oid, ///< [in] object
+ omap_iter_seek_t start_from, ///< [in] where the iterator should point to at the beginning
+ std::function<omap_iter_ret_t(std::string_view, std::string_view)> f
+ ) override;
+
void set_fsid(uuid_d u) override {
fsid = u;
}
@@ -3463,6 +3563,29 @@ public:
CephContext* cct, const std::string &path,
const bluestore_bdev_label_t& label, uint64_t disk_position = 0);
+ void debug_punch_hole_2( // public debug wrapper around the private _punch_hole_2
+ CollectionRef& c,
+ OnodeRef& o,
+ uint32_t offset,
+ uint32_t length,
+ PExtentVector& released,
+ std::vector<BlobRef>& pruned_blobs,
+ std::set<SharedBlobRef>& shared_changed,
+ volatile_statfs& statfs_delta) {
+ _punch_hole_2(c.get(), o, offset, length, released, // c is passed as a raw pointer via c.get(); all other arguments are forwarded as-is
+ pruned_blobs, shared_changed, statfs_delta); // the iterator returned by _punch_hole_2 is discarded
+ }
+ Allocator*& debug_get_alloc() { // returns a reference to the `alloc` pointer member itself, so a caller can reseat it
+ return alloc;
+ }
+ void debug_set_block_size(uint64_t _block_size) { // overrides block_size and recomputes block_mask and block_size_order from it
+ block_size = _block_size;
+ block_mask = ~(block_size - 1); // NOTE(review): only a contiguous high-bit mask when block_size is a power of two — confirm callers
+ block_size_order = std::countr_zero(block_size); // count of trailing zero bits; equals log2(block_size) for a power of two
+ }
+ void debug_set_prefer_deferred_size(uint64_t s) { // assigns prefer_deferred_size directly, with no validation
+ prefer_deferred_size = s;
+ }
inline void log_latency(const char* name,
int idx,
const ceph::timespan& lat,
@@ -3555,12 +3678,13 @@ private:
// --------------------------------------------------------
// write ops
-
+ public:
struct WriteContext {
bool buffered = false; ///< buffered write
bool compress = false; ///< compressed write
- uint64_t target_blob_size = 0; ///< target (max) blob size
+ uint8_t csum_type = 0; ///< checksum type for new blobs
unsigned csum_order = 0; ///< target checksum chunk order
+ uint64_t target_blob_size = 0; ///< target (max) blob size
old_extent_map_t old_extents; ///< must deref these blobs
interval_set<uint64_t> extents_to_gc; ///< extents for garbage collection
@@ -3609,6 +3733,7 @@ private:
buffered = other.buffered;
compress = other.compress;
target_blob_size = other.target_blob_size;
+ csum_type = other.csum_type;
csum_order = other.csum_order;
}
void write(
@@ -3638,6 +3763,16 @@ private:
uint64_t loffs_end,
uint64_t min_alloc_size);
};
+ private:
+ BlueStore::extent_map_t::iterator _punch_hole_2(
+ Collection* c,
+ OnodeRef& o,
+ uint32_t offset,
+ uint32_t length,
+ PExtentVector& released,
+ std::vector<BlobRef>& pruned_blobs,
+ std::set<SharedBlobRef>& shared_changed,
+ volatile_statfs& statfs_delta);
void _do_write_small(
TransContext *txc,
CollectionRef &c,
@@ -3705,6 +3840,13 @@ private:
uint64_t length,
ceph::buffer::list& bl,
WriteContext *wctx);
+ int _do_write_v2(
+ TransContext *txc,
+ CollectionRef &c,
+ OnodeRef& o,
+ uint64_t offset, uint64_t length,
+ ceph::buffer::list& bl,
+ uint32_t fadvise_flags);
int _touch(TransContext *txc,
CollectionRef& c,