-rw-r--r--  src/blk/kernel/KernelDevice.cc     79
-rw-r--r--  src/common/options/global.yaml.in  23
2 files changed, 83 insertions(+), 19 deletions(-)
diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc
index 2ea44e6a6e1..62f91fa7bc8 100644
--- a/src/blk/kernel/KernelDevice.cc
+++ b/src/blk/kernel/KernelDevice.cc
@@ -20,6 +20,7 @@
#include <fcntl.h>
#include <sys/file.h>
+#include <boost/container/flat_map.hpp>
#include <boost/lockfree/queue.hpp>
#include "KernelDevice.h"
@@ -28,6 +29,7 @@
#include "include/types.h"
#include "include/compat.h"
#include "include/stringify.h"
+#include "include/str_map.h"
#include "common/blkdev.h"
#include "common/errno.h"
#if defined(__FreeBSD__)
@@ -1102,8 +1104,8 @@ struct ExplicitHugePagePool {
}
}
- bool empty_estimation() const {
- return region_q.empty();
+ size_t get_buffer_size() const {
+ return buffer_size;
}
private:
@@ -1111,8 +1113,59 @@ private:
region_queue_t region_q;
};
+struct HugePagePoolOfPools {
+ HugePagePoolOfPools(const std::map<size_t, size_t> conf)
+ : pools(conf.size(), [conf] (size_t index, auto emplacer) {
+ ceph_assert(index < conf.size());
+ // this could be replaced with a state-mutating lambda and
+ // `conf::erase()`, but performance is not a concern here.
+ const auto [buffer_size, buffers_in_pool] =
+ *std::next(std::begin(conf), index);
+ emplacer.emplace(buffer_size, buffers_in_pool);
+ }) {
+ }
+
+ ceph::unique_leakable_ptr<buffer::raw> try_create(const size_t size) {
+ // thanks to `conf` being a `std::map`, the pools are stored
+ // sorted by buffer size. this would allow a log(n) lookup,
+ // but I doubt admins want to have dozens of accelerated buffer
+ // sizes. let's keep this simple for now.
+ if (auto iter = std::find_if(std::begin(pools), std::end(pools),
+ [size] (const auto& pool) {
+ return size == pool.get_buffer_size();
+ });
+ iter != std::end(pools)) {
+ return iter->try_create();
+ }
+ return nullptr;
+ }
+
+ static HugePagePoolOfPools from_desc(const std::string& conf);
+
+private:
+ // let's have some space inside (for 2 MB and 4 MB perhaps?)
+ // NOTE: we need tiny_vector as the boost::lockfree queue inside
+ // the pool is not movable.
+ ceph::containers::tiny_vector<ExplicitHugePagePool, 2> pools;
+};
-#define LUCKY_BUFFER_SIZE 4 * 1024 * 1024
+
+HugePagePoolOfPools HugePagePoolOfPools::from_desc(const std::string& desc) {
+ std::map<size_t, size_t> conf; // buffer_size -> buffers_in_pool
+ std::map<std::string, std::string> exploded_str_conf;
+ get_str_map(desc, &exploded_str_conf);
+ for (const auto& [buffer_size_s, buffers_in_pool_s] : exploded_str_conf) {
+ size_t buffer_size, buffers_in_pool;
+ if (sscanf(buffer_size_s.c_str(), "%zu", &buffer_size) != 1) {
+ ceph_abort("can't parse a key in the configuration");
+ }
+ if (sscanf(buffers_in_pool_s.c_str(), "%zu", &buffers_in_pool) != 1) {
+ ceph_abort("can't parse a value in the configuration");
+ }
+ conf[buffer_size] = buffers_in_pool;
+ }
+ return HugePagePoolOfPools{std::move(conf)};
+}
// create a buffer based on user-configurable settings. it's intended
// to make our buffers THP-able.
@@ -1122,26 +1175,22 @@ ceph::unique_leakable_ptr<buffer::raw> KernelDevice::create_custom_aligned(
// just to preserve the logic of create_small_page_aligned().
if (len < CEPH_PAGE_SIZE) {
return ceph::buffer::create_small_page_aligned(len);
- } else if (len == LUCKY_BUFFER_SIZE) {
- static ExplicitHugePagePool hp_pool{
- LUCKY_BUFFER_SIZE,
- cct->_conf->bdev_read_preallocated_huge_buffer_num
- };
- if (auto lucky_raw = hp_pool.try_create(); lucky_raw) {
+ } else {
+ static HugePagePoolOfPools hp_pools = HugePagePoolOfPools::from_desc(
+ cct->_conf.get_val<std::string>("bdev_read_preallocated_huge_buffers")
+ );
+ if (auto lucky_raw = hp_pools.try_create(len); lucky_raw) {
dout(20) << __func__ << " allocated from huge pool"
<< " lucky_raw.data=" << (void*)lucky_raw->get_data()
- << " bdev_read_preallocated_huge_buffer_num="
- << cct->_conf->bdev_read_preallocated_huge_buffer_num
+ << " bdev_read_preallocated_huge_buffers="
+ << cct->_conf.get_val<std::string>("bdev_read_preallocated_huge_buffers")
<< dendl;
return lucky_raw;
} else {
// fall through due to an empty buffer pool. this can also happen
// when the configurable was explicitly set to 0.
dout(20) << __func__ << " cannot allocate from huge pool"
- << " hp_pool.empty_estimation=" << hp_pool.empty_estimation()
- << " bdev_read_preallocated_huge_buffer_num="
- << cct->_conf->bdev_read_preallocated_huge_buffer_num
- << dendl;
+ << dendl;
}
}
const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
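
As a side note on the descriptor format consumed by HugePagePoolOfPools::from_desc() above, the following is a minimal, dependency-free sketch of the parsing it performs. It mimics get_str_map() plus the two sscanf() calls for comma-delimited input only (the real helper also accepts semicolons and tabs); parse_pools_desc() is a hypothetical name used purely for illustration, not part of this change.

#include <cstdio>
#include <map>
#include <sstream>
#include <string>

// minimal stand-in for get_str_map() + the sscanf() pair in from_desc();
// handles only the comma delimiter for brevity.
std::map<size_t, size_t> parse_pools_desc(const std::string& desc) {
  std::map<size_t, size_t> conf; // buffer_size -> buffers_in_pool
  std::istringstream input(desc);
  std::string kv;
  while (std::getline(input, kv, ',')) {
    size_t buffer_size, buffers_in_pool;
    if (std::sscanf(kv.c_str(), "%zu=%zu", &buffer_size, &buffers_in_pool) == 2) {
      conf[buffer_size] = buffers_in_pool;
    }
  }
  return conf;
}

int main() {
  // "2097152=64,4194304=128": 64 buffers of 2 MB and 128 buffers of 4 MB
  for (const auto& [size, num] : parse_pools_desc("2097152=64,4194304=128")) {
    std::printf("pool: buffer_size=%zu buffers_in_pool=%zu\n", size, num);
  }
}

Since conf is a std::map, iteration visits the pools ordered by buffer size, which is the same property the linear scan in try_create() relies on.
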
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index 6bc97f20185..0f209ea823b 100644
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -3920,11 +3920,26 @@ options:
level: advanced
default: 4_K
with_legacy: true
-- name: bdev_read_preallocated_huge_buffer_num
- type: size
+- name: bdev_read_preallocated_huge_buffers
+ type: str
level: advanced
- default: 128
- with_legacy: true
+ desc: description of the pool arrangement for huge page-based read buffers
+ long_desc: Arrangement of preallocated, huge page-based pools for reading
+ from a KernelDevice. Applied to minimize the size of the scatter-gather lists
+ sent to NICs. Targets really big buffers (>= 2 or 4 MB).
+ Keep in mind the system must be configured accordingly (see /proc/sys/vm/nr_hugepages);
+ otherwise the OSD will fail early.
+ Beware that BlueStore, by default, stores large chunks across many smaller blobs.
+ Increasing bluestore_max_blob_size changes that, and thus allows the data to
+ be read back into a small number of huge page-backed buffers.
+ fmt_desc: List of key=value pairs delimited by comma, semicolon or tab.
+ The key specifies the targeted read size and must be expressed in bytes.
+ The value specifies the number of preallocated buffers.
+ For instance, to preallocate 64 buffers that will be used to serve
+ 2 MB-sized read requests and 128 to serve 4 MB ones, set
+ "2097152=64,4194304=128".
+ see_also:
+ - bluestore_max_blob_size
- name: bdev_debug_aio
type: bool
level: dev
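
For completeness, a hedged usage illustration (not part of this change): assuming 2 MiB huge pages and that each buffer is backed by whole huge pages, the example arrangement from fmt_desc needs 64 pages for the 2 MB pool plus 128 * 2 = 256 pages for the 4 MB pool, i.e. 320 pages in total, reserved before the OSD starts:

  # reserve 320 x 2 MiB huge pages (see /proc/sys/vm/nr_hugepages)
  echo 320 > /proc/sys/vm/nr_hugepages
  # 64 buffers for 2 MB reads, 128 buffers for 4 MB reads
  ceph config set osd bdev_read_preallocated_huge_buffers "2097152=64,4194304=128"
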