-rw-r--r--   src/blk/kernel/KernelDevice.cc      | 79
-rw-r--r--   src/common/options/global.yaml.in   | 23
2 files changed, 83 insertions, 19 deletions
diff --git a/src/blk/kernel/KernelDevice.cc b/src/blk/kernel/KernelDevice.cc
index 2ea44e6a6e1..62f91fa7bc8 100644
--- a/src/blk/kernel/KernelDevice.cc
+++ b/src/blk/kernel/KernelDevice.cc
@@ -20,6 +20,7 @@
 #include <fcntl.h>
 #include <sys/file.h>
 
+#include <boost/container/flat_map.hpp>
 #include <boost/lockfree/queue.hpp>
 
 #include "KernelDevice.h"
@@ -28,6 +29,7 @@
 #include "include/types.h"
 #include "include/compat.h"
 #include "include/stringify.h"
+#include "include/str_map.h"
 #include "common/blkdev.h"
 #include "common/errno.h"
 #if defined(__FreeBSD__)
@@ -1102,8 +1104,8 @@ struct ExplicitHugePagePool {
     }
   }
 
-  bool empty_estimation() const {
-    return region_q.empty();
+  size_t get_buffer_size() const {
+    return buffer_size;
   }
 
 private:
@@ -1111,8 +1113,59 @@ private:
   region_queue_t region_q;
 };
 
+struct HugePagePoolOfPools {
+  HugePagePoolOfPools(const std::map<size_t, size_t> conf)
+    : pools(conf.size(), [conf] (size_t index, auto emplacer) {
+        ceph_assert(index < conf.size());
+        // it could be replaced with a state-mutating lambda and
+        // `conf::erase()` but performance is not a concern here.
+        const auto [buffer_size, buffers_in_pool] =
+          *std::next(std::begin(conf), index);
+        emplacer.emplace(buffer_size, buffers_in_pool);
+      }) {
+  }
+
+  ceph::unique_leakable_ptr<buffer::raw> try_create(const size_t size) {
+    // thankfully to `conf` being a `std::map` we store the pools
+    // sorted by buffer sizes. this would allow to clamp to log(n)
+    // but I doubt admins want to have dozens of accelerated buffer
+    // size. let's keep this simple for now.
+    if (auto iter = std::find_if(std::begin(pools), std::end(pools),
+                                 [size] (const auto& pool) {
+                                   return size == pool.get_buffer_size();
+                                 });
+        iter != std::end(pools)) {
+      return iter->try_create();
+    }
+    return nullptr;
+  }
+
+  static HugePagePoolOfPools from_desc(const std::string& conf);
+
+private:
+  // let's have some space inside (for 2 MB and 4 MB perhaps?)
+  // NOTE: we need tiny_vector as the boost::lockfree queue inside
+  // pool is not-movable.
+  ceph::containers::tiny_vector<ExplicitHugePagePool, 2> pools;
+};
 
-#define LUCKY_BUFFER_SIZE 4 * 1024 * 1024
+
+HugePagePoolOfPools HugePagePoolOfPools::from_desc(const std::string& desc) {
+  std::map<size_t, size_t> conf; // buffer_size -> buffers_in_pool
+  std::map<std::string, std::string> exploded_str_conf;
+  get_str_map(desc, &exploded_str_conf);
+  for (const auto& [buffer_size_s, buffers_in_pool_s] : exploded_str_conf) {
+    size_t buffer_size, buffers_in_pool;
+    if (sscanf(buffer_size_s.c_str(), "%zu", &buffer_size) != 1) {
+      ceph_abort("can't parse a key in the configuration");
+    }
+    if (sscanf(buffers_in_pool_s.c_str(), "%zu", &buffers_in_pool) != 1) {
+      ceph_abort("can't parse a value in the configuration");
+    }
+    conf[buffer_size] = buffers_in_pool;
+  }
+  return HugePagePoolOfPools{std::move(conf)};
+}
 
 // create a buffer basing on user-configurable. it's intended to make
 // our buffers THP-able.
@@ -1122,26 +1175,22 @@ ceph::unique_leakable_ptr<buffer::raw> KernelDevice::create_custom_aligned(
   // just to preserve the logic of create_small_page_aligned().
   if (len < CEPH_PAGE_SIZE) {
     return ceph::buffer::create_small_page_aligned(len);
-  } else if (len == LUCKY_BUFFER_SIZE) {
-    static ExplicitHugePagePool hp_pool{
-      LUCKY_BUFFER_SIZE,
-      cct->_conf->bdev_read_preallocated_huge_buffer_num
-    };
-    if (auto lucky_raw = hp_pool.try_create(); lucky_raw) {
+  } else {
+    static HugePagePoolOfPools hp_pools = HugePagePoolOfPools::from_desc(
+      cct->_conf.get_val<std::string>("bdev_read_preallocated_huge_buffers")
+    );
+    if (auto lucky_raw = hp_pools.try_create(len); lucky_raw) {
       dout(20) << __func__ << " allocated from huge pool"
                << " lucky_raw.data=" << (void*)lucky_raw->get_data()
-               << " bdev_read_preallocated_huge_buffer_num="
-               << cct->_conf->bdev_read_preallocated_huge_buffer_num
+               << " bdev_read_preallocated_huge_buffers="
+               << cct->_conf.get_val<std::string>("bdev_read_preallocated_huge_buffers")
                << dendl;
       return lucky_raw;
     } else {
       // fallthrough due to empty buffer pool. this can happen also
       // when the configurable was explicitly set to 0.
       dout(20) << __func__ << " cannot allocate from huge pool"
-               << " hp_pool.empty_estimation=" << hp_pool.empty_estimation()
-               << " bdev_read_preallocated_huge_buffer_num="
-               << cct->_conf->bdev_read_preallocated_huge_buffer_num
-               << dendl;
+               << dendl;
     }
   }
   const size_t custom_alignment = cct->_conf->bdev_read_buffer_alignment;
diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index 6bc97f20185..0f209ea823b 100644
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -3920,11 +3920,26 @@ options:
   level: advanced
   default: 4_K
   with_legacy: true
-- name: bdev_read_preallocated_huge_buffer_num
-  type: size
+- name: bdev_read_preallocated_huge_buffers
+  type: str
   level: advanced
-  default: 128
-  with_legacy: true
+  desc: description of pools arrangement for huge page-based read buffers
+  long_desc: Arrangement of preallocated, huge pages-based pools for reading
+    from a KernelDevice. Applied to minimize the size of scatter-gather lists
+    sent to NICs. Targets really big buffers (>= 2 or 4 MBs).
+    Keep in mind the system must be configured accordingly
+    (see /proc/sys/vm/nr_hugepages). Otherwise the OSD will fail early.
+    Beware that BlueStore, by default, stores large chunks across many smaller
+    blobs. Increasing bluestore_max_blob_size changes that, and thus allows
+    the data to be read back into a small number of huge page-backed buffers.
+  fmt_desc: List of key=value pairs delimited by comma, semicolon or tab.
+    The key specifies the targeted read size and must be expressed in bytes.
+    The value specifies the number of preallocated buffers.
+    For instance, to preallocate 64 buffers that will be used to serve
+    2 MB-sized read requests and 128 buffers for 4 MB ones, set
+    "2097152=64,4194304=128".
+  see_also:
+  - bluestore_max_blob_size
 - name: bdev_debug_aio
   type: bool
   level: dev
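
Editor's note (not part of the commit): below is a minimal, standalone C++ sketch of how a pool description such as "2097152=64,4194304=128" decomposes into a buffer_size -> buffer_count map, roughly mirroring what HugePagePoolOfPools::from_desc() does with get_str_map() and sscanf(). The parse_pool_desc() helper is hypothetical: it splits on ',' only, whereas Ceph's get_str_map() also accepts ';' and tab, and the real code aborts via ceph_abort() on malformed input instead of skipping it.

  #include <cstddef>
  #include <cstdio>
  #include <iostream>
  #include <map>
  #include <sstream>
  #include <string>

  // Hypothetical stand-in for get_str_map() + the sscanf() loop in
  // HugePagePoolOfPools::from_desc(): parse "size=count" pairs split on ','.
  std::map<size_t, size_t> parse_pool_desc(const std::string& desc) {
    std::map<size_t, size_t> conf;  // buffer_size -> buffers_in_pool
    std::istringstream in(desc);
    std::string item;
    while (std::getline(in, item, ',')) {
      size_t buffer_size = 0, buffers_in_pool = 0;
      if (std::sscanf(item.c_str(), "%zu=%zu",
                      &buffer_size, &buffers_in_pool) == 2) {
        conf[buffer_size] = buffers_in_pool;  // a later duplicate key wins
      }
    }
    return conf;
  }

  int main() {
    // 64 buffers dedicated to 2 MB reads, 128 buffers to 4 MB reads
    for (const auto& [size, num] : parse_pool_desc("2097152=64,4194304=128")) {
      std::cout << size << " bytes -> " << num << " preallocated buffers\n";
    }
    return 0;
  }

Each parsed entry becomes one ExplicitHugePagePool; because std::map keeps its keys ordered, the pools end up sorted by buffer size, and try_create() simply scans them for an exact size match. Operationally, the new option is a plain string, so it can presumably be set through the usual workflow, e.g. ceph config set osd bdev_read_preallocated_huge_buffers "2097152=64,4194304=128", once enough huge pages have been reserved via /proc/sys/vm/nr_hugepages; as the long_desc warns, the OSD fails early otherwise.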