author    Lukáš Ondráček <lukas.ondracek@nic.cz>  2024-05-29 14:42:51 +0200
committer Oto Šťáva <oto.stava@nic.cz>            2024-06-04 17:40:55 +0200
commit    05bdc82884a07034964d7edb49cb402a666d3dc8
tree      f318153f095b11778c5b70b2ac33fb7a735ea850 /lib
parent    fixup! rrl: renaming, movements, create defer protolayer
rrl: tidying
Diffstat (limited to 'lib')
-rw-r--r--  lib/kru-avx2.c         2
-rw-r--r--  lib/kru-decay.inc.c   82
-rw-r--r--  lib/kru-generic.c      2
-rw-r--r--  lib/kru.inc.c        107
4 files changed, 87 insertions, 106 deletions
diff --git a/lib/kru-avx2.c b/lib/kru-avx2.c
index 453d1336..e6875f75 100644
--- a/lib/kru-avx2.c
+++ b/lib/kru-avx2.c
@@ -41,7 +41,7 @@
#define USE_SSE41 1
#include "lib/defines.h"
-#include "./kru.inc.c"
+#include "./kru.inc.c" // NOLINT(bugprone-suspicious-include)
KR_EXPORT
const struct kru_api KRU_AVX2 = KRU_API_INITIALIZER;
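The NOLINT(bugprone-suspicious-include) annotation silences clang-tidy, which flags any #include of a .c file as a likely mistake. Here the inclusion is deliberate: kru.inc.c is a template translation unit compiled once per SIMD variant, each wrapper exporting its own API table. A miniature sketch of the pattern, with hypothetical names (ops.inc.c, twice, OPS_PLAIN):

    /* ops.inc.c -- hypothetical template: everything static, so each
     * including TU gets a private copy built with that TU's flags */
    static int twice(int x) { return x + x; }
    #define OPS_INITIALIZER { .twice = twice }

    /* ops_plain.c -- one concrete instantiation */
    struct ops_api { int (*twice)(int); };
    #include "./ops.inc.c" // NOLINT(bugprone-suspicious-include)
    const struct ops_api OPS_PLAIN = OPS_INITIALIZER;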
diff --git a/lib/kru-decay.inc.c b/lib/kru-decay.inc.c
deleted file mode 100644
index 002007bb..00000000
--- a/lib/kru-decay.inc.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
-
- This program is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
-
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
-
- You should have received a copy of the GNU General Public License
- along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <math.h>
-
-/// Parametrization for speed of decay.
-struct decay_config {
- /// Bit shift per tick, fractional
- double shift_bits;
-
- /// Ticks to get zero loads
- uint32_t max_ticks;
-
- uint32_t mult_cache[32];
-};
-
-static inline void decay_initialize(struct decay_config *decay, kru_price_t max_decay)
-{
- decay->shift_bits = log2(KRU_LIMIT - 1) - log2(KRU_LIMIT - 1 - max_decay);
- decay->max_ticks = 18 / decay->shift_bits;
-
- decay->mult_cache[0] = 0; // not used
- for (size_t ticks = 1; ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache); ticks++) {
- decay->mult_cache[ticks] = exp2(32 - decay->shift_bits * ticks) + 0.5;
- }
-}
-
-/// Catch up the time drift with configurably slower decay.
-static inline void update_time(struct load_cl *l, const uint32_t time_now,
- const struct decay_config *decay)
-{
- uint32_t ticks;
- uint32_t time_last = atomic_load_explicit(&l->time, memory_order_relaxed);
- do {
- ticks = time_now - time_last;
- if (__builtin_expect(!ticks, true)) // we optimize for time not advancing
- return;
- // We accept some desynchronization of time_now (e.g. from different threads).
- if (ticks > (uint32_t)-1024)
- return;
- } while (!atomic_compare_exchange_weak_explicit(&l->time, &time_last, time_now, memory_order_relaxed, memory_order_relaxed));
-
- // If we passed here, we have acquired a time difference we are responsible for.
-
- // Don't bother with complex computations if lots of ticks have passed. (little to no speed-up)
- if (ticks > decay->max_ticks) {
- memset(l->loads, 0, sizeof(l->loads));
- return;
- }
-
- uint32_t mult;
- if (__builtin_expect(ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache), 1)) {
- mult = decay->mult_cache[ticks];
- } else {
- mult = exp2(32 - decay->shift_bits * ticks) + 0.5;
- }
-
- for (int i = 0; i < LOADS_LEN; ++i) {
- // We perform decay for the acquired time difference; decays from different threads are commutative.
- _Atomic uint16_t *load_at = (_Atomic uint16_t *)&l->loads[i];
- uint16_t l1, load_orig = atomic_load_explicit(load_at, memory_order_relaxed);
- const uint16_t rnd = rand_bits(16);
- do {
- uint64_t m = (((uint64_t)load_orig << 16)) * mult;
- m = (m >> 32) + ((m >> 31) & 1);
- l1 = (m >> 16) + (rnd < (uint16_t)m);
- } while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, l1, memory_order_relaxed, memory_order_relaxed));
- }
-}
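For orientation while reading the move below: decay_initialize() turns max_decay into a fractional per-tick bit shift and caches per-tick multipliers in 32-bit fixed point. A standalone sketch of that arithmetic; LIMIT and MAX_DECAY are illustrative stand-ins for KRU_LIMIT and the caller's max_decay:

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        const double LIMIT = 1 << 16;     // stand-in for KRU_LIMIT
        const double MAX_DECAY = 1 << 10; // stand-in for max_decay

        // One tick decays a full load (LIMIT - 1) down to
        // (LIMIT - 1 - MAX_DECAY), i.e. this fractional right shift:
        double shift_bits = log2(LIMIT - 1) - log2(LIMIT - 1 - MAX_DECAY);

        // Once ~18 bits have been shifted out, a 16-bit load is zero.
        uint32_t max_ticks = 18 / shift_bits;
        printf("shift_bits=%f max_ticks=%u\n", shift_bits, max_ticks);

        for (uint32_t ticks = 1; ticks <= 4; ticks++) {
            // Multiplier in 0.32 fixed point, as in decay->mult_cache[].
            uint32_t mult = exp2(32 - shift_bits * ticks) + 0.5;
            printf("ticks=%u mult=%u factor=%.6f\n",
                   ticks, mult, mult / exp2(32));
        }
        return 0;
    }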
diff --git a/lib/kru-generic.c b/lib/kru-generic.c
index 8d122fd8..a6e4f319 100644
--- a/lib/kru-generic.c
+++ b/lib/kru-generic.c
@@ -15,7 +15,7 @@
*/
#include "lib/defines.h"
-#include "./kru.inc.c"
+#include "./kru.inc.c" // NOLINT(bugprone-suspicious-include)
KR_EXPORT
const struct kru_api KRU_GENERIC = KRU_API_INITIALIZER;
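KRU_GENERIC and KRU_AVX2 are two builds of the same kru.inc.c, so a caller picks one struct kru_api table at run time. The actual selection logic in knot-resolver is outside this diff; a hypothetical dispatch could look like this (assuming kru.h declares struct kru_api, and using the GCC/Clang __builtin_cpu_supports extension):

    #include "./kru.h"

    extern const struct kru_api KRU_GENERIC;
    extern const struct kru_api KRU_AVX2;

    static const struct kru_api *kru_select(void)
    {
        // Prefer the AVX2 build when the CPU supports it.
        return __builtin_cpu_supports("avx2") ? &KRU_AVX2 : &KRU_GENERIC;
    }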
diff --git a/lib/kru.inc.c b/lib/kru.inc.c
index 5f630fb7..6d75fd7e 100644
--- a/lib/kru.inc.c
+++ b/lib/kru.inc.c
@@ -46,30 +46,9 @@ Size (`loads_bits` = log2 length):
#include <stdbool.h>
#include <stddef.h>
#include <string.h>
+#include <math.h>
#include "./kru.h"
-
-/// Block of loads sharing the same time, so that we're more space-efficient.
-/// It's exactly a single cache line.
-struct load_cl {
- _Atomic uint32_t time;
- #define LOADS_LEN 15
- uint16_t ids[LOADS_LEN];
- uint16_t loads[LOADS_LEN];
-} ALIGNED_CPU_CACHE;
-static_assert(64 == sizeof(struct load_cl), "bad size of struct load_cl");
-
-inline static uint64_t rand_bits(unsigned int bits) {
- static _Thread_local uint64_t state = 3723796604792068981ull;
- const uint64_t prime1 = 11737314301796036329ull;
- const uint64_t prime2 = 3107264277052274849ull;
- state = prime1 * state + prime2;
- //return state & ((1 << bits) - 1);
- return state >> (64 - bits);
-}
-
-#include "./kru-decay.inc.c"
-
#include "contrib/ucw/lib.h"
#include "libdnssec/error.h"
#include "libdnssec/random.h"
@@ -92,6 +71,27 @@ typedef uint64_t hash_t;
#include <x86intrin.h>
#endif
+/// Block of loads sharing the same time, so that we're more space-efficient.
+/// It's exactly a single cache line.
+struct load_cl {
+ _Atomic uint32_t time;
+ #define LOADS_LEN 15
+ uint16_t ids[LOADS_LEN];
+ uint16_t loads[LOADS_LEN];
+} ALIGNED_CPU_CACHE;
+static_assert(64 == sizeof(struct load_cl), "bad size of struct load_cl");
+
+/// Parametrization for speed of decay.
+struct decay_config {
+ /// Bit shift per tick, fractional
+ double shift_bits;
+
+ /// Ticks to get zero loads
+ uint32_t max_ticks;
+
+ uint32_t mult_cache[32];
+};
+
struct kru {
#if USE_AES
/// Hashing secret. Random but shared by all users of the table.
@@ -111,6 +111,69 @@ struct kru {
struct load_cl load_cls[][TABLE_COUNT];
};
+inline static uint64_t rand_bits(unsigned int bits) {
+ static _Thread_local uint64_t state = 3723796604792068981ull;
+ const uint64_t prime1 = 11737314301796036329ull;
+ const uint64_t prime2 = 3107264277052274849ull;
+ state = prime1 * state + prime2;
+ //return state & ((1 << bits) - 1);
+ return state >> (64 - bits);
+}
+
+static inline void decay_initialize(struct decay_config *decay, kru_price_t max_decay)
+{
+ decay->shift_bits = log2(KRU_LIMIT - 1) - log2(KRU_LIMIT - 1 - max_decay);
+ decay->max_ticks = 18 / decay->shift_bits;
+
+ decay->mult_cache[0] = 0; // not used
+ for (size_t ticks = 1; ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache); ticks++) {
+ decay->mult_cache[ticks] = exp2(32 - decay->shift_bits * ticks) + 0.5;
+ }
+}
+
+/// Catch up the time drift with configurably slower decay.
+static inline void update_time(struct load_cl *l, const uint32_t time_now,
+ const struct decay_config *decay)
+{
+ uint32_t ticks;
+ uint32_t time_last = atomic_load_explicit(&l->time, memory_order_relaxed);
+ do {
+ ticks = time_now - time_last;
+ if (__builtin_expect(!ticks, true)) // we optimize for time not advancing
+ return;
+ // We accept some desynchronization of time_now (e.g. from different threads).
+ if (ticks > (uint32_t)-1024)
+ return;
+ } while (!atomic_compare_exchange_weak_explicit(&l->time, &time_last, time_now, memory_order_relaxed, memory_order_relaxed));
+
+ // If we passed here, we have acquired a time difference we are responsible for.
+
+ // Don't bother with complex computations if lots of ticks have passed. (little to no speed-up)
+ if (ticks > decay->max_ticks) {
+ memset(l->loads, 0, sizeof(l->loads));
+ return;
+ }
+
+ uint32_t mult;
+ if (__builtin_expect(ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache), 1)) {
+ mult = decay->mult_cache[ticks];
+ } else {
+ mult = exp2(32 - decay->shift_bits * ticks) + 0.5;
+ }
+
+ for (int i = 0; i < LOADS_LEN; ++i) {
+ // We perform decay for the acquired time difference; decays from different threads are commutative.
+ _Atomic uint16_t *load_at = (_Atomic uint16_t *)&l->loads[i];
+ uint16_t l1, load_orig = atomic_load_explicit(load_at, memory_order_relaxed);
+ const uint16_t rnd = rand_bits(16);
+ do {
+ uint64_t m = (((uint64_t)load_orig << 16)) * mult;
+ m = (m >> 32) + ((m >> 31) & 1);
+ l1 = (m >> 16) + (rnd < (uint16_t)m);
+ } while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, l1, memory_order_relaxed, memory_order_relaxed));
+ }
+}
+
/// Convert capacity_log to loads_bits
static inline int32_t capacity2loads(int capacity_log)
{
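The decay loop in update_time() above rounds stochastically so that the expected load stays exact despite 16-bit storage: the product is kept in 16.16 fixed point, rounded half-up at bit 31, and the fractional part becomes the probability of rounding the integer part up against a uniform 16-bit random value. A standalone sketch checking that property; load and mult are illustrative:

    #include <stdint.h>
    #include <stdio.h>

    // Same LCG shape as rand_bits(16) above.
    static uint64_t state = 3723796604792068981ull;
    static uint16_t rnd16(void)
    {
        state = 11737314301796036329ull * state + 3107264277052274849ull;
        return state >> 48;
    }

    int main(void)
    {
        const uint16_t load = 1000;
        const uint32_t mult = 4227858432u; // 0.984375 * 2^32

        uint64_t sum = 0;
        const int runs = 1000000;
        for (int i = 0; i < runs; i++) {
            uint64_t m = ((uint64_t)load << 16) * mult;
            m = (m >> 32) + ((m >> 31) & 1);  // 16.16 fixed point, rounded
            // Integer part, plus 1 with probability = fractional part.
            uint16_t l1 = (m >> 16) + (rnd16() < (uint16_t)m);
            sum += l1;
        }
        printf("average %.3f vs exact %.3f\n",
               (double)sum / runs, load * (4227858432.0 / 4294967296.0));
        return 0;
    }

Over many runs the average converges to 984.375, i.e. the exact load * mult / 2^32.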