rrl: update KRU

author: Daniel Salzman <daniel.salzman@nic.cz> 2024-05-30 19:17:59 +0200
committer: Daniel Salzman <daniel.salzman@nic.cz> 2024-08-12 08:01:55 +0200
commit: 9b42ca0904fd57922c00656ee93c6efff990092f (patch)
tree: 519ab2afa2f0a0accd162f3a0fb9c94b6b01f01f
parent: module: add protocol processing callback (diff)
download: knot-9b42ca0904fd57922c00656ee93c6efff990092f.tar.xz
knot-9b42ca0904fd57922c00656ee93c6efff990092f.zip
7 files changed, 174 insertions, 133 deletions
diff --git a/Knot.files b/Knot.files
index 22b8d65e1..7e0330174 100644
--- a/Knot.files
+++ b/Knot.files
@@ -286,7 +286,6 @@ src/knot/modules/queryacl/queryacl.c
 src/knot/modules/rrl/functions.c
 src/knot/modules/rrl/functions.h
 src/knot/modules/rrl/kru-avx2.c
-src/knot/modules/rrl/kru-decay.inc.c
 src/knot/modules/rrl/kru-generic.c
 src/knot/modules/rrl/kru.h
 src/knot/modules/rrl/kru.inc.c
diff --git a/src/knot/modules/rrl/Makefile.inc b/src/knot/modules/rrl/Makefile.inc
index 9a26b0571..fbb9eb098 100644
--- a/src/knot/modules/rrl/Makefile.inc
+++ b/src/knot/modules/rrl/Makefile.inc
@@ -4,8 +4,7 @@ knot_modules_rrl_la_SOURCES = knot/modules/rrl/rrl.c \
                               knot/modules/rrl/kru-generic.c \
                               knot/modules/rrl/kru-avx2.c \
                               knot/modules/rrl/kru.h
-noinst_HEADERS =              knot/modules/rrl/kru-decay.inc.c \
-                              knot/modules/rrl/kru.inc.c
+noinst_HEADERS =              knot/modules/rrl/kru.inc.c
 EXTRA_DIST +=                 knot/modules/rrl/rrl.rst
 
 if STATIC_MODULE_rrl
diff --git a/src/knot/modules/rrl/functions.c b/src/knot/modules/rrl/functions.c
index a2673dba4..fcee6c3ae 100644
--- a/src/knot/modules/rrl/functions.c
+++ b/src/knot/modules/rrl/functions.c
@@ -46,7 +46,7 @@ struct rrl_table {
 	kru_price_t v6_prices[RRL_V6_PREFIXES_CNT];
 	uint32_t log_period;
 	_Atomic uint32_t log_time;
-	uint8_t kru[] ALIGNED(64);
+	_Alignas(64) uint8_t kru[];
 };
 
 static void addr_tostr(char *dst, size_t maxlen, const struct sockaddr_storage *ss)
@@ -132,20 +132,20 @@ int rrl_query(rrl_table_t *rrl, const struct sockaddr_storage *remote, knotd_mod
 	clock_gettime(CLOCK_MONOTONIC_COARSE, &now_ts);
 	uint32_t now = now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000;
 
-	uint8_t key[16] ALIGNED(16) = {0, };
 	uint8_t limited_prefix;
+	_Alignas(16) uint8_t key[16] = { 0 };
 	if (remote->ss_family == AF_INET6) {
 		struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)remote;
 		memcpy(key, &ipv6->sin6_addr, 16);
 
 		limited_prefix = KRU.limited_multi_prefix_or((struct kru *)rrl->kru, now,
-				1, key, RRL_V6_PREFIXES, rrl->v6_prices, RRL_V6_PREFIXES_CNT);
+				1, key, RRL_V6_PREFIXES, rrl->v6_prices, RRL_V6_PREFIXES_CNT, NULL);
 	} else {
 		struct sockaddr_in *ipv4 = (struct sockaddr_in *)remote;
 		memcpy(key, &ipv4->sin_addr, 4);
 
 		limited_prefix = KRU.limited_multi_prefix_or((struct kru *)rrl->kru, now,
-				0, key, RRL_V4_PREFIXES, rrl->v4_prices, RRL_V4_PREFIXES_CNT);
+				0, key, RRL_V4_PREFIXES, rrl->v4_prices, RRL_V4_PREFIXES_CNT, NULL);
 	}
 
 	uint32_t log_time_orig = atomic_load_explicit(&rrl->log_time, memory_order_relaxed);
diff --git a/src/knot/modules/rrl/kru-decay.inc.c b/src/knot/modules/rrl/kru-decay.inc.c
deleted file mode 100644
index 002007bb6..000000000
--- a/src/knot/modules/rrl/kru-decay.inc.c
+++ /dev/null
@@ -1,82 +0,0 @@
-/*  Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
-
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <math.h>
-
-/// Parametrization for speed of decay.
-struct decay_config {
-	/// Bit shift per tick, fractional
-	double shift_bits;
-
-	/// Ticks to get zero loads
-	uint32_t max_ticks;
-
-	uint32_t mult_cache[32];
-};
-
-static inline void decay_initialize(struct decay_config *decay, kru_price_t max_decay)
-{
-	decay->shift_bits = log2(KRU_LIMIT - 1) - log2(KRU_LIMIT - 1 - max_decay);
-	decay->max_ticks = 18 / decay->shift_bits;
-
-	decay->mult_cache[0] = 0;  // not used
-	for (size_t ticks = 1; ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache); ticks++) {
-		decay->mult_cache[ticks] = exp2(32 - decay->shift_bits * ticks) + 0.5;
-	}
-}
-
-/// Catch up the time drift with configurably slower decay.
-static inline void update_time(struct load_cl *l, const uint32_t time_now,
-			const struct decay_config *decay)
-{
-	uint32_t ticks;
-	uint32_t time_last = atomic_load_explicit(&l->time, memory_order_relaxed);
-	do {
-		ticks = time_now - time_last;
-		if (__builtin_expect(!ticks, true)) // we optimize for time not advancing
-			return;
-		// We accept some desynchronization of time_now (e.g. from different threads).
-		if (ticks > (uint32_t)-1024)
-			return;
-	} while (!atomic_compare_exchange_weak_explicit(&l->time, &time_last, time_now, memory_order_relaxed, memory_order_relaxed));
-
-	// If we passed here, we have acquired a time difference we are responsibe for.
-
-	// Don't bother with complex computations if lots of ticks have passed. (little to no speed-up)
-	if (ticks > decay->max_ticks) {
-		memset(l->loads, 0, sizeof(l->loads));
-		return;
-	}
-
-	uint32_t mult;
-	if (__builtin_expect(ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache), 1)) {
-		mult = decay->mult_cache[ticks];
-	} else {
-		mult = exp2(32 - decay->shift_bits * ticks) + 0.5;
-	}
-
-	for (int i = 0; i < LOADS_LEN; ++i) {
-		// We perform decay for the acquired time difference; decays from different threads are commutative.
-		_Atomic uint16_t *load_at = (_Atomic uint16_t *)&l->loads[i];
-		uint16_t l1, load_orig = atomic_load_explicit(load_at, memory_order_relaxed);
-		const uint16_t rnd = rand_bits(16);
-		do {
-			uint64_t m = (((uint64_t)load_orig << 16)) * mult;
-			m = (m >> 32) + ((m >> 31) & 1);
-			l1 = (m >> 16) + (rnd < (uint16_t)m);
-		} while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, l1, memory_order_relaxed, memory_order_relaxed));
-	}
-}
diff --git a/src/knot/modules/rrl/kru.h b/src/knot/modules/rrl/kru.h
index 95731aedb..7eef6c01f 100644
--- a/src/knot/modules/rrl/kru.h
+++ b/src/knot/modules/rrl/kru.h
@@ -20,13 +20,7 @@
 #include <stddef.h>
 #include <stdint.h>
 
-#if __GNUC__ >= 4 || __clang_major__ >= 4
-	#define ALIGNED_CPU_CACHE __attribute__((aligned(64)))
-	#define ALIGNED(_bytes)   __attribute__((aligned(_bytes)))
-#else
-	#define ALIGNED_CPU_CACHE
-	#define ALIGNED(_bytes)
-#endif
+#define ALIGNED_CPU_CACHE _Alignas(64)
 
 // An unsigned integral type used for prices, blocking occurs when sum of prices overflows.
 // Greater than 16-bit type enables randomized fractional incrementing as the internal counters are still 16-bit.
@@ -76,10 +70,21 @@ struct kru_api {
 	/// Updates KRU only if no query is blocked, unless a race condition occurs --
 	/// in such a case all longer prefixes might have been updated.
 	/// The key of i-th query consists of prefixes[i] bits of key, prefixes[i], and namespace.
+	/// If zero is returned, *max_load_out (unless NULL) is set to
+	/// the maximum of final values of the involved counters normalized to the limit 2^16.
 	uint8_t (*limited_multi_prefix_or)(struct kru *kru, uint32_t time_now,
-			uint8_t namespace, uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt);
+			uint8_t namespace, uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt, uint16_t *max_load_out);
+
+	/// Multiple queries based on different prefixes of a single key.
+	/// Returns the maximum of final values of the involved counters normalized to the limit 2^16
+	/// and stores the corresponding prefix (value in prefixes) to *prefix_out (unless NULL).
+	/// Set prices to NULL to skip updating; otherwise, KRU is always updated, using maximal allowed value on overflow.
+	/// The key of i-th query consists of prefixes[i] bits of key, prefixes[i], and namespace.
+	uint16_t (*load_multi_prefix_max)(struct kru *kru, uint32_t time_now,
+			uint8_t namespace, uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt, uint8_t *prefix_out);
 };
+
 // The functions are stored this way to make it easier to switch
 // implementation based on detected CPU.
 extern struct kru_api KRU;
-extern const struct kru_api KRU_GENERIC, KRU_AVX2; // for tests only
+extern const struct kru_api KRU_GENERIC, KRU_AVX2;
diff --git a/src/knot/modules/rrl/kru.inc.c b/src/knot/modules/rrl/kru.inc.c
index b3a662af3..49e359e8e 100644
--- a/src/knot/modules/rrl/kru.inc.c
+++ b/src/knot/modules/rrl/kru.inc.c
@@ -46,30 +46,10 @@ Size (`loads_bits` = log2 length):
 #include <stdbool.h>
 #include <stddef.h>
 #include <string.h>
+#include <math.h>
 
 #include "./kru.h"
-
-/// Block of loads sharing the same time, so that we're more space-efficient.
-/// It's exactly a single cache line.
-struct load_cl {
-	_Atomic uint32_t time;
-	#define LOADS_LEN 15
-	uint16_t ids[LOADS_LEN];
-	uint16_t loads[LOADS_LEN];
-} ALIGNED_CPU_CACHE;
-static_assert(64 == sizeof(struct load_cl), "bad size of struct load_cl");
-
-inline static uint64_t rand_bits(unsigned int bits) {
-	static _Thread_local uint64_t state = 3723796604792068981ull;
-	const uint64_t prime1 = 11737314301796036329ull;
-	const uint64_t prime2 = 3107264277052274849ull;
-	state = prime1 * state + prime2;
-	//return state & ((1 << bits) - 1);
-	return state >> (64 - bits);
-}
-
-#include "./kru-decay.inc.c"
-
+#include "contrib/macros.h"
 #include "libdnssec/error.h"
 #include "libdnssec/random.h"
 typedef uint64_t hash_t;
@@ -91,11 +71,33 @@ typedef uint64_t hash_t;
 	#include <x86intrin.h>
 #endif
 
+/// Block of loads sharing the same time, so that we're more space-efficient.
+/// It's exactly a single cache line.
+struct load_cl {
+	ALIGNED_CPU_CACHE
+	_Atomic uint32_t time;
+	#define LOADS_LEN 15
+	uint16_t ids[LOADS_LEN];
+	uint16_t loads[LOADS_LEN];
+};
+static_assert(64 == sizeof(struct load_cl), "bad size of struct load_cl");
+
+/// Parametrization for speed of decay.
+struct decay_config {
+	/// Bit shift per tick, fractional
+	double shift_bits;
+
+	/// Ticks to get zero loads
+	uint32_t max_ticks;
+
+	uint32_t mult_cache[32];
+};
+
 struct kru {
 #if USE_AES
 	/// Hashing secret.  Random but shared by all users of the table.
 	/// Let's not make it too large, so that header fits into 64 Bytes.
-	char hash_key[48] ALIGNED(32);
+	_Alignas(32) char hash_key[48];
 #else
 	/// Hashing secret.  Random but shared by all users of the table.
 	SIPHASH_KEY hash_key;
@@ -110,6 +112,70 @@ struct kru {
 	struct load_cl load_cls[][TABLE_COUNT];
 };
 
+inline static uint64_t rand_bits(unsigned int bits)
+{
+	static _Thread_local uint64_t state = 3723796604792068981ull;
+	const uint64_t prime1 = 11737314301796036329ull;
+	const uint64_t prime2 = 3107264277052274849ull;
+	state = prime1 * state + prime2;
+	//return state & ((1 << bits) - 1);
+	return state >> (64 - bits);
+}
+
+static inline void decay_initialize(struct decay_config *decay, kru_price_t max_decay)
+{
+	decay->shift_bits = log2(KRU_LIMIT - 1) - log2(KRU_LIMIT - 1 - max_decay);
+	decay->max_ticks = 18 / decay->shift_bits;
+
+	decay->mult_cache[0] = 0;  // not used
+	for (size_t ticks = 1; ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache); ticks++) {
+		decay->mult_cache[ticks] = exp2(32 - decay->shift_bits * ticks) + 0.5;
+	}
+}
+
+/// Catch up the time drift with configurably slower decay.
+static inline void update_time(struct load_cl *l, const uint32_t time_now,
+			const struct decay_config *decay)
+{
+	uint32_t ticks;
+	uint32_t time_last = atomic_load_explicit(&l->time, memory_order_relaxed);
+	do {
+		ticks = time_now - time_last;
+		if (__builtin_expect(!ticks, true)) // we optimize for time not advancing
+			return;
+		// We accept some desynchronization of time_now (e.g. from different threads).
+		if (ticks > (uint32_t)-1024)
+			return;
+	} while (!atomic_compare_exchange_weak_explicit(&l->time, &time_last, time_now, memory_order_relaxed, memory_order_relaxed));
+
+	// If we passed here, we have acquired a time difference we are responsibe for.
+
+	// Don't bother with complex computations if lots of ticks have passed. (little to no speed-up)
+	if (ticks > decay->max_ticks) {
+		memset(l->loads, 0, sizeof(l->loads));
+		return;
+	}
+
+	uint32_t mult;
+	if (__builtin_expect(ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache), 1)) {
+		mult = decay->mult_cache[ticks];
+	} else {
+		mult = exp2(32 - decay->shift_bits * ticks) + 0.5;
+	}
+
+	for (int i = 0; i < LOADS_LEN; ++i) {
+		// We perform decay for the acquired time difference; decays from different threads are commutative.
+		_Atomic uint16_t *load_at = (_Atomic uint16_t *)&l->loads[i];
+		uint16_t l1, load_orig = atomic_load_explicit(load_at, memory_order_relaxed);
+		const uint16_t rnd = rand_bits(16);
+		do {
+			uint64_t m = (((uint64_t)load_orig << 16)) * mult;
+			m = (m >> 32) + ((m >> 31) & 1);
+			l1 = (m >> 16) + (rnd < (uint16_t)m);
+		} while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, l1, memory_order_relaxed, memory_order_relaxed));
+	}
+}
+
 /// Convert capacity_log to loads_bits
 static inline int32_t capacity2loads(int capacity_log)
 {
@@ -166,6 +232,7 @@ struct query_ctx {
 	kru_price_t price;
 	uint16_t price16, limit16;
 	uint16_t id;
+	uint16_t final_load_value;  // set by kru_limited_update if not blocked
 	uint16_t *load;
 };
 
@@ -205,7 +272,6 @@ static inline void kru_limited_prefetch(struct kru *kru, uint32_t time_now, uint
 	ctx->id = hash;
 }
 
-
 /// Phase 1/3 of a query -- hash, prefetch, ctx init. Based on a bit prefix of one 16-byte key.
 static inline void kru_limited_prefetch_prefix(struct kru *kru, uint32_t time_now, uint8_t namespace, uint8_t key[static 16], uint8_t prefix, kru_price_t price, struct query_ctx *ctx)
 {
@@ -335,15 +401,17 @@ static inline bool kru_limited_fetch(struct kru *kru, struct query_ctx *ctx)
 	}
 #endif
 
+	ctx->final_load_value = 0;
 	return false;
 
 load_found:;
-	return (*ctx->load >= ctx->limit16);
+	ctx->final_load_value = *ctx->load;
+	return (ctx->final_load_value >= ctx->limit16);
 }
 
 /// Phase 3/3 of a query -- state update, return value overrides previous answer in case of race.
-/// Not needed if blocked by fetch phase.
-static inline bool kru_limited_update(struct kru *kru, struct query_ctx *ctx)
+/// Not needed if blocked by fetch phase. If overflow_update is activated, false is always returned.
+static inline bool kru_limited_update(struct kru *kru, struct query_ctx *ctx, bool overflow_update)
 {
 	_Atomic uint16_t *load_at;
 	if (!ctx->load) {
@@ -408,10 +476,20 @@ static inline bool kru_limited_update(struct kru *kru, struct query_ctx *ctx)
 	const uint16_t price = ctx->price16;
 	const uint16_t limit = ctx->limit16;
 	uint16_t load_orig = atomic_load_explicit(load_at, memory_order_relaxed);
+	uint16_t load_new;
 	do {
-		if (load_orig >= limit)
-			return true;
-	} while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, load_orig + price, memory_order_relaxed, memory_order_relaxed));
+		if (load_orig >= limit) {
+			if (overflow_update) {
+				load_new = -1;
+			} else {
+				return true;
+			}
+		} else {
+			load_new = load_orig + price;
+		}
+	} while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, load_new, memory_order_relaxed, memory_order_relaxed));
+
+	ctx->final_load_value = load_new;
 	return false;
 }
 
@@ -429,7 +507,7 @@ static bool kru_limited_multi_or(struct kru *kru, uint32_t time_now, uint8_t **k
 	bool ret = false;
 
 	for (size_t i = 0; i < queries_cnt; i++) {
-		ret |= kru_limited_update(kru, ctx + i);
+		ret |= kru_limited_update(kru, ctx + i, false);
 	}
 
 	return ret;
@@ -450,7 +528,7 @@ static bool kru_limited_multi_or_nobreak(struct kru *kru, uint32_t time_now, uin
 	if (ret) return true;
 
 	for (size_t i = 0; i < queries_cnt; i++) {
-		if (kru_limited_update(kru, ctx + i))
+		if (kru_limited_update(kru, ctx + i, false))
 			ret = true;
 	}
 
@@ -458,7 +536,7 @@ static bool kru_limited_multi_or_nobreak(struct kru *kru, uint32_t time_now, uin
 }
 
 static uint8_t kru_limited_multi_prefix_or(struct kru *kru, uint32_t time_now, uint8_t namespace,
-                                           uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt)
+                                           uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt, uint16_t *max_load_out)
 {
 	struct query_ctx ctx[queries_cnt];
 
@@ -472,13 +550,54 @@ static uint8_t kru_limited_multi_prefix_or(struct kru *kru, uint32_t time_now, u
 	}
 
 	for (int i = queries_cnt - 1; i >= 0; i--) {
-		if (kru_limited_update(kru, ctx + i))
+		if (kru_limited_update(kru, ctx + i, false))
 			return prefixes[i];
 	}
 
+	if (max_load_out) {
+		*max_load_out = 0;
+		for (size_t i = 0; i < queries_cnt; i++) {
+			*max_load_out = MAX(*max_load_out, ctx[i].final_load_value);
+		}
+	}
+
 	return 0;
 }
 
+static uint16_t kru_load_multi_prefix_max(struct kru *kru, uint32_t time_now, uint8_t namespace,
+                                           uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt, uint8_t *prefix_out)
+{
+	struct query_ctx ctx[queries_cnt];
+
+	for (size_t i = 0; i < queries_cnt; i++) {
+		kru_limited_prefetch_prefix(kru, time_now, namespace, key, prefixes[i], (prices ? prices[i] : 0), ctx + i);
+	}
+
+	for (size_t i = 0; i < queries_cnt; i++) {
+		kru_limited_fetch(kru, ctx + i);
+	}
+
+	if (prices) {
+		for (int i = queries_cnt - 1; i >= 0; i--) {
+			kru_limited_update(kru, ctx + i, true);
+		}
+	}
+
+	uint8_t prefix = 0;
+	uint16_t max_load = 0;
+	for (size_t i = 0; i < queries_cnt; i++) {
+		if (max_load < ctx[i].final_load_value) {
+			max_load = ctx[i].final_load_value;
+			prefix = prefixes[i];
+		}
+	}
+	if (prefix_out) {
+		*prefix_out = prefix;
+	}
+
+	return max_load;
+}
+
 /// Update limiting and return true iff it hit the limit instead.
 static bool kru_limited(struct kru *kru, uint32_t time_now, uint8_t key[static 16], kru_price_t price)
 {
@@ -492,4 +611,5 @@ static bool kru_limited(struct kru *kru, uint32_t time_now, uint8_t key[static 1
 	.limited_multi_or = kru_limited_multi_or, \
 	.limited_multi_or_nobreak = kru_limited_multi_or_nobreak, \
 	.limited_multi_prefix_or = kru_limited_multi_prefix_or, \
+	.load_multi_prefix_max = kru_load_multi_prefix_max, \
 }
diff --git a/tests/modules/test_rrl.c b/tests/modules/test_rrl.c
index a8fb6aacf..d948d21b4 100644
--- a/tests/modules/test_rrl.c
+++ b/tests/modules/test_rrl.c
@@ -75,7 +75,7 @@ struct kru_generic {
 	// ...
 };
 struct kru_avx2 {
-	char hash_key[48] ALIGNED(32);
+	_Alignas(32) char hash_key[48];
 	// ...
 };
author	Daniel Salzman <daniel.salzman@nic.cz>	2024-05-30 19:17:59 +0200
committer	Daniel Salzman <daniel.salzman@nic.cz>	2024-08-12 08:01:55 +0200
commit	9b42ca0904fd57922c00656ee93c6efff990092f (patch)
tree	519ab2afa2f0a0accd162f3a0fb9c94b6b01f01f
parent	module: add protocol processing callback (diff)
download	knot-9b42ca0904fd57922c00656ee93c6efff990092f.tar.xz knot-9b42ca0904fd57922c00656ee93c6efff990092f.zip