diff options
author | Daniel Salzman <daniel.salzman@nic.cz> | 2024-05-30 19:17:59 +0200 |
---|---|---|
committer | Daniel Salzman <daniel.salzman@nic.cz> | 2024-08-12 08:01:55 +0200 |
commit | 9b42ca0904fd57922c00656ee93c6efff990092f (patch) | |
tree | 519ab2afa2f0a0accd162f3a0fb9c94b6b01f01f | |
parent | module: add protocol processing callback (diff) | |
download | knot-9b42ca0904fd57922c00656ee93c6efff990092f.tar.xz knot-9b42ca0904fd57922c00656ee93c6efff990092f.zip |
rrl: update KRU
-rw-r--r-- | Knot.files | 1 | ||||
-rw-r--r-- | src/knot/modules/rrl/Makefile.inc | 3 | ||||
-rw-r--r-- | src/knot/modules/rrl/functions.c | 8 | ||||
-rw-r--r-- | src/knot/modules/rrl/kru-decay.inc.c | 82 | ||||
-rw-r--r-- | src/knot/modules/rrl/kru.h | 23 | ||||
-rw-r--r-- | src/knot/modules/rrl/kru.inc.c | 188 | ||||
-rw-r--r-- | tests/modules/test_rrl.c | 2 |
7 files changed, 174 insertions, 133 deletions
diff --git a/Knot.files b/Knot.files index 22b8d65e1..7e0330174 100644 --- a/Knot.files +++ b/Knot.files @@ -286,7 +286,6 @@ src/knot/modules/queryacl/queryacl.c src/knot/modules/rrl/functions.c src/knot/modules/rrl/functions.h src/knot/modules/rrl/kru-avx2.c -src/knot/modules/rrl/kru-decay.inc.c src/knot/modules/rrl/kru-generic.c src/knot/modules/rrl/kru.h src/knot/modules/rrl/kru.inc.c diff --git a/src/knot/modules/rrl/Makefile.inc b/src/knot/modules/rrl/Makefile.inc index 9a26b0571..fbb9eb098 100644 --- a/src/knot/modules/rrl/Makefile.inc +++ b/src/knot/modules/rrl/Makefile.inc @@ -4,8 +4,7 @@ knot_modules_rrl_la_SOURCES = knot/modules/rrl/rrl.c \ knot/modules/rrl/kru-generic.c \ knot/modules/rrl/kru-avx2.c \ knot/modules/rrl/kru.h -noinst_HEADERS = knot/modules/rrl/kru-decay.inc.c \ - knot/modules/rrl/kru.inc.c +noinst_HEADERS = knot/modules/rrl/kru.inc.c EXTRA_DIST += knot/modules/rrl/rrl.rst if STATIC_MODULE_rrl diff --git a/src/knot/modules/rrl/functions.c b/src/knot/modules/rrl/functions.c index a2673dba4..fcee6c3ae 100644 --- a/src/knot/modules/rrl/functions.c +++ b/src/knot/modules/rrl/functions.c @@ -46,7 +46,7 @@ struct rrl_table { kru_price_t v6_prices[RRL_V6_PREFIXES_CNT]; uint32_t log_period; _Atomic uint32_t log_time; - uint8_t kru[] ALIGNED(64); + _Alignas(64) uint8_t kru[]; }; static void addr_tostr(char *dst, size_t maxlen, const struct sockaddr_storage *ss) @@ -132,20 +132,20 @@ int rrl_query(rrl_table_t *rrl, const struct sockaddr_storage *remote, knotd_mod clock_gettime(CLOCK_MONOTONIC_COARSE, &now_ts); uint32_t now = now_ts.tv_sec * 1000 + now_ts.tv_nsec / 1000000; - uint8_t key[16] ALIGNED(16) = {0, }; uint8_t limited_prefix; + _Alignas(16) uint8_t key[16] = { 0 }; if (remote->ss_family == AF_INET6) { struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)remote; memcpy(key, &ipv6->sin6_addr, 16); limited_prefix = KRU.limited_multi_prefix_or((struct kru *)rrl->kru, now, - 1, key, RRL_V6_PREFIXES, rrl->v6_prices, RRL_V6_PREFIXES_CNT); + 1, key, RRL_V6_PREFIXES, rrl->v6_prices, RRL_V6_PREFIXES_CNT, NULL); } else { struct sockaddr_in *ipv4 = (struct sockaddr_in *)remote; memcpy(key, &ipv4->sin_addr, 4); limited_prefix = KRU.limited_multi_prefix_or((struct kru *)rrl->kru, now, - 0, key, RRL_V4_PREFIXES, rrl->v4_prices, RRL_V4_PREFIXES_CNT); + 0, key, RRL_V4_PREFIXES, rrl->v4_prices, RRL_V4_PREFIXES_CNT, NULL); } uint32_t log_time_orig = atomic_load_explicit(&rrl->log_time, memory_order_relaxed); diff --git a/src/knot/modules/rrl/kru-decay.inc.c b/src/knot/modules/rrl/kru-decay.inc.c deleted file mode 100644 index 002007bb6..000000000 --- a/src/knot/modules/rrl/kru-decay.inc.c +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (C) 2024 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <https://www.gnu.org/licenses/>. - */ - -#include <math.h> - -/// Parametrization for speed of decay. -struct decay_config { - /// Bit shift per tick, fractional - double shift_bits; - - /// Ticks to get zero loads - uint32_t max_ticks; - - uint32_t mult_cache[32]; -}; - -static inline void decay_initialize(struct decay_config *decay, kru_price_t max_decay) -{ - decay->shift_bits = log2(KRU_LIMIT - 1) - log2(KRU_LIMIT - 1 - max_decay); - decay->max_ticks = 18 / decay->shift_bits; - - decay->mult_cache[0] = 0; // not used - for (size_t ticks = 1; ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache); ticks++) { - decay->mult_cache[ticks] = exp2(32 - decay->shift_bits * ticks) + 0.5; - } -} - -/// Catch up the time drift with configurably slower decay. -static inline void update_time(struct load_cl *l, const uint32_t time_now, - const struct decay_config *decay) -{ - uint32_t ticks; - uint32_t time_last = atomic_load_explicit(&l->time, memory_order_relaxed); - do { - ticks = time_now - time_last; - if (__builtin_expect(!ticks, true)) // we optimize for time not advancing - return; - // We accept some desynchronization of time_now (e.g. from different threads). - if (ticks > (uint32_t)-1024) - return; - } while (!atomic_compare_exchange_weak_explicit(&l->time, &time_last, time_now, memory_order_relaxed, memory_order_relaxed)); - - // If we passed here, we have acquired a time difference we are responsibe for. - - // Don't bother with complex computations if lots of ticks have passed. (little to no speed-up) - if (ticks > decay->max_ticks) { - memset(l->loads, 0, sizeof(l->loads)); - return; - } - - uint32_t mult; - if (__builtin_expect(ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache), 1)) { - mult = decay->mult_cache[ticks]; - } else { - mult = exp2(32 - decay->shift_bits * ticks) + 0.5; - } - - for (int i = 0; i < LOADS_LEN; ++i) { - // We perform decay for the acquired time difference; decays from different threads are commutative. - _Atomic uint16_t *load_at = (_Atomic uint16_t *)&l->loads[i]; - uint16_t l1, load_orig = atomic_load_explicit(load_at, memory_order_relaxed); - const uint16_t rnd = rand_bits(16); - do { - uint64_t m = (((uint64_t)load_orig << 16)) * mult; - m = (m >> 32) + ((m >> 31) & 1); - l1 = (m >> 16) + (rnd < (uint16_t)m); - } while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, l1, memory_order_relaxed, memory_order_relaxed)); - } -} diff --git a/src/knot/modules/rrl/kru.h b/src/knot/modules/rrl/kru.h index 95731aedb..7eef6c01f 100644 --- a/src/knot/modules/rrl/kru.h +++ b/src/knot/modules/rrl/kru.h @@ -20,13 +20,7 @@ #include <stddef.h> #include <stdint.h> -#if __GNUC__ >= 4 || __clang_major__ >= 4 - #define ALIGNED_CPU_CACHE __attribute__((aligned(64))) - #define ALIGNED(_bytes) __attribute__((aligned(_bytes))) -#else - #define ALIGNED_CPU_CACHE - #define ALIGNED(_bytes) -#endif +#define ALIGNED_CPU_CACHE _Alignas(64) // An unsigned integral type used for prices, blocking occurs when sum of prices overflows. // Greater than 16-bit type enables randomized fractional incrementing as the internal counters are still 16-bit. @@ -76,10 +70,21 @@ struct kru_api { /// Updates KRU only if no query is blocked, unless a race condition occurs -- /// in such a case all longer prefixes might have been updated. /// The key of i-th query consists of prefixes[i] bits of key, prefixes[i], and namespace. + /// If zero is returned, *max_load_out (unless NULL) is set to + /// the maximum of final values of the involved counters normalized to the limit 2^16. uint8_t (*limited_multi_prefix_or)(struct kru *kru, uint32_t time_now, - uint8_t namespace, uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt); + uint8_t namespace, uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt, uint16_t *max_load_out); + + /// Multiple queries based on different prefixes of a single key. + /// Returns the maximum of final values of the involved counters normalized to the limit 2^16 + /// and stores the corresponding prefix (value in prefixes) to *prefix_out (unless NULL). + /// Set prices to NULL to skip updating; otherwise, KRU is always updated, using maximal allowed value on overflow. + /// The key of i-th query consists of prefixes[i] bits of key, prefixes[i], and namespace. + uint16_t (*load_multi_prefix_max)(struct kru *kru, uint32_t time_now, + uint8_t namespace, uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt, uint8_t *prefix_out); }; + // The functions are stored this way to make it easier to switch // implementation based on detected CPU. extern struct kru_api KRU; -extern const struct kru_api KRU_GENERIC, KRU_AVX2; // for tests only +extern const struct kru_api KRU_GENERIC, KRU_AVX2; diff --git a/src/knot/modules/rrl/kru.inc.c b/src/knot/modules/rrl/kru.inc.c index b3a662af3..49e359e8e 100644 --- a/src/knot/modules/rrl/kru.inc.c +++ b/src/knot/modules/rrl/kru.inc.c @@ -46,30 +46,10 @@ Size (`loads_bits` = log2 length): #include <stdbool.h> #include <stddef.h> #include <string.h> +#include <math.h> #include "./kru.h" - -/// Block of loads sharing the same time, so that we're more space-efficient. -/// It's exactly a single cache line. -struct load_cl { - _Atomic uint32_t time; - #define LOADS_LEN 15 - uint16_t ids[LOADS_LEN]; - uint16_t loads[LOADS_LEN]; -} ALIGNED_CPU_CACHE; -static_assert(64 == sizeof(struct load_cl), "bad size of struct load_cl"); - -inline static uint64_t rand_bits(unsigned int bits) { - static _Thread_local uint64_t state = 3723796604792068981ull; - const uint64_t prime1 = 11737314301796036329ull; - const uint64_t prime2 = 3107264277052274849ull; - state = prime1 * state + prime2; - //return state & ((1 << bits) - 1); - return state >> (64 - bits); -} - -#include "./kru-decay.inc.c" - +#include "contrib/macros.h" #include "libdnssec/error.h" #include "libdnssec/random.h" typedef uint64_t hash_t; @@ -91,11 +71,33 @@ typedef uint64_t hash_t; #include <x86intrin.h> #endif +/// Block of loads sharing the same time, so that we're more space-efficient. +/// It's exactly a single cache line. +struct load_cl { + ALIGNED_CPU_CACHE + _Atomic uint32_t time; + #define LOADS_LEN 15 + uint16_t ids[LOADS_LEN]; + uint16_t loads[LOADS_LEN]; +}; +static_assert(64 == sizeof(struct load_cl), "bad size of struct load_cl"); + +/// Parametrization for speed of decay. +struct decay_config { + /// Bit shift per tick, fractional + double shift_bits; + + /// Ticks to get zero loads + uint32_t max_ticks; + + uint32_t mult_cache[32]; +}; + struct kru { #if USE_AES /// Hashing secret. Random but shared by all users of the table. /// Let's not make it too large, so that header fits into 64 Bytes. - char hash_key[48] ALIGNED(32); + _Alignas(32) char hash_key[48]; #else /// Hashing secret. Random but shared by all users of the table. SIPHASH_KEY hash_key; @@ -110,6 +112,70 @@ struct kru { struct load_cl load_cls[][TABLE_COUNT]; }; +inline static uint64_t rand_bits(unsigned int bits) +{ + static _Thread_local uint64_t state = 3723796604792068981ull; + const uint64_t prime1 = 11737314301796036329ull; + const uint64_t prime2 = 3107264277052274849ull; + state = prime1 * state + prime2; + //return state & ((1 << bits) - 1); + return state >> (64 - bits); +} + +static inline void decay_initialize(struct decay_config *decay, kru_price_t max_decay) +{ + decay->shift_bits = log2(KRU_LIMIT - 1) - log2(KRU_LIMIT - 1 - max_decay); + decay->max_ticks = 18 / decay->shift_bits; + + decay->mult_cache[0] = 0; // not used + for (size_t ticks = 1; ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache); ticks++) { + decay->mult_cache[ticks] = exp2(32 - decay->shift_bits * ticks) + 0.5; + } +} + +/// Catch up the time drift with configurably slower decay. +static inline void update_time(struct load_cl *l, const uint32_t time_now, + const struct decay_config *decay) +{ + uint32_t ticks; + uint32_t time_last = atomic_load_explicit(&l->time, memory_order_relaxed); + do { + ticks = time_now - time_last; + if (__builtin_expect(!ticks, true)) // we optimize for time not advancing + return; + // We accept some desynchronization of time_now (e.g. from different threads). + if (ticks > (uint32_t)-1024) + return; + } while (!atomic_compare_exchange_weak_explicit(&l->time, &time_last, time_now, memory_order_relaxed, memory_order_relaxed)); + + // If we passed here, we have acquired a time difference we are responsibe for. + + // Don't bother with complex computations if lots of ticks have passed. (little to no speed-up) + if (ticks > decay->max_ticks) { + memset(l->loads, 0, sizeof(l->loads)); + return; + } + + uint32_t mult; + if (__builtin_expect(ticks < sizeof(decay->mult_cache) / sizeof(*decay->mult_cache), 1)) { + mult = decay->mult_cache[ticks]; + } else { + mult = exp2(32 - decay->shift_bits * ticks) + 0.5; + } + + for (int i = 0; i < LOADS_LEN; ++i) { + // We perform decay for the acquired time difference; decays from different threads are commutative. + _Atomic uint16_t *load_at = (_Atomic uint16_t *)&l->loads[i]; + uint16_t l1, load_orig = atomic_load_explicit(load_at, memory_order_relaxed); + const uint16_t rnd = rand_bits(16); + do { + uint64_t m = (((uint64_t)load_orig << 16)) * mult; + m = (m >> 32) + ((m >> 31) & 1); + l1 = (m >> 16) + (rnd < (uint16_t)m); + } while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, l1, memory_order_relaxed, memory_order_relaxed)); + } +} + /// Convert capacity_log to loads_bits static inline int32_t capacity2loads(int capacity_log) { @@ -166,6 +232,7 @@ struct query_ctx { kru_price_t price; uint16_t price16, limit16; uint16_t id; + uint16_t final_load_value; // set by kru_limited_update if not blocked uint16_t *load; }; @@ -205,7 +272,6 @@ static inline void kru_limited_prefetch(struct kru *kru, uint32_t time_now, uint ctx->id = hash; } - /// Phase 1/3 of a query -- hash, prefetch, ctx init. Based on a bit prefix of one 16-byte key. static inline void kru_limited_prefetch_prefix(struct kru *kru, uint32_t time_now, uint8_t namespace, uint8_t key[static 16], uint8_t prefix, kru_price_t price, struct query_ctx *ctx) { @@ -335,15 +401,17 @@ static inline bool kru_limited_fetch(struct kru *kru, struct query_ctx *ctx) } #endif + ctx->final_load_value = 0; return false; load_found:; - return (*ctx->load >= ctx->limit16); + ctx->final_load_value = *ctx->load; + return (ctx->final_load_value >= ctx->limit16); } /// Phase 3/3 of a query -- state update, return value overrides previous answer in case of race. -/// Not needed if blocked by fetch phase. -static inline bool kru_limited_update(struct kru *kru, struct query_ctx *ctx) +/// Not needed if blocked by fetch phase. If overflow_update is activated, false is always returned. +static inline bool kru_limited_update(struct kru *kru, struct query_ctx *ctx, bool overflow_update) { _Atomic uint16_t *load_at; if (!ctx->load) { @@ -408,10 +476,20 @@ static inline bool kru_limited_update(struct kru *kru, struct query_ctx *ctx) const uint16_t price = ctx->price16; const uint16_t limit = ctx->limit16; uint16_t load_orig = atomic_load_explicit(load_at, memory_order_relaxed); + uint16_t load_new; do { - if (load_orig >= limit) - return true; - } while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, load_orig + price, memory_order_relaxed, memory_order_relaxed)); + if (load_orig >= limit) { + if (overflow_update) { + load_new = -1; + } else { + return true; + } + } else { + load_new = load_orig + price; + } + } while (!atomic_compare_exchange_weak_explicit(load_at, &load_orig, load_new, memory_order_relaxed, memory_order_relaxed)); + + ctx->final_load_value = load_new; return false; } @@ -429,7 +507,7 @@ static bool kru_limited_multi_or(struct kru *kru, uint32_t time_now, uint8_t **k bool ret = false; for (size_t i = 0; i < queries_cnt; i++) { - ret |= kru_limited_update(kru, ctx + i); + ret |= kru_limited_update(kru, ctx + i, false); } return ret; @@ -450,7 +528,7 @@ static bool kru_limited_multi_or_nobreak(struct kru *kru, uint32_t time_now, uin if (ret) return true; for (size_t i = 0; i < queries_cnt; i++) { - if (kru_limited_update(kru, ctx + i)) + if (kru_limited_update(kru, ctx + i, false)) ret = true; } @@ -458,7 +536,7 @@ static bool kru_limited_multi_or_nobreak(struct kru *kru, uint32_t time_now, uin } static uint8_t kru_limited_multi_prefix_or(struct kru *kru, uint32_t time_now, uint8_t namespace, - uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt) + uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt, uint16_t *max_load_out) { struct query_ctx ctx[queries_cnt]; @@ -472,13 +550,54 @@ static uint8_t kru_limited_multi_prefix_or(struct kru *kru, uint32_t time_now, u } for (int i = queries_cnt - 1; i >= 0; i--) { - if (kru_limited_update(kru, ctx + i)) + if (kru_limited_update(kru, ctx + i, false)) return prefixes[i]; } + if (max_load_out) { + *max_load_out = 0; + for (size_t i = 0; i < queries_cnt; i++) { + *max_load_out = MAX(*max_load_out, ctx[i].final_load_value); + } + } + return 0; } +static uint16_t kru_load_multi_prefix_max(struct kru *kru, uint32_t time_now, uint8_t namespace, + uint8_t key[static 16], uint8_t *prefixes, kru_price_t *prices, size_t queries_cnt, uint8_t *prefix_out) +{ + struct query_ctx ctx[queries_cnt]; + + for (size_t i = 0; i < queries_cnt; i++) { + kru_limited_prefetch_prefix(kru, time_now, namespace, key, prefixes[i], (prices ? prices[i] : 0), ctx + i); + } + + for (size_t i = 0; i < queries_cnt; i++) { + kru_limited_fetch(kru, ctx + i); + } + + if (prices) { + for (int i = queries_cnt - 1; i >= 0; i--) { + kru_limited_update(kru, ctx + i, true); + } + } + + uint8_t prefix = 0; + uint16_t max_load = 0; + for (size_t i = 0; i < queries_cnt; i++) { + if (max_load < ctx[i].final_load_value) { + max_load = ctx[i].final_load_value; + prefix = prefixes[i]; + } + } + if (prefix_out) { + *prefix_out = prefix; + } + + return max_load; +} + /// Update limiting and return true iff it hit the limit instead. static bool kru_limited(struct kru *kru, uint32_t time_now, uint8_t key[static 16], kru_price_t price) { @@ -492,4 +611,5 @@ static bool kru_limited(struct kru *kru, uint32_t time_now, uint8_t key[static 1 .limited_multi_or = kru_limited_multi_or, \ .limited_multi_or_nobreak = kru_limited_multi_or_nobreak, \ .limited_multi_prefix_or = kru_limited_multi_prefix_or, \ + .load_multi_prefix_max = kru_load_multi_prefix_max, \ } diff --git a/tests/modules/test_rrl.c b/tests/modules/test_rrl.c index a8fb6aacf..d948d21b4 100644 --- a/tests/modules/test_rrl.c +++ b/tests/modules/test_rrl.c @@ -75,7 +75,7 @@ struct kru_generic { // ... }; struct kru_avx2 { - char hash_key[48] ALIGNED(32); + _Alignas(32) char hash_key[48]; // ... }; |