diff options
author | Jan Hak <jan.hak@nic.cz> | 2021-01-25 16:20:23 +0100 |
---|---|---|
committer | Daniel Salzman <daniel.salzman@nic.cz> | 2021-04-08 18:52:27 +0200 |
commit | 59eb2408f8bd00ff223057f87bdbf0fbc9f799fa (patch) | |
tree | 5e2df5abc357c59d7b7f0a7e770b0faf11e623f7 | |
parent | dnssec: fix little double-free (diff) | |
download | knot-59eb2408f8bd00ff223057f87bdbf0fbc9f799fa.tar.xz knot-59eb2408f8bd00ff223057f87bdbf0fbc9f799fa.zip |
server: add polling wrapper for unix poll and epoll
-rw-r--r-- | Knot.files | 3 | ||||
-rw-r--r-- | configure.ac | 7 | ||||
-rw-r--r-- | src/knot/Makefile.inc | 3 | ||||
-rw-r--r-- | src/knot/common/apoll.h | 79 | ||||
-rw-r--r-- | src/knot/common/epoll_ctx.c | 261 | ||||
-rw-r--r-- | src/knot/common/epoll_ctx.h | 227 | ||||
-rw-r--r-- | src/knot/common/fdset.c | 144 | ||||
-rw-r--r-- | src/knot/common/fdset.h | 151 | ||||
-rw-r--r-- | src/knot/server/tcp-handler.c | 72 | ||||
-rw-r--r-- | src/knot/server/udp-handler.c | 31 | ||||
-rw-r--r-- | src/utils/knotd/main.c | 1 |
11 files changed, 878 insertions, 101 deletions
diff --git a/Knot.files b/Knot.files index 61b6366ec..b0f235857 100644 --- a/Knot.files +++ b/Knot.files @@ -108,6 +108,9 @@ src/knot/catalog/generate.c src/knot/catalog/generate.h src/knot/catalog/interpret.c src/knot/catalog/interpret.h +src/knot/common/apoll.h +src/knot/common/epoll_ctx.c +src/knot/common/epoll_ctx.h src/knot/common/evsched.c src/knot/common/evsched.h src/knot/common/fdset.c diff --git a/configure.ac b/configure.ac index 885986442..42161fcc0 100644 --- a/configure.ac +++ b/configure.ac @@ -639,6 +639,12 @@ AC_CHECK_HEADERS_ONCE([pthread_np.h sys/uio.h bsd/string.h]) AC_CHECK_FUNCS([accept4 clock_gettime fgetln getline initgroups malloc_trim \ setgroups strlcat strlcpy sysctlbyname]) +# Check for a better polling method. +AC_CHECK_FUNCS([epoll_create], [ + AC_DEFINE([HAVE_EPOLL], [1], [epoll available]) + poll_method=epoll], [poll_method=poll] +) + # Check for robust memory cleanup implementations. AC_CHECK_FUNC([explicit_bzero], [ AC_DEFINE([HAVE_EXPLICIT_BZERO], [1], [explicit_bzero available]) @@ -738,6 +744,7 @@ result_msg_base=" Knot DNS $VERSION Use recvmmsg: ${enable_recvmmsg} Use SO_REUSEPORT(_LB): ${enable_reuseport} XDP support: ${enable_xdp} + Polling method: ${poll_method} Memory allocator: ${with_memory_allocator} Fast zone parser: ${enable_fastparser} Utilities with IDN: ${with_libidn} diff --git a/src/knot/Makefile.inc b/src/knot/Makefile.inc index 5cc44fb33..88b1f13e2 100644 --- a/src/knot/Makefile.inc +++ b/src/knot/Makefile.inc @@ -119,6 +119,9 @@ libknotd_la_SOURCES = \ knot/query/query.h \ knot/query/requestor.c \ knot/query/requestor.h \ + knot/common/apoll.h \ + knot/common/epoll_ctx.c \ + knot/common/epoll_ctx.h \ knot/common/evsched.c \ knot/common/evsched.h \ knot/common/fdset.c \ diff --git a/src/knot/common/apoll.h b/src/knot/common/apoll.h new file mode 100644 index 000000000..8c666a191 --- /dev/null +++ b/src/knot/common/apoll.h @@ -0,0 +1,79 @@ +/* Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#pragma once + +#ifdef HAVE_EPOLL + #include "knot/common/epoll_ctx.h" + + #define apoll_t epoll_ctx_t + #define apoll_it_t epoll_it_t + #define apoll_sweep_state epoll_ctx_sweep_state_t + + #define APOLL_CTX_KEEP EPOLL_CTX_KEEP + #define APOLL_CTX_SWEEP EPOLL_CTX_SWEEP + + #define APOLL_POLLIN EPOLLIN + + #define APOLL_CTX_INIT_SIZE EPOLL_INIT_SIZE + + #define apoll_init(ctx, size) epoll_ctx_init(ctx, size) + #define apoll_close(ctx) epoll_ctx_close(ctx) + #define apoll_clear(ctx) epoll_ctx_clear(ctx) + #define apoll_add(ctx, fd, events, usrctx) epoll_ctx_add(ctx, fd, events, usrctx) + #define apoll_set_watchdog(ctx, idx, interval) epoll_ctx_set_watchdog(ctx, idx, interval) + #define apoll_get_length(ctx) epoll_ctx_get_length(ctx) + #define apoll_get_fd(ctx, idx) epoll_ctx_get_fd(ctx, idx) + #define apoll_poll(ctx, it, offset, timeout) epoll_ctx_wait(ctx, it, offset, timeout) + #define apoll_sweep(ctx, cb, data) epoll_ctx_sweep(ctx, cb, data) + #define apoll_it_next(it) epoll_it_next(it) + #define apoll_it_done(it) epoll_it_done(it) + #define apoll_it_remove(it) epoll_it_remove(it) + #define apoll_it_get_fd(it) epoll_it_get_fd(it) + #define apoll_it_get_idx(it) epoll_it_get_idx(it) + #define apoll_it_ev_is_pollin(it) epoll_it_ev_is_pollin(it) + #define apoll_it_ev_is_error(it) epoll_it_ev_is_err(it) +#else + #include "knot/common/fdset.h" + + #define apoll_t fdset_t + #define apoll_it_t fdset_it_t + #define apoll_sweep_state fdset_sweep_state_t + + #define APOLL_CTX_KEEP FDSET_KEEP + #define APOLL_CTX_SWEEP FDSET_SWEEP + + #define APOLL_POLLIN POLLIN + + #define APOLL_CTX_INIT_SIZE FDSET_INIT_SIZE + + #define apoll_init(ctx, size) fdset_init(ctx, size) + #define apoll_close(ctx) + #define apoll_clear(ctx) fdset_clear(ctx) + #define apoll_add(ctx, fd, events, usrctx) fdset_add(ctx, fd, events, usrctx) + #define apoll_set_watchdog(ctx, idx, interval) fdset_set_watchdog(ctx, idx, interval) + #define apoll_get_length(ctx) fdset_get_length(ctx) + #define apoll_get_fd(ctx, idx) fdset_get_fd(ctx, idx) + #define apoll_poll(ctx, it, offset, timeout) fdset_poll(ctx, it, offset, timeout) + #define apoll_sweep(ctx, cb, data) fdset_sweep(ctx, cb, data) + #define apoll_it_next(it) fdset_it_next(it) + #define apoll_it_done(it) fdset_it_done(it) + #define apoll_it_remove(it) fdset_it_remove(it) + #define apoll_it_get_fd(it) fdset_it_get_fd(it) + #define apoll_it_get_idx(it) fdset_it_get_idx(it) + #define apoll_it_ev_is_pollin(it) fdset_it_ev_is_pollin(it) + #define apoll_it_ev_is_error(it) fdset_it_ev_is_err(it) +#endif diff --git a/src/knot/common/epoll_ctx.c b/src/knot/common/epoll_ctx.c new file mode 100644 index 000000000..46ad8f1fa --- /dev/null +++ b/src/knot/common/epoll_ctx.c @@ -0,0 +1,261 @@ +/* Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#ifdef HAVE_EPOLL + +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <time.h> +#include <assert.h> +#include "knot/common/epoll_ctx.h" +#include "contrib/time.h" +#include "libknot/errcode.h" + +/* Realloc memory or return error (part of epoll_ctx_resize). */ +#define MEM_RESIZE(tmp, p, n) \ + if ((tmp = realloc((p), (n) * sizeof(*p))) == NULL) \ + return KNOT_ENOMEM; \ + (p) = tmp; + +static int epoll_ctx_resize(epoll_ctx_t *set, const unsigned size) +{ + void *tmp = NULL; + MEM_RESIZE(tmp, set->usrctx, size); + MEM_RESIZE(tmp, set->timeout, size); + MEM_RESIZE(tmp, set->ev, size); + set->size = size; + return KNOT_EOK; +} + +int epoll_ctx_init(epoll_ctx_t *ctx, const unsigned size) +{ + if (ctx == NULL) { + return KNOT_EINVAL; + } + + memset(ctx, 0, sizeof(epoll_ctx_t)); + + ctx->efd = epoll_create1(0); + if (ctx->efd < 0) { + return KNOT_EMFILE; + } + + return epoll_ctx_resize(ctx, size); +} + +int epoll_ctx_clear(epoll_ctx_t* ctx) +{ + if (ctx == NULL) { + return KNOT_EINVAL; + } + + int bck = ctx->efd; + free(ctx->ev); + free(ctx->usrctx); + free(ctx->timeout); + free(ctx->recv_ev); + memset(ctx, 0, sizeof(epoll_ctx_t)); + ctx->efd = bck; + return KNOT_EOK; +} + +void epoll_ctx_close(const epoll_ctx_t* ctx) +{ + close(ctx->efd); +} + +int epoll_ctx_add(epoll_ctx_t *ctx, const int fd, const unsigned events, void *usrctx) +{ + if (ctx == NULL || fd < 0) { + return KNOT_EINVAL; + } + + /* Realloc needed. */ + if (ctx->n == ctx->size && epoll_ctx_resize(ctx, ctx->size + EPOLL_INIT_SIZE)) + return KNOT_ENOMEM; + + /* Initialize. */ + const int i = ctx->n++; + ctx->ev[i].data.fd = fd; + ctx->ev[i].events = events; + ctx->usrctx[i] = usrctx; + ctx->timeout[i] = 0; + struct epoll_event ev = { + .data.u64 = i, + .events = events + }; + epoll_ctl(ctx->efd, EPOLL_CTL_ADD, fd, &ev); + + return i; +} + +static int epoll_ctx_remove(epoll_ctx_t *ctx, const unsigned idx) +{ + if (ctx == NULL || idx >= ctx->n) { + return KNOT_EINVAL; + } + + epoll_ctl(ctx->efd, EPOLL_CTL_DEL, ctx->ev[idx].data.fd, NULL); + const unsigned last = --ctx->n; + /* Nothing else if it is the last one. + * Move last -> i if some remain. */ + if (idx < last) { + ctx->ev[idx] = ctx->ev[last]; + ctx->timeout[idx] = ctx->timeout[last]; + ctx->usrctx[idx] = ctx->usrctx[last]; + struct epoll_event ev = { + .data.u64 = idx, + .events = ctx->ev[idx].events + }; + epoll_ctl(ctx->efd, EPOLL_CTL_MOD, ctx->ev[last].data.fd, &ev); + } + + return KNOT_EOK; +} + +int epoll_ctx_wait(epoll_ctx_t *ctx, epoll_it_t *it, const unsigned offset, const int timeout) +{ + if (ctx->recv_size != ctx->size) { + void *tmp = NULL; + MEM_RESIZE(tmp, ctx->recv_ev, ctx->size); + ctx->recv_size = ctx->size; + } + + it->ctx = ctx; + it->ptr = ctx->recv_ev; + it->offset = offset; + + /* + * NOTE: Can't skip offset without bunch of syscalls!! + * Because of that it waits for `ctx->n` (every socket). Offset is set when TCP + * trotlling is ON. Sometimes it can return with sockets where none of them are + * connection socket, but it should not be common. + * But it can cause problems when adopted in other use-case. + */ + return it->unprocessed = epoll_wait(ctx->efd, ctx->recv_ev, ctx->n, timeout * 1000); +} + +int epoll_ctx_set_watchdog(epoll_ctx_t *ctx, const unsigned idx, const int interval) +{ + if (ctx == NULL || idx >= ctx->n) { + return KNOT_EINVAL; + } + + /* Lift watchdog if interval is negative. */ + if (interval < 0) { + ctx->timeout[idx] = 0; + return KNOT_EOK; + } + + /* Update clock. */ + const struct timespec now = time_now(); + ctx->timeout[idx] = now.tv_sec + interval; /* Only seconds precision. */ + return KNOT_EOK; +} + +int epoll_ctx_get_fd(const epoll_ctx_t *ctx, const unsigned idx) +{ + if (ctx == NULL || idx >= ctx->n) { + return KNOT_EINVAL; + } + + return ctx->ev[idx].data.u64; +} + +unsigned epoll_ctx_get_length(const epoll_ctx_t *ctx) +{ + assert(ctx); + return ctx->n; +} + +int epoll_ctx_sweep(epoll_ctx_t* ctx, const epoll_ctx_sweep_cb_t cb, void *data) +{ + if (ctx == NULL || cb == NULL) { + return KNOT_EINVAL; + } + + /* Get time threshold. */ + const struct timespec now = time_now(); + unsigned idx = 0; + while (idx < ctx->n) { + /* Check sweep state, remove if requested. */ + if (ctx->timeout[idx] > 0 && ctx->timeout[idx] <= now.tv_sec) { + const int fd = epoll_ctx_get_fd(ctx, idx); + if (cb(ctx, fd, data) == EPOLL_CTX_SWEEP) { + if (epoll_ctx_remove(ctx, idx) == KNOT_EOK) { + continue; /* Stay on the index. */ + } + } + } + ++idx; + } + + return KNOT_EOK; +} + +void epoll_it_next(epoll_it_t *it) +{ + do { + it->ptr++; + it->unprocessed--; + } while (it->unprocessed > 0 && epoll_it_get_idx(it) < it->offset); +} + +int epoll_it_done(const epoll_it_t *it) +{ + return it->unprocessed <= 0; +} + +int epoll_it_remove(epoll_it_t *it) +{ + if (it == NULL || it->ctx == NULL) { + return KNOT_EINVAL; + } + epoll_ctx_t *ctx = it->ctx; + const int idx = epoll_it_get_idx(it); + epoll_ctx_remove(ctx, idx); + /* Iterator should return on last valid already processed element. */ + /* On `next` call (in for-loop) will point on first unprocessed. */ + it->ptr--; + return KNOT_EOK; +} + +int epoll_it_get_fd(const epoll_it_t *it) +{ + assert(it != NULL); + return it->ctx->ev[epoll_it_get_idx(it)].data.fd; +} + +unsigned epoll_it_get_idx(const epoll_it_t *it) +{ + assert(it != NULL); + return it->ptr->data.u64; +} + +int epoll_it_ev_is_pollin(const epoll_it_t *it) +{ + assert(it != NULL); + return it->ptr->events & EPOLLIN; +} + +int epoll_it_ev_is_err(const epoll_it_t *it) +{ + assert(it != NULL); + return it->ptr->events & (EPOLLERR|EPOLLHUP); +} + +#endif diff --git a/src/knot/common/epoll_ctx.h b/src/knot/common/epoll_ctx.h new file mode 100644 index 000000000..a8fdc5f66 --- /dev/null +++ b/src/knot/common/epoll_ctx.h @@ -0,0 +1,227 @@ +/* Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/*! + * \brief I/O multiplexing with context and timeouts for each fd. + */ + +#pragma once + +#ifdef HAVE_EPOLL + +#include <stddef.h> +#include <signal.h> +#include <sys/time.h> +#include <sys/epoll.h> + +#define EPOLL_INIT_SIZE 256 /* Resize step. */ + +/*! \brief Set of filedescriptors with associated context and timeouts. */ +typedef struct epoll_ctx { + int efd; /*!< File descriptor of epoll. */ + unsigned n; /*!< Active fds. */ + unsigned size; /*!< Array size (allocated). */ + unsigned recv_size; /*!< Size of array for received events. */ + struct epoll_event *ev; /*!< Epoll event storage for each fd */ + void* *usrctx; /*!< Context for each fd. */ + time_t *timeout; /*!< Timeout for each fd (seconds precision). */ + struct epoll_event *recv_ev; /*!< Array for received events. */ +} epoll_ctx_t; + +typedef struct epoll_it { + epoll_ctx_t *ctx; /*!< Iterator related context. */ + struct epoll_event *ptr; /*!< Pointer on processed event. */ + int offset; /*!< Event index offset. */ + int unprocessed; /*!< Unprocessed events left. */ +} epoll_it_t; + +/*! \brief Mark-and-sweep state. */ +typedef enum epoll_ctx_sweep_state { + EPOLL_CTX_KEEP, + EPOLL_CTX_SWEEP +} epoll_ctx_sweep_state_t; + +/*! \brief Sweep callback (set, index, data) */ +typedef enum epoll_ctx_sweep_state (*epoll_ctx_sweep_cb_t)(epoll_ctx_t*, int, void*); + +/*! + * \brief Initialize epoll_ctx to given size. + * + * \param ctx Target ctx. + * \param size Initial ctx size. + * + * \retval ret == 0 if successful. + * \retval ret < 0 on error. + */ +int epoll_ctx_init(epoll_ctx_t *ctx, const unsigned size); + +/*! + * \brief Clear whole context of epoll_ctx. + * + * \param ctx Target ctx. + * + * \retval ret == 0 if successful. + * \retval ret < 0 on error. + */ +int epoll_ctx_clear(epoll_ctx_t* ctx); + +/*! + * \brief Close epoll related file descriptor. + * + * \param ctx Target ctx. + */ +void epoll_ctx_close(const epoll_ctx_t* ctx); + +/*! + * \brief Add file descriptor to watched ctx. + * + * \param ctx Target ctx. + * \param fd Added file descriptor. + * \param events Mask of watched events. + * \param usrctx Context (optional). + * + * \retval ret >= 0 is index of the added fd. + * \retval ret < 0 on errors. + */ +int epoll_ctx_add(epoll_ctx_t *ctx, const int fd, const unsigned events, void *usrctx); + +/*! + * \brief Wait for receive events. + * + * \param ctx Target ctx. + * \param it Event iterator storage. + * \param offset Index of first event. + * \param timeout Timeout of operation (negative number for unlimited). + * + * \retval ret >= 0 represents number of events received. + * \retval ret < 0 on error. + */ +int epoll_ctx_wait(epoll_ctx_t *ctx, epoll_it_t *it, const unsigned offset, const int timeout); + +/*! + * \brief Set file descriptor watchdog interval. + * + * Set time (interval from now) after which the associated file descriptor + * should be sweeped (see epoll_ctx_sweep). Good example is setting a grace period + * of N seconds between socket activity. If socket is not active within + * <now, now + interval>, it is sweeped and potentially closed. + * + * \param ctx Target ctx. + * \param i Index for the file descriptor. + * \param interval Allowed interval without activity (seconds). + * -1 disables watchdog timer + * + * \retval ret == 0 on success. + * \retval ret < 0 on errors. + */ +int epoll_ctx_set_watchdog(epoll_ctx_t *ctx, const unsigned idx, const int interval); + +/*! + * \brief Returns file descriptor based on index. + * + * \param ctx Target ctx. + * \param idx Index of the file descriptor. + * + * \retval ret >= 0 for file descriptor. + * \retval ret < 0 on errors. + */ +int epoll_ctx_get_fd(const epoll_ctx_t *ctx, const unsigned idx); + +/*! + * \brief Returns number of file descriptors stored in ctx. + * + * \param ctx Target ctx. + * + * \retval Number of descriptors stored + */ +unsigned epoll_ctx_get_length(const epoll_ctx_t *ctx); + +/*! + * \brief Sweep file descriptors with exceeding inactivity period. + * + * \param ctx Target ctx. + * \param cb Callback for sweeped descriptors. + * \param data Pointer to extra data. + * + * \retval number of sweeped descriptors. + * \retval -1 on errors. + */ +int epoll_ctx_sweep(epoll_ctx_t* ctx, const epoll_ctx_sweep_cb_t cb, void *data); + +/*! + * \brief Move iterator on next received event. + * + * \param it Target iterator. + */ +void epoll_it_next(epoll_it_t *it); + +/*! + * \brief Decide if there is more received events. + * + * \param it Target iterator. + * + * \retval Logical flag represents 'done' state. + */ +int epoll_it_done(const epoll_it_t *it); + +/*! + * \brief Remove file descriptor from watched ctx. + * + * \param it Target iterator. + * + * \retval 0 if successful. + * \retval ret < 0 on error. + */ +int epoll_it_remove(epoll_it_t *it); + +/*! + * \brief Get file descriptor of event referenced by iterator. + * + * \param it Target iterator. + * + * \retval ret >= 0 for file descriptor. + * \retval ret < 0 on errors. + */ +int epoll_it_get_fd(const epoll_it_t *it); + +/*! + * \brief Get index of event in set referenced by iterator. + * + * \param it Target iterator. + * + * \retval Index of event. + */ +unsigned epoll_it_get_idx(const epoll_it_t *it); + +/*! + * \brief Decide if event referenced by iterator is POLLIN event. + * + * \param it Target iterator. + * + * \retval Logical flag represents 'POLLIN' event received. + */ +int epoll_it_ev_is_pollin(const epoll_it_t *it); + +/*! + * \brief Decide if event referenced by iterator is error event. + * + * \param it Target iterator. + * + * \retval Logical flag represents error event received. + */ +int epoll_it_ev_is_err(const epoll_it_t *it); + +#endif diff --git a/src/knot/common/fdset.c b/src/knot/common/fdset.c index bb836709a..aa98c845d 100644 --- a/src/knot/common/fdset.c +++ b/src/knot/common/fdset.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2011 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -14,10 +14,13 @@ along with this program. If not, see <https://www.gnu.org/licenses/>. */ +#ifdef ENABLE_POLL + #include <stdlib.h> #include <string.h> #include <unistd.h> #include <time.h> +#include <assert.h> #include "knot/common/fdset.h" #include "contrib/time.h" #include "libknot/errcode.h" @@ -28,7 +31,7 @@ return KNOT_ENOMEM; \ (p) = tmp; -static int fdset_resize(fdset_t *set, unsigned size) +static int fdset_resize(fdset_t *set, const unsigned size) { void *tmp = NULL; MEM_RESIZE(tmp, set->ctx, size); @@ -38,7 +41,7 @@ static int fdset_resize(fdset_t *set, unsigned size) return KNOT_EOK; } -int fdset_init(fdset_t *set, unsigned size) +int fdset_init(fdset_t *set, const unsigned size) { if (set == NULL) { return KNOT_EINVAL; @@ -61,7 +64,7 @@ int fdset_clear(fdset_t* set) return KNOT_EOK; } -int fdset_add(fdset_t *set, int fd, unsigned events, void *ctx) +int fdset_add(fdset_t *set, const int fd, const unsigned events, void *ctx) { if (set == NULL || fd < 0) { return KNOT_EINVAL; @@ -72,7 +75,7 @@ int fdset_add(fdset_t *set, int fd, unsigned events, void *ctx) return KNOT_ENOMEM; /* Initialize. */ - int i = set->n++; + const int i = set->n++; set->pfd[i].fd = fd; set->pfd[i].events = events; set->pfd[i].revents = 0; @@ -83,69 +86,148 @@ int fdset_add(fdset_t *set, int fd, unsigned events, void *ctx) return i; } -int fdset_remove(fdset_t *set, unsigned i) +//TODO should be static, but it is a dependency in tests (? remove from +// tests or something ?) +int fdset_remove(fdset_t *set, const unsigned idx) { - if (set == NULL || i >= set->n) { + if (set == NULL || idx >= set->n) { return KNOT_EINVAL; } - /* Decrement number of elms. */ - --set->n; - + const unsigned last = --set->n; /* Nothing else if it is the last one. * Move last -> i if some remain. */ - unsigned last = set->n; /* Already decremented */ - if (i < last) { - set->pfd[i] = set->pfd[last]; - set->timeout[i] = set->timeout[last]; - set->ctx[i] = set->ctx[last]; + if (idx < last) { + set->pfd[idx] = set->pfd[last]; + set->timeout[idx] = set->timeout[last]; + set->ctx[idx] = set->ctx[last]; } return KNOT_EOK; } -int fdset_set_watchdog(fdset_t* set, int i, int interval) +int fdset_poll(fdset_t *set, fdset_it_t *it, const unsigned offset, const int timeout) +{ + it->ctx = set; + it->idx = offset; + it->unprocessed = poll(&set->pfd[offset], set->n - offset, 1000 * timeout); + while (it->unprocessed > 0 && set->pfd[it->idx].revents == 0) { + it->idx++; + } + return it->unprocessed; +} + +int fdset_set_watchdog(fdset_t* set, const unsigned idx, const int interval) { - if (set == NULL || i >= set->n) { + if (set == NULL || idx >= set->n) { return KNOT_EINVAL; } /* Lift watchdog if interval is negative. */ if (interval < 0) { - set->timeout[i] = 0; + set->timeout[idx] = 0; return KNOT_EOK; } /* Update clock. */ - struct timespec now = time_now(); + const struct timespec now = time_now(); + set->timeout[idx] = now.tv_sec + interval; /* Only seconds precision. */ - set->timeout[i] = now.tv_sec + interval; /* Only seconds precision. */ return KNOT_EOK; } -int fdset_sweep(fdset_t* set, fdset_sweep_cb_t cb, void *data) +int fdset_get_fd(const fdset_t *set, const unsigned idx) { - if (set == NULL || cb == NULL) { + if (set == NULL || idx >= set->n) { return KNOT_EINVAL; } - /* Get time threshold. */ - struct timespec now = time_now(); + return set->pfd[idx].fd; +} - unsigned i = 0; - while (i < set->n) { +unsigned fdset_get_length(const fdset_t *set) +{ + assert(set); + return set->n; +} + +int fdset_sweep(fdset_t* set, const fdset_sweep_cb_t cb, void *data) +{ + if (set == NULL || cb == NULL) { + return KNOT_EINVAL; + } + + /* Get time threshold. */ + const struct timespec now = time_now(); + unsigned idx = 0; + while (idx < set->n) { /* Check sweep state, remove if requested. */ - if (set->timeout[i] > 0 && set->timeout[i] <= now.tv_sec) { - if (cb(set, i, data) == FDSET_SWEEP) { - if (fdset_remove(set, i) == KNOT_EOK) + if (set->timeout[idx] > 0 && set->timeout[idx] <= now.tv_sec) { + const int fd = fdset_get_fd(set, idx); + if (cb(set, fd, data) == FDSET_SWEEP) { + if (fdset_remove(set, idx) == KNOT_EOK) { continue; /* Stay on the index. */ + } } } + ++idx; + } + return KNOT_EOK; +} + +void fdset_it_next(fdset_it_t *it) +{ + if (--it->unprocessed > 0) { + while (it->ctx->pfd[++it->idx].revents == 0); /* nop */ + } +} + +int fdset_it_done(const fdset_it_t *it) +{ + return it->unprocessed <= 0; +} - /* Next descriptor. */ - ++i; +int fdset_it_remove(fdset_it_t *it) +{ + if (it == NULL || it->ctx == NULL) { + return KNOT_EINVAL; } + fdset_t *set = it->ctx; + const unsigned idx = fdset_it_get_idx(it); + fdset_remove(set, idx); + /* Iterator should return on last valid already processed element. */ + /* On `next` call (in for-loop) will point on first unprocessed. */ + --it->idx; return KNOT_EOK; } + +int fdset_it_get_fd(const fdset_it_t *it) +{ + if (it == NULL) { + return KNOT_EINVAL; + } + + return it->ctx->pfd[it->idx].fd; +} + +unsigned fdset_it_get_idx(const fdset_it_t *it) +{ + assert(it); + return it->idx; +} + +int fdset_it_ev_is_pollin(const fdset_it_t *it) +{ + assert(it); + return it->ctx->pfd[it->idx].revents & POLLIN; +} + +int fdset_it_ev_is_err(const fdset_it_t *it) +{ + assert(it); + return it->ctx->pfd[it->idx].revents & (POLLERR|POLLHUP|POLLNVAL); +} + +#endif diff --git a/src/knot/common/fdset.h b/src/knot/common/fdset.h index a5623f758..a75912c96 100644 --- a/src/knot/common/fdset.h +++ b/src/knot/common/fdset.h @@ -1,4 +1,4 @@ -/* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> +/* Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz> This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,6 +20,8 @@ #pragma once +#if !defined(HAVE_EPOLL) + #include <stddef.h> #include <poll.h> #include <sys/time.h> @@ -36,25 +38,40 @@ typedef struct fdset { time_t *timeout; /*!< Timeout for each fd (seconds precision). */ } fdset_t; +/*! \brief State of iterator over received events */ +typedef struct fdset_it { + fdset_t *ctx; /*!< Source fdset_t. */ + unsigned idx; /*!< Index of processed event. */ + int unprocessed; /*!< Unprocessed events left. */ +} fdset_it_t; + /*! \brief Mark-and-sweep state. */ -enum fdset_sweep_state { +typedef enum fdset_sweep_state { FDSET_KEEP, FDSET_SWEEP -}; +} fdset_sweep_state_t; /*! \brief Sweep callback (set, index, data) */ typedef enum fdset_sweep_state (*fdset_sweep_cb_t)(fdset_t*, int, void*); /*! * \brief Initialize fdset to given size. + * + * \param set Target set. + * \param size Initial set size. + * + * \retval ret == 0 if successful. + * \retval ret < 0 on error. */ -int fdset_init(fdset_t *set, unsigned size); +int fdset_init(fdset_t *set, const unsigned size); /*! - * \brief Destroy FDSET. + * \brief Clear whole context of FDSET. * - * \retval 0 if successful. - * \retval -1 on error. + * \param set Target set. + * + * \retval ret == 0 if successful. + * \retval ret < 0 on error. */ int fdset_clear(fdset_t* set); @@ -66,10 +83,10 @@ int fdset_clear(fdset_t* set); * \param events Mask of watched events. * \param ctx Context (optional). * - * \retval index of the added fd if successful. - * \retval -1 on errors. + * \retval ret >= 0 is index of the added fd. + * \retval ret < 0 on errors. */ -int fdset_add(fdset_t *set, int fd, unsigned events, void *ctx); +int fdset_add(fdset_t *set, const int fd, const unsigned events, void *ctx); /*! * \brief Remove file descriptor from watched set. @@ -78,9 +95,24 @@ int fdset_add(fdset_t *set, int fd, unsigned events, void *ctx); * \param i Index of the removed fd. * * \retval 0 if successful. - * \retval -1 on errors. + * \retval ret < 0 on errors. + */ +int fdset_remove(fdset_t *set, const unsigned idx); + +/*! + * \brief Wait for receive events. + * + * Skip events based on offset and set iterator on first event. + * + * \param set Target set. + * \param it Event iterator storage. + * \param offset Index of first event. + * \param timeout Timeout of operation (negative number for unlimited). + * + * \retval ret >= 0 represents number of events received. + * \retval ret < 0 on error. */ -int fdset_remove(fdset_t *set, unsigned i); +int fdset_poll(fdset_t *set, fdset_it_t *it, const unsigned offset, const int timeout); /*! * \brief Set file descriptor watchdog interval. @@ -91,14 +123,34 @@ int fdset_remove(fdset_t *set, unsigned i); * <now, now + interval>, it is sweeped and potentially closed. * * \param set Target set. - * \param i Index for the file descriptor. + * \param idx Index of the file descriptor. * \param interval Allowed interval without activity (seconds). * -1 disables watchdog timer * - * \retval 0 if successful. - * \retval -1 on errors. + * \retval ret == 0 on success. + * \retval ret < 0 on errors. */ -int fdset_set_watchdog(fdset_t* set, int i, int interval); +int fdset_set_watchdog(fdset_t *set, const unsigned idx, const int interval); + +/*! + * \brief Returns file descriptor based on index. + * + * \param set Target set. + * \param idx Index of the file descriptor. + * + * \retval ret >= 0 for file descriptor. + * \retval ret < 0 on errors. + */ +int fdset_get_fd(const fdset_t *set, const unsigned idx); + +/*! + * \brief Returns number of file descriptors stored in set. + * + * \param set Target set. + * + * \retval Number of descriptors stored + */ +unsigned fdset_get_length(const fdset_t *set); /*! * \brief Sweep file descriptors with exceeding inactivity period. @@ -110,4 +162,69 @@ int fdset_set_watchdog(fdset_t* set, int i, int interval); * \retval number of sweeped descriptors. * \retval -1 on errors. */ -int fdset_sweep(fdset_t* set, fdset_sweep_cb_t cb, void *data); +int fdset_sweep(fdset_t* set, const fdset_sweep_cb_t cb, void *data); + +/*! + * \brief Move iterator on next received event. + * + * \param it Target iterator. + */ +void fdset_it_next(fdset_it_t *it); + +/*! + * \brief Decide if there is more received events. + * + * \param it Target iterator. + * + * \retval Logical flag represents 'done' state. + */ +int fdset_it_done(const fdset_it_t *it); + +/*! + * \brief Remove file descriptor referenced by iterator from watched set. + * + * \param it Target iterator. + * + * \retval 0 if successful. + * \retval ret < 0 on error. + */ +int fdset_it_remove(fdset_it_t *it); + +/*! + * \brief Get file descriptor of event referenced by iterator. + * + * \param it Target iterator. + * + * \retval ret >= 0 for file descriptor. + * \retval ret < 0 on errors. + */ +int fdset_it_get_fd(const fdset_it_t *it); + +/*! + * \brief Get index of event in set referenced by iterator. + * + * \param it Target iterator. + * + * \retval Index of event. + */ +unsigned fdset_it_get_idx(const fdset_it_t *it); + +/*! + * \brief Decide if event referenced by iterator is POLLIN event. + * + * \param it Target iterator. + * + * \retval Logical flag represents 'POLLIN' event received. + */ +int fdset_it_ev_is_pollin(const fdset_it_t *it); + +/*! + * \brief Decide if event referenced by iterator is error event. + * + * \param it Target iterator. + * + * \retval Logical flag represents error event received. + */ +int fdset_it_ev_is_err(const fdset_it_t *it); + +#endif diff --git a/src/knot/server/tcp-handler.c b/src/knot/server/tcp-handler.c index 69cf347b1..7d17e79ba 100644 --- a/src/knot/server/tcp-handler.c +++ b/src/knot/server/tcp-handler.c @@ -32,6 +32,7 @@ #include "knot/server/server.h" #include "knot/server/tcp-handler.h" #include "knot/common/log.h" +#include "knot/common/apoll.h" #include "knot/nameserver/process_query.h" #include "knot/query/layer.h" #include "contrib/macros.h" @@ -49,7 +50,7 @@ typedef struct tcp_context { unsigned client_threshold; /*!< Index of first TCP client. */ struct timespec last_poll_time; /*!< Time of the last socket poll. */ bool is_throttled; /*!< TCP connections throttling switch. */ - fdset_t set; /*!< Set of server/client sockets. */ + apoll_t set; /*!< Set of server/client sockets. */ unsigned thread_id; /*!< Thread identifier. */ unsigned max_worker_fds; /*!< Max TCP clients per worker configuration + no. of ifaces. */ int idle_timeout; /*!< [s] TCP idle timeout configuration. */ @@ -75,11 +76,10 @@ static void update_tcp_conf(tcp_context_t *tcp) } /*! \brief Sweep TCP connection. */ -static enum fdset_sweep_state tcp_sweep(fdset_t *set, int i, void *data) +static apoll_sweep_state tcp_sweep(apoll_t *set, int fd, void *data) { UNUSED(data); - assert(set && i < set->n && i >= 0); - int fd = set->pfd[i].fd; + assert(set && fd >= 0); /* Best-effort, name and shame. */ struct sockaddr_storage ss; @@ -92,7 +92,7 @@ static enum fdset_sweep_state tcp_sweep(fdset_t *set, int i, void *data) close(fd); - return FDSET_SWEEP; + return APOLL_CTX_SWEEP; } static bool tcp_active_state(int state) @@ -117,13 +117,12 @@ static void tcp_log_error(struct sockaddr_storage *ss, const char *operation, in } static unsigned tcp_set_ifaces(const iface_t *ifaces, size_t n_ifaces, - fdset_t *fds, int thread_id) + apoll_t *fds, int thread_id) { if (n_ifaces == 0) { return 0; } - fdset_clear(fds); for (const iface_t *i = ifaces; i != ifaces + n_ifaces; i++) { if (i->fd_tcp_count == 0) { // Ignore XDP interface. assert(i->fd_xdp_count > 0); @@ -140,10 +139,10 @@ static unsigned tcp_set_ifaces(const iface_t *ifaces, size_t n_ifaces, tcp_id = thread_id - i->fd_udp_count; } #endif - fdset_add(fds, i->fd_tcp[tcp_id], POLLIN, NULL); + apoll_add(fds, i->fd_tcp[tcp_id], APOLL_POLLIN, NULL); } - return fds->n; + return apoll_get_length(fds); } static int tcp_handle(tcp_context_t *tcp, int fd, struct iovec *rx, struct iovec *tx) @@ -216,28 +215,28 @@ static int tcp_handle(tcp_context_t *tcp, int fd, struct iovec *rx, struct iovec static void tcp_event_accept(tcp_context_t *tcp, unsigned i) { /* Accept client. */ - int fd = tcp->set.pfd[i].fd; + int fd = apoll_get_fd((&tcp->set), i); int client = net_accept(fd, NULL); if (client >= 0) { /* Assign to fdset. */ - int next_id = fdset_add(&tcp->set, client, POLLIN, NULL); + int next_id = apoll_add(&tcp->set, client, APOLL_POLLIN, NULL); if (next_id < 0) { close(client); return; } /* Update watchdog timer. */ - fdset_set_watchdog(&tcp->set, next_id, tcp->idle_timeout); + apoll_set_watchdog(&tcp->set, next_id, tcp->idle_timeout); } } static int tcp_event_serve(tcp_context_t *tcp, unsigned i) { - int fd = tcp->set.pfd[i].fd; + int fd = apoll_get_fd((&tcp->set), i); int ret = tcp_handle(tcp, fd, &tcp->iov[0], &tcp->iov[1]); if (ret == KNOT_EOK) { /* Update socket activity timer. */ - fdset_set_watchdog(&tcp->set, i, tcp->idle_timeout); + apoll_set_watchdog(&tcp->set, i, tcp->idle_timeout); } return ret; @@ -245,48 +244,46 @@ static int tcp_event_serve(tcp_context_t *tcp, unsigned i) static void tcp_wait_for_events(tcp_context_t *tcp) { - fdset_t *set = &tcp->set; + apoll_t *set = &tcp->set; /* Check if throttled with many open TCP connections. */ - assert(set->n <= tcp->max_worker_fds); - tcp->is_throttled = set->n == tcp->max_worker_fds; + assert(apoll_get_length(set) <= tcp->max_worker_fds); + tcp->is_throttled = apoll_get_length(set) == tcp->max_worker_fds; /* If throttled, temporarily ignore new TCP connections. */ unsigned i = tcp->is_throttled ? tcp->client_threshold : 0; /* Wait for events. */ - int nfds = poll(&(set->pfd[i]), set->n - i, TCP_SWEEP_INTERVAL * 1000); + apoll_it_t it; + apoll_poll(set, &it, i, TCP_SWEEP_INTERVAL); /* Mark the time of last poll call. */ tcp->last_poll_time = time_now(); /* Process events. */ - while (nfds > 0 && i < set->n) { + for(; !apoll_it_done(&it); apoll_it_next(&it)) { bool should_close = false; - if (set->pfd[i].revents & (POLLERR|POLLHUP|POLLNVAL)) { - should_close = (i >= tcp->client_threshold); - --nfds; - } else if (set->pfd[i].revents & (POLLIN)) { + unsigned int idx = apoll_it_get_idx(&it); + if (apoll_it_ev_is_error(&it)) { + should_close = (idx >= tcp->client_threshold); + } else if (apoll_it_ev_is_pollin(&it)) { /* Master sockets - new connection to accept. */ - if (i < tcp->client_threshold) { + if (idx < tcp->client_threshold) { /* Don't accept more clients than configured. */ - if (set->n < tcp->max_worker_fds) { - tcp_event_accept(tcp, i); + if (apoll_get_length(set) < tcp->max_worker_fds) { + tcp_event_accept(tcp, idx); } /* Client sockets - already accepted connection or closed connection :-( */ - } else if (tcp_event_serve(tcp, i) != KNOT_EOK) { + } else if (tcp_event_serve(tcp, idx) != KNOT_EOK) { should_close = true; } - --nfds; } /* Evaluate. */ if (should_close) { - close(set->pfd[i].fd); - fdset_remove(set, i); - } else { - ++i; + close(apoll_get_fd(set, idx)); + apoll_it_remove(&it); } } } @@ -325,9 +322,6 @@ int tcp_master(dthread_t *thread) }; knot_layer_init(&tcp.layer, &mm, process_query_layer()); - /* Prepare initial buffer for listening and bound sockets. */ - fdset_init(&tcp.set, FDSET_INIT_SIZE); - /* Create iovec abstraction. */ for (unsigned i = 0; i < 2; ++i) { tcp.iov[i].iov_len = KNOT_WIRE_MAX_PKTSIZE; @@ -343,6 +337,9 @@ int tcp_master(dthread_t *thread) update_sweep_timer(&next_sweep); update_tcp_conf(&tcp); + /* Prepare initial buffer for listening and bound sockets. */ + apoll_init(&tcp.set, APOLL_CTX_INIT_SIZE); + /* Set descriptors for the configured interfaces. */ tcp.client_threshold = tcp_set_ifaces(handler->server->ifaces, handler->server->n_ifaces, @@ -362,7 +359,7 @@ int tcp_master(dthread_t *thread) /* Sweep inactive clients and refresh TCP configuration. */ if (tcp.last_poll_time.tv_sec >= next_sweep.tv_sec) { - fdset_sweep(&tcp.set, &tcp_sweep, NULL); + apoll_sweep(&tcp.set, &tcp_sweep, NULL); update_sweep_timer(&next_sweep); update_tcp_conf(&tcp); } @@ -372,7 +369,8 @@ finish: free(tcp.iov[0].iov_base); free(tcp.iov[1].iov_base); mp_delete(mm.ctx); - fdset_clear(&tcp.set); + apoll_close(&tcp.set); + apoll_clear(&tcp.set); return ret; } diff --git a/src/knot/server/udp-handler.c b/src/knot/server/udp-handler.c index 58b6a97a9..4d0cc7b38 100644 --- a/src/knot/server/udp-handler.c +++ b/src/knot/server/udp-handler.c @@ -34,6 +34,7 @@ #include "contrib/mempattern.h" #include "contrib/sockaddr.h" #include "contrib/ucw/mempool.h" +#include "knot/common/apoll.h" #include "knot/nameserver/process_query.h" #include "knot/query/layer.h" #include "knot/server/server.h" @@ -498,7 +499,7 @@ static int iface_udp_fd(const iface_t *iface, int thread_id, bool xdp_thread, } } -static unsigned udp_set_ifaces(const iface_t *ifaces, size_t n_ifaces, struct pollfd *fds, +static unsigned udp_set_ifaces(const iface_t *ifaces, size_t n_ifaces, apoll_t *fds, int thread_id, void **xdp_socket) { if (n_ifaces == 0) { @@ -515,9 +516,7 @@ static unsigned udp_set_ifaces(const iface_t *ifaces, size_t n_ifaces, struct po if (fd < 0) { continue; } - fds[count].fd = fd; - fds[count].events = POLLIN; - fds[count].revents = 0; + apoll_add(fds, fd, APOLL_POLLIN, NULL); count++; } @@ -576,10 +575,11 @@ int udp_master(dthread_t *thread) /* Allocate descriptors for the configured interfaces. */ void *xdp_socket = NULL; size_t nifs = handler->server->n_ifaces; - struct pollfd fds[nifs]; - unsigned nfds = udp_set_ifaces(handler->server->ifaces, nifs, fds, - thread_id, &xdp_socket); - if (nfds == 0) { + apoll_t fds; + apoll_init(&fds, nifs); + unsigned fds_count = udp_set_ifaces(handler->server->ifaces, nifs, &fds, + thread_id, &xdp_socket); + if (fds_count == 0) { goto finish; } @@ -591,8 +591,9 @@ int udp_master(dthread_t *thread) } /* Wait for events. */ - int events = poll(fds, nfds, -1); - if (events <= 0) { + apoll_it_t it; + int ret = apoll_poll(&fds, &it, 0, -1); + if (ret <= 0) { if (errno == EINTR || errno == EAGAIN) { continue; } @@ -600,12 +601,8 @@ int udp_master(dthread_t *thread) } /* Process the events. */ - for (unsigned i = 0; i < nfds && events > 0; i++) { - if (fds[i].revents == 0) { - continue; - } - events -= 1; - if (api->udp_recv(fds[i].fd, rq, xdp_socket) > 0) { + for(; !apoll_it_done(&it); apoll_it_next(&it)) { + if (api->udp_recv(apoll_it_get_fd(&it), rq, xdp_socket) > 0) { api->udp_handle(&udp, rq, xdp_socket); api->udp_send(rq, xdp_socket); } @@ -615,6 +612,8 @@ int udp_master(dthread_t *thread) finish: api->udp_deinit(rq); mp_delete(mm.ctx); + apoll_close(&fds); + apoll_clear(&fds); return KNOT_EOK; } diff --git a/src/utils/knotd/main.c b/src/utils/knotd/main.c index c89ed96d1..0cc5c072e 100644 --- a/src/utils/knotd/main.c +++ b/src/utils/knotd/main.c @@ -22,6 +22,7 @@ #include <stdlib.h> #include <unistd.h> #include <getopt.h> +#include <signal.h> #include <sys/stat.h> #include <urcu.h> |