Diffstat (limited to 'kernel/bpf')
-rw-r--r--   kernel/bpf/bpf_inode_storage.c  |   1
-rw-r--r--   kernel/bpf/bpf_lsm.c            |   4
-rw-r--r--   kernel/bpf/bpf_task_storage.c   |   1
-rw-r--r--   kernel/bpf/btf.c                |  15
-rw-r--r--   kernel/bpf/cgroup.c             |  19
-rw-r--r--   kernel/bpf/core.c               |   4
-rw-r--r--   kernel/bpf/devmap.c             |  11
-rw-r--r--   kernel/bpf/helpers.c            |  64
-rw-r--r--   kernel/bpf/inode.c              |   5
-rw-r--r--   kernel/bpf/log.c                |   3
-rw-r--r--   kernel/bpf/lpm_trie.c           |   2
-rw-r--r--   kernel/bpf/memalloc.c           |  14
-rw-r--r--   kernel/bpf/ringbuf.c            |  14
-rw-r--r--   kernel/bpf/syscall.c            |  45
-rw-r--r--   kernel/bpf/task_iter.c          |   8
-rw-r--r--   kernel/bpf/token.c              |   1
-rw-r--r--   kernel/bpf/verifier.c           | 134
17 files changed, 214 insertions, 131 deletions
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 29da6d3838f6..e16e79f8cd6d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -16,7 +16,6 @@ #include <uapi/linux/btf.h> #include <linux/bpf_lsm.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(inode_cache); diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 6292ac5f9bd1..3bc61628ab25 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -339,10 +339,6 @@ BTF_ID(func, bpf_lsm_path_chmod) BTF_ID(func, bpf_lsm_path_chown) #endif /* CONFIG_SECURITY_PATH */ -#ifdef CONFIG_KEYS -BTF_ID(func, bpf_lsm_key_free) -#endif /* CONFIG_KEYS */ - BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index adf6dfe0ba68..1eb9852a9f8e 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -16,7 +16,6 @@ #include <linux/filter.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> -#include <linux/fdtable.h> #include <linux/rcupdate_trace.h> DEFINE_BPF_STORAGE_CACHE(task_cache); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 75e4fe83c509..5cd1c7a23848 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3523,7 +3523,7 @@ end: * (i + 1) * elem_size * where i is the repeat index and elem_size is the size of an element. */ -static int btf_repeat_fields(struct btf_field_info *info, +static int btf_repeat_fields(struct btf_field_info *info, int info_cnt, u32 field_cnt, u32 repeat_cnt, u32 elem_size) { u32 i, j; @@ -3543,6 +3543,12 @@ static int btf_repeat_fields(struct btf_field_info *info, } } + /* The type of struct size or variable size is u32, + * so the multiplication will not overflow. + */ + if (field_cnt * (repeat_cnt + 1) > info_cnt) + return -E2BIG; + cur = field_cnt; for (i = 0; i < repeat_cnt; i++) { memcpy(&info[cur], &info[0], field_cnt * sizeof(info[0])); @@ -3587,7 +3593,7 @@ static int btf_find_nested_struct(const struct btf *btf, const struct btf_type * info[i].off += off; if (nelems > 1) { - err = btf_repeat_fields(info, ret, nelems - 1, t->size); + err = btf_repeat_fields(info, info_cnt, ret, nelems - 1, t->size); if (err == 0) ret *= nelems; else @@ -3681,10 +3687,10 @@ static int btf_find_field_one(const struct btf *btf, if (ret == BTF_FIELD_IGNORE) return 0; - if (nelems > info_cnt) + if (!info_cnt) return -E2BIG; if (nelems > 1) { - ret = btf_repeat_fields(info, 1, nelems - 1, sz); + ret = btf_repeat_fields(info, info_cnt, 1, nelems - 1, sz); if (ret < 0) return ret; } @@ -8961,6 +8967,7 @@ int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo, if (!type) { bpf_log(ctx->log, "relo #%u: bad type id %u\n", relo_idx, relo->type_id); + kfree(specs); return -EINVAL; } diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index e7113d700b87..025d7e2214ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -24,6 +24,23 @@ DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_CGROUP_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); +/* + * cgroup bpf destruction makes heavy use of work items and there can be a lot + * of concurrent destructions. Use a separate workqueue so that cgroup bpf + * destruction work items don't end up filling up max_active of system_wq + * which may lead to deadlock. 
+ */ +static struct workqueue_struct *cgroup_bpf_destroy_wq; + +static int __init cgroup_bpf_wq_init(void) +{ + cgroup_bpf_destroy_wq = alloc_workqueue("cgroup_bpf_destroy", 0, 1); + if (!cgroup_bpf_destroy_wq) + panic("Failed to alloc workqueue for cgroup bpf destroy.\n"); + return 0; +} +core_initcall(cgroup_bpf_wq_init); + /* __always_inline is necessary to prevent indirect call through run_prog * function pointer. */ @@ -334,7 +351,7 @@ static void cgroup_bpf_release_fn(struct percpu_ref *ref) struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt); INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release); - queue_work(system_wq, &cgrp->bpf.release_work); + queue_work(cgroup_bpf_destroy_wq, &cgrp->bpf.release_work); } /* Get underlying bpf_prog of bpf_prog_list entry, regardless if it's through diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4e07cc057d6f..e303626bdb2f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -21,7 +21,7 @@ #include <linux/filter.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> -#include <linux/random.h> +#include <linux/prandom.h> #include <linux/bpf.h> #include <linux/btf.h> #include <linux/objtool.h> @@ -40,7 +40,7 @@ #include <linux/execmem.h> #include <asm/barrier.h> -#include <asm/unaligned.h> +#include <linux/unaligned.h> /* Registers */ #define BPF_R0 regs[BPF_REG_0] diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 9e0e3b0a18e4..7878be18e9d2 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -333,9 +333,11 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, struct xdp_frame **frames, int n, - struct net_device *dev) + struct net_device *tx_dev, + struct net_device *rx_dev) { - struct xdp_txq_info txq = { .dev = dev }; + struct xdp_txq_info txq = { .dev = tx_dev }; + struct xdp_rxq_info rxq = { .dev = rx_dev }; struct xdp_buff xdp; int i, nframes = 0; @@ -346,6 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, xdp_convert_frame_to_buff(xdpf, &xdp); xdp.txq = &txq; + xdp.rxq = &rxq; act = bpf_prog_run_xdp(xdp_prog, &xdp); switch (act) { @@ -360,7 +363,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog, bpf_warn_invalid_xdp_action(NULL, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(dev, xdp_prog, act); + trace_xdp_exception(tx_dev, xdp_prog, act); fallthrough; case XDP_DROP: xdp_return_frame_rx_napi(xdpf); @@ -388,7 +391,7 @@ static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) } if (bq->xdp_prog) { - to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev); + to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev, bq->dev_rx); if (!to_send) goto out; } diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 1a43d06eab28..3d45ebe8afb4 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -111,7 +111,7 @@ const struct bpf_func_proto bpf_map_pop_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) @@ -124,7 +124,7 @@ const struct bpf_func_proto bpf_map_peek_elem_proto = { .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, - .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT, + .arg2_type = ARG_PTR_TO_MAP_VALUE | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_3(bpf_map_lookup_percpu_elem, struct bpf_map *, 
map, void *, key, u32, cpu) @@ -538,7 +538,7 @@ const struct bpf_func_proto bpf_strtol_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(s64), }; @@ -566,7 +566,7 @@ const struct bpf_func_proto bpf_strtoul_proto = { .arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY, .arg2_type = ARG_CONST_SIZE, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = sizeof(u64), }; @@ -1742,7 +1742,7 @@ static const struct bpf_func_proto bpf_dynptr_from_mem_proto = { .arg1_type = ARG_PTR_TO_UNINIT_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_LOCAL | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, const struct bpf_dynptr_kern *, src, @@ -2851,21 +2851,47 @@ struct bpf_iter_bits { __u64 __opaque[2]; } __aligned(8); +#define BITS_ITER_NR_WORDS_MAX 511 + struct bpf_iter_bits_kern { union { - unsigned long *bits; - unsigned long bits_copy; + __u64 *bits; + __u64 bits_copy; }; - u32 nr_bits; + int nr_bits; int bit; } __aligned(8); +/* On 64-bit hosts, unsigned long and u64 have the same size, so passing + * a u64 pointer and an unsigned long pointer to find_next_bit() will + * return the same result, as both point to the same 8-byte area. + * + * For 32-bit little-endian hosts, using a u64 pointer or unsigned long + * pointer also makes no difference. This is because the first iterated + * unsigned long is composed of bits 0-31 of the u64 and the second unsigned + * long is composed of bits 32-63 of the u64. + * + * However, for 32-bit big-endian hosts, this is not the case. The first + * iterated unsigned long will be bits 32-63 of the u64, so swap these two + * ulong values within the u64. + */ +static void swap_ulong_in_u64(u64 *bits, unsigned int nr) +{ +#if (BITS_PER_LONG == 32) && defined(__BIG_ENDIAN) + unsigned int i; + + for (i = 0; i < nr; i++) + bits[i] = (bits[i] >> 32) | ((u64)(u32)bits[i] << 32); +#endif +} + /** * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area * @it: The new bpf_iter_bits to be created * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over * @nr_words: The size of the specified memory area, measured in 8-byte units. - * Due to the limitation of memalloc, it can't be greater than 512. + * The maximum value of @nr_words is @BITS_ITER_NR_WORDS_MAX. This limit may be + * further reduced by the BPF memory allocator implementation. * * This function initializes a new bpf_iter_bits structure for iterating over * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. 
It @@ -2892,6 +2918,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (!unsafe_ptr__ign || !nr_words) return -EINVAL; + if (nr_words > BITS_ITER_NR_WORDS_MAX) + return -E2BIG; /* Optimization for u64 mask */ if (nr_bits == 64) { @@ -2899,10 +2927,15 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w if (err) return -EFAULT; + swap_ulong_in_u64(&kit->bits_copy, nr_words); + kit->nr_bits = nr_bits; return 0; } + if (bpf_mem_alloc_check_size(false, nr_bytes)) + return -E2BIG; + /* Fallback to memalloc */ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); if (!kit->bits) @@ -2914,6 +2947,8 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w return err; } + swap_ulong_in_u64(kit->bits, nr_words); + kit->nr_bits = nr_bits; return 0; } @@ -2930,17 +2965,16 @@ bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_w __bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) { struct bpf_iter_bits_kern *kit = (void *)it; - u32 nr_bits = kit->nr_bits; - const unsigned long *bits; - int bit; + int bit = kit->bit, nr_bits = kit->nr_bits; + const void *bits; - if (nr_bits == 0) + if (!nr_bits || bit >= nr_bits) return NULL; bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; - bit = find_next_bit(bits, nr_bits, kit->bit + 1); + bit = find_next_bit(bits, nr_bits, bit + 1); if (bit >= nr_bits) { - kit->nr_bits = 0; + kit->bit = bit; return NULL; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index d8fc5eba529d..9aaf5124648b 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -880,7 +880,7 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) const struct btf_type *enum_t; const char *enum_pfx; u64 *delegate_msk, msk = 0; - char *p; + char *p, *str; int val; /* ignore errors, fallback to hex */ @@ -911,7 +911,8 @@ static int bpf_parse_param(struct fs_context *fc, struct fs_parameter *param) return -EINVAL; } - while ((p = strsep(¶m->string, ":"))) { + str = param->string; + while ((p = strsep(&str, ":"))) { if (strcmp(p, "any") == 0) { msk |= ~0ULL; } else if (find_btf_enum_const(info.btf, enum_t, enum_pfx, p, &val)) { diff --git a/kernel/bpf/log.c b/kernel/bpf/log.c index 5aebfc3051e3..4a858fdb6476 100644 --- a/kernel/bpf/log.c +++ b/kernel/bpf/log.c @@ -688,8 +688,7 @@ static void print_reg_state(struct bpf_verifier_env *env, if (t == SCALAR_VALUE && reg->precise) verbose(env, "P"); if (t == SCALAR_VALUE && tnum_is_const(reg->var_off)) { - /* reg->off should be 0 for SCALAR_VALUE */ - verbose_snum(env, reg->var_off.value + reg->off); + verbose_snum(env, reg->var_off.value); return; } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 0218a5132ab5..9b60eda0f727 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -655,7 +655,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - node_stack = kmalloc_array(trie->max_prefixlen, + node_stack = kmalloc_array(trie->max_prefixlen + 1, sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) diff --git a/kernel/bpf/memalloc.c b/kernel/bpf/memalloc.c index b3858a76e0b3..146f5b57cfb1 100644 --- a/kernel/bpf/memalloc.c +++ b/kernel/bpf/memalloc.c @@ -35,6 +35,8 @@ */ #define LLIST_NODE_SZ sizeof(struct llist_node) +#define BPF_MEM_ALLOC_SIZE_MAX 4096 + /* similar to kmalloc, but sizeof == 8 bucket is gone */ static u8 size_index[24] 
__ro_after_init = { 3, /* 8 */ @@ -65,7 +67,7 @@ static u8 size_index[24] __ro_after_init = { static int bpf_mem_cache_idx(size_t size) { - if (!size || size > 4096) + if (!size || size > BPF_MEM_ALLOC_SIZE_MAX) return -1; if (size <= 192) @@ -1005,3 +1007,13 @@ void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags) return !ret ? NULL : ret + LLIST_NODE_SZ; } + +int bpf_mem_alloc_check_size(bool percpu, size_t size) +{ + /* The size of percpu allocation doesn't have LLIST_NODE_SZ overhead */ + if ((percpu && size > BPF_MEM_ALLOC_SIZE_MAX) || + (!percpu && size > BPF_MEM_ALLOC_SIZE_MAX - LLIST_NODE_SZ)) + return -E2BIG; + + return 0; +} diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index e20b90c36131..e1cfe890e0be 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -29,7 +29,7 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; - spinlock_t spinlock ____cacheline_aligned_in_smp; + raw_spinlock_t spinlock ____cacheline_aligned_in_smp; /* For user-space producer ring buffers, an atomic_t busy bit is used * to synchronize access to the ring buffers in the kernel, rather than * the spinlock that is used for kernel-producer ring buffers. This is @@ -173,7 +173,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) if (!rb) return NULL; - spin_lock_init(&rb->spinlock); + raw_spin_lock_init(&rb->spinlock); atomic_set(&rb->busy, 0); init_waitqueue_head(&rb->waitq); init_irq_work(&rb->work, bpf_ringbuf_notify); @@ -421,10 +421,10 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { - if (!spin_trylock_irqsave(&rb->spinlock, flags)) + if (!raw_spin_trylock_irqsave(&rb->spinlock, flags)) return NULL; } else { - spin_lock_irqsave(&rb->spinlock, flags); + raw_spin_lock_irqsave(&rb->spinlock, flags); } pend_pos = rb->pending_pos; @@ -450,7 +450,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) */ if (new_prod_pos - cons_pos > rb->mask || new_prod_pos - pend_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return NULL; } @@ -462,7 +462,7 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) /* pairs with consumer's smp_load_acquire() */ smp_store_release(&rb->producer_pos, new_prod_pos); - spin_unlock_irqrestore(&rb->spinlock, flags); + raw_spin_unlock_irqrestore(&rb->spinlock, flags); return (void *)hdr + BPF_RINGBUF_HDR_SZ; } @@ -632,7 +632,7 @@ const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT, + .arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT | MEM_WRITE, }; BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a8f1808a1ca5..c5aa127ed4cc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3069,13 +3069,17 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_link *link = filp->private_data; const struct bpf_prog *prog = link->prog; + enum bpf_link_type type = link->type; char prog_tag[sizeof(prog->tag) * 2 + 1] = { }; - seq_printf(m, - "link_type:\t%s\n" - "link_id:\t%u\n", - bpf_link_type_strs[link->type], - link->id); + if (type < ARRAY_SIZE(bpf_link_type_strs) && bpf_link_type_strs[type]) { + 
seq_printf(m, "link_type:\t%s\n", bpf_link_type_strs[type]); + } else { + WARN_ONCE(1, "missing BPF_LINK_TYPE(...) for link type %u\n", type); + seq_printf(m, "link_type:\t<%u>\n", type); + } + seq_printf(m, "link_id:\t%u\n", link->id); + if (prog) { bin2hex(prog_tag, prog->tag, sizeof(prog->tag)); seq_printf(m, @@ -3565,15 +3569,16 @@ static void bpf_perf_link_dealloc(struct bpf_link *link) } static int bpf_perf_link_fill_common(const struct perf_event *event, - char __user *uname, u32 ulen, + char __user *uname, u32 *ulenp, u64 *probe_offset, u64 *probe_addr, u32 *fd_type, unsigned long *missed) { const char *buf; - u32 prog_id; + u32 prog_id, ulen; size_t len; int err; + ulen = *ulenp; if (!ulen ^ !uname) return -EINVAL; @@ -3581,10 +3586,17 @@ static int bpf_perf_link_fill_common(const struct perf_event *event, probe_offset, probe_addr, missed); if (err) return err; + + if (buf) { + len = strlen(buf); + *ulenp = len + 1; + } else { + *ulenp = 1; + } if (!uname) return 0; + if (buf) { - len = strlen(buf); err = bpf_copy_to_user(uname, buf, ulen, len); if (err) return err; @@ -3609,7 +3621,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.kprobe.func_name); ulen = info->perf_event.kprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, &missed); if (err) return err; @@ -3617,7 +3629,7 @@ static int bpf_perf_link_fill_kprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_KRETPROBE; else info->perf_event.type = BPF_PERF_EVENT_KPROBE; - + info->perf_event.kprobe.name_len = ulen; info->perf_event.kprobe.offset = offset; info->perf_event.kprobe.missed = missed; if (!kallsyms_show_value(current_cred())) @@ -3639,7 +3651,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, uname = u64_to_user_ptr(info->perf_event.uprobe.file_name); ulen = info->perf_event.uprobe.name_len; - err = bpf_perf_link_fill_common(event, uname, ulen, &offset, &addr, + err = bpf_perf_link_fill_common(event, uname, &ulen, &offset, &addr, &type, NULL); if (err) return err; @@ -3648,6 +3660,7 @@ static int bpf_perf_link_fill_uprobe(const struct perf_event *event, info->perf_event.type = BPF_PERF_EVENT_URETPROBE; else info->perf_event.type = BPF_PERF_EVENT_UPROBE; + info->perf_event.uprobe.name_len = ulen; info->perf_event.uprobe.offset = offset; info->perf_event.uprobe.cookie = event->bpf_cookie; return 0; @@ -3673,12 +3686,18 @@ static int bpf_perf_link_fill_tracepoint(const struct perf_event *event, { char __user *uname; u32 ulen; + int err; uname = u64_to_user_ptr(info->perf_event.tracepoint.tp_name); ulen = info->perf_event.tracepoint.name_len; + err = bpf_perf_link_fill_common(event, uname, &ulen, NULL, NULL, NULL, NULL); + if (err) + return err; + info->perf_event.type = BPF_PERF_EVENT_TRACEPOINT; + info->perf_event.tracepoint.name_len = ulen; info->perf_event.tracepoint.cookie = event->bpf_cookie; - return bpf_perf_link_fill_common(event, uname, ulen, NULL, NULL, NULL, NULL); + return 0; } static int bpf_perf_link_fill_perf_event(const struct perf_event *event, @@ -5877,7 +5896,7 @@ static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = { .arg1_type = ARG_PTR_TO_MEM, .arg2_type = ARG_CONST_SIZE_OR_ZERO, .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_ALIGNED, + .arg4_type = ARG_PTR_TO_FIXED_SIZE_MEM | MEM_UNINIT | MEM_WRITE | MEM_ALIGNED, .arg4_size = 
sizeof(u64), }; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 02aa9db8d796..98d9b4c0daff 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -5,7 +5,6 @@ #include <linux/namei.h> #include <linux/pid_namespace.h> #include <linux/fs.h> -#include <linux/fdtable.h> #include <linux/filter.h> #include <linux/bpf_mem_alloc.h> #include <linux/btf_ids.h> @@ -99,7 +98,7 @@ static struct task_struct *task_seq_get_next(struct bpf_iter_seq_task_common *co rcu_read_lock(); pid = find_pid_ns(common->pid, common->ns); if (pid) { - task = get_pid_task(pid, PIDTYPE_TGID); + task = get_pid_task(pid, PIDTYPE_PID); *tid = common->pid; } rcu_read_unlock(); @@ -286,17 +285,14 @@ again: curr_fd = 0; } - rcu_read_lock(); - f = task_lookup_next_fdget_rcu(curr_task, &curr_fd); + f = fget_task_next(curr_task, &curr_fd); if (f) { /* set info->fd */ info->fd = curr_fd; - rcu_read_unlock(); return f; } /* the current task is done, go to the next task */ - rcu_read_unlock(); put_task_struct(curr_task); if (info->common.type == BPF_TASK_ITER_TID) { diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index dcbec1a0dfb3..26057aa13503 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -1,6 +1,5 @@ #include <linux/bpf.h> #include <linux/vmalloc.h> -#include <linux/fdtable.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/kernel.h> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a7ed527e47e..bb99bada7e2e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2750,10 +2750,16 @@ static struct btf *__find_kfunc_desc_btf(struct bpf_verifier_env *env, b->module = mod; b->offset = offset; + /* sort() reorders entries by value, so b may no longer point + * to the right entry after this + */ sort(tab->descs, tab->nr_descs, sizeof(tab->descs[0]), kfunc_btf_cmp_by_off, NULL); + } else { + btf = b->btf; } - return b->btf; + + return btf; } void bpf_free_kfunc_btf_tab(struct bpf_kfunc_btf_tab *tab) @@ -6333,10 +6339,10 @@ static void coerce_reg_to_size_sx(struct bpf_reg_state *reg, int size) /* both of s64_max/s64_min positive or negative */ if ((s64_max >= 0) == (s64_min >= 0)) { - reg->smin_value = reg->s32_min_value = s64_min; - reg->smax_value = reg->s32_max_value = s64_max; - reg->umin_value = reg->u32_min_value = s64_min; - reg->umax_value = reg->u32_max_value = s64_max; + reg->s32_min_value = reg->smin_value = s64_min; + reg->s32_max_value = reg->smax_value = s64_max; + reg->u32_min_value = reg->umin_value = s64_min; + reg->u32_max_value = reg->umax_value = s64_max; reg->var_off = tnum_range(s64_min, s64_max); return; } @@ -6798,20 +6804,10 @@ static int check_stack_slot_within_bounds(struct bpf_verifier_env *env, struct bpf_func_state *state, enum bpf_access_type t) { - struct bpf_insn_aux_data *aux = &env->insn_aux_data[env->insn_idx]; - int min_valid_off, max_bpf_stack; - - /* If accessing instruction is a spill/fill from bpf_fastcall pattern, - * add room for all caller saved registers below MAX_BPF_STACK. - * In case if bpf_fastcall rewrite won't happen maximal stack depth - * would be checked by check_max_stack_depth_subprog(). 
- */ - max_bpf_stack = MAX_BPF_STACK; - if (aux->fastcall_pattern) - max_bpf_stack += CALLER_SAVED_REGS * BPF_REG_SIZE; + int min_valid_off; if (t == BPF_WRITE || env->allow_uninit_stack) - min_valid_off = -max_bpf_stack; + min_valid_off = -MAX_BPF_STACK; else min_valid_off = -state->allocated_stack; @@ -7432,7 +7428,8 @@ mark: } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, + int access_size, enum bpf_access_type access_type, + bool zero_size_allowed, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; @@ -7444,7 +7441,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); case PTR_TO_MAP_KEY: - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7452,15 +7449,13 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return check_mem_region_access(env, regno, reg->off, access_size, reg->map_ptr->key_size, false); case PTR_TO_MAP_VALUE: - if (check_map_access_type(env, regno, reg->off, access_size, - meta && meta->raw_mode ? BPF_WRITE : - BPF_READ)) + if (check_map_access_type(env, regno, reg->off, access_size, access_type)) return -EACCES; return check_map_access(env, regno, reg->off, access_size, zero_size_allowed, ACCESS_HELPER); case PTR_TO_MEM: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7471,7 +7466,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, zero_size_allowed); case PTR_TO_BUF: if (type_is_rdonly_mem(reg->type)) { - if (meta && meta->raw_mode) { + if (access_type == BPF_WRITE) { verbose(env, "R%d cannot write into %s\n", regno, reg_type_str(env, reg->type)); return -EACCES; @@ -7499,7 +7494,6 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, * Dynamically check it now. */ if (!env->ops->convert_ctx_access) { - enum bpf_access_type atype = meta && meta->raw_mode ? BPF_WRITE : BPF_READ; int offset = access_size - 1; /* Allow zero-byte read from PTR_TO_CTX */ @@ -7507,7 +7501,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, return zero_size_allowed ? 0 : -EACCES; return check_mem_access(env, env->insn_idx, regno, offset, BPF_B, - atype, -1, false, false); + access_type, -1, false, false); } fallthrough; @@ -7532,6 +7526,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, */ static int check_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, u32 regno, + enum bpf_access_type access_type, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { @@ -7547,15 +7542,12 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, */ meta->msize_max_value = reg->umax_value; - /* The register is SCALAR_VALUE; the access check - * happens using its boundaries. + /* The register is SCALAR_VALUE; the access check happens using + * its boundaries. For unprivileged variable accesses, disable + * raw mode so that the program is required to initialize all + * the memory that the helper could just partially fill up. 
*/ if (!tnum_is_const(reg->var_off)) - /* For unprivileged variable accesses, disable raw - * mode so that the program is required to - * initialize all the memory that the helper could - * just partially fill up. - */ meta = NULL; if (reg->smin_value < 0) { @@ -7575,9 +7567,8 @@ static int check_mem_size_reg(struct bpf_verifier_env *env, regno); return -EACCES; } - err = check_helper_mem_access(env, regno - 1, - reg->umax_value, - zero_size_allowed, meta); + err = check_helper_mem_access(env, regno - 1, reg->umax_value, + access_type, zero_size_allowed, meta); if (!err) err = mark_chain_precision(env, regno); return err; @@ -7588,13 +7579,11 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg { bool may_be_null = type_may_be_null(reg->type); struct bpf_reg_state saved_reg; - struct bpf_call_arg_meta meta; int err; if (register_is_null(reg)) return 0; - memset(&meta, 0, sizeof(meta)); /* Assuming that the register contains a value check if the memory * access is safe. Temporarily save and restore the register's state as * the conversion shouldn't be visible to a caller. @@ -7604,10 +7593,8 @@ static int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg mark_ptr_not_null_reg(reg); } - err = check_helper_mem_access(env, regno, mem_size, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_helper_mem_access(env, regno, mem_size, true, &meta); + err = check_helper_mem_access(env, regno, mem_size, BPF_READ, true, NULL); + err = err ?: check_helper_mem_access(env, regno, mem_size, BPF_WRITE, true, NULL); if (may_be_null) *reg = saved_reg; @@ -7633,13 +7620,12 @@ static int check_kfunc_mem_size_reg(struct bpf_verifier_env *env, struct bpf_reg mark_ptr_not_null_reg(mem_reg); } - err = check_mem_size_reg(env, reg, regno, true, &meta); - /* Check access for BPF_WRITE */ - meta.raw_mode = true; - err = err ?: check_mem_size_reg(env, reg, regno, true, &meta); + err = check_mem_size_reg(env, reg, regno, BPF_READ, true, &meta); + err = err ?: check_mem_size_reg(env, reg, regno, BPF_WRITE, true, &meta); if (may_be_null) *mem_reg = saved_reg; + return err; } @@ -8942,9 +8928,8 @@ skip_type_check: verbose(env, "invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_helper_mem_access(env, regno, - meta->map_ptr->key_size, false, - NULL); + err = check_helper_mem_access(env, regno, meta->map_ptr->key_size, + BPF_READ, false, NULL); break; case ARG_PTR_TO_MAP_VALUE: if (type_may_be_null(arg_type) && register_is_null(reg)) @@ -8959,9 +8944,9 @@ skip_type_check: return -EACCES; } meta->raw_mode = arg_type & MEM_UNINIT; - err = check_helper_mem_access(env, regno, - meta->map_ptr->value_size, false, - meta); + err = check_helper_mem_access(env, regno, meta->map_ptr->value_size, + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); break; case ARG_PTR_TO_PERCPU_BTF_ID: if (!reg->btf_id) { @@ -9003,7 +8988,9 @@ skip_type_check: */ meta->raw_mode = arg_type & MEM_UNINIT; if (arg_type & MEM_FIXED_SIZE) { - err = check_helper_mem_access(env, regno, fn->arg_size[arg], false, meta); + err = check_helper_mem_access(env, regno, fn->arg_size[arg], + arg_type & MEM_WRITE ? BPF_WRITE : BPF_READ, + false, meta); if (err) return err; if (arg_type & MEM_ALIGNED) @@ -9011,10 +8998,16 @@ skip_type_check: } break; case ARG_CONST_SIZE: - err = check_mem_size_reg(env, reg, regno, false, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? 
+ BPF_WRITE : BPF_READ, + false, meta); break; case ARG_CONST_SIZE_OR_ZERO: - err = check_mem_size_reg(env, reg, regno, true, meta); + err = check_mem_size_reg(env, reg, regno, + fn->arg_type[arg - 1] & MEM_WRITE ? + BPF_WRITE : BPF_READ, + true, meta); break; case ARG_PTR_TO_DYNPTR: err = process_dynptr_func(env, regno, insn_idx, arg_type, 0); @@ -14264,12 +14257,13 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, * r1 += 0x1 * if r2 < 1000 goto ... * use r1 in memory access - * So remember constant delta between r2 and r1 and update r1 after - * 'if' condition. + * So for 64-bit alu remember constant delta between r2 and r1 and + * update r1 after 'if' condition. */ - if (env->bpf_capable && BPF_OP(insn->code) == BPF_ADD && - dst_reg->id && is_reg_const(src_reg, alu32)) { - u64 val = reg_const_value(src_reg, alu32); + if (env->bpf_capable && + BPF_OP(insn->code) == BPF_ADD && !alu32 && + dst_reg->id && is_reg_const(src_reg, false)) { + u64 val = reg_const_value(src_reg, false); if ((dst_reg->id & BPF_ADD_CONST) || /* prevent overflow in sync_linked_regs() later */ @@ -15326,8 +15320,12 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s continue; if ((!(reg->id & BPF_ADD_CONST) && !(known_reg->id & BPF_ADD_CONST)) || reg->off == known_reg->off) { + s32 saved_subreg_def = reg->subreg_def; + copy_register_state(reg, known_reg); + reg->subreg_def = saved_subreg_def; } else { + s32 saved_subreg_def = reg->subreg_def; s32 saved_off = reg->off; fake_reg.type = SCALAR_VALUE; @@ -15340,6 +15338,7 @@ static void sync_linked_regs(struct bpf_verifier_state *vstate, struct bpf_reg_s * otherwise another sync_linked_regs() will be incorrect. */ reg->off = saved_off; + reg->subreg_def = saved_subreg_def; scalar32_min_max_add(reg, &fake_reg); scalar_min_max_add(reg, &fake_reg); @@ -17877,9 +17876,11 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *sl, **pprev; struct bpf_verifier_state *cur = env->cur_state, *new, *loop_entry; int i, j, n, err, states_cnt = 0; - bool force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx); - bool add_new_state = force_new_state; - bool force_exact; + bool force_new_state, add_new_state, force_exact; + + force_new_state = env->test_state_freq || is_force_checkpoint(env, insn_idx) || + /* Avoid accumulating infinitely long jmp history */ + cur->jmp_history_cnt > 40; /* bpf progs typically have pruning point every 4 instructions * http://vger.kernel.org/bpfconf2019.html#session-1 @@ -17889,6 +17890,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * In tests that amounts to up to 50% reduction into total verifier * memory consumption and 20% verifier time speedup. 
*/ + add_new_state = force_new_state; if (env->jmps_processed - env->prev_jmps_processed >= 2 && env->insn_processed - env->prev_insn_processed >= 8) add_new_state = true; @@ -21201,7 +21203,7 @@ patch_map_ops_generic: delta += cnt - 1; env->prog = prog = new_prog; insn = new_prog->insnsi + i + delta; - continue; + goto next_insn; } /* Implement bpf_kptr_xchg inline */ @@ -22310,7 +22312,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 /* 'struct bpf_verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ - env = kzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); + env = kvzalloc(sizeof(struct bpf_verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; @@ -22546,6 +22548,6 @@ err_unlock: mutex_unlock(&bpf_verifier_lock); vfree(env->insn_aux_data); err_free_env: - kfree(env); + kvfree(env); return ret; } |
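For context, the helpers.c changes above tighten the bpf_iter_bits open-coded iterator: a hard BITS_ITER_NR_WORDS_MAX limit of 511 words, an explicit bpf_mem_alloc_check_size() check before falling back to the BPF memory allocator, and a 32-bit big-endian fix via swap_ulong_in_u64(). Below is a minimal, hypothetical BPF-side sketch of how such a bits iterator is typically consumed; it is not part of this patch. The program name, the iter/task section, and the use of task->cpus_ptr are illustrative assumptions, and it presumes a vmlinux.h/libbpf setup whose BTF exposes struct bpf_iter_bits and the related kfuncs.

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

/* Kfunc declarations; the signatures follow the kernel side shown in the
 * diff. bpf_iter_bits_destroy() is not visible in these hunks but is
 * assumed to exist as the usual *_destroy counterpart of an open-coded
 * iterator.
 */
extern int bpf_iter_bits_new(struct bpf_iter_bits *it,
			     const __u64 *unsafe_ptr__ign, __u32 nr_words) __ksym;
extern int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym;
extern void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym;

char LICENSE[] SEC("license") = "GPL";

SEC("iter/task")
int count_task_cpus(struct bpf_iter__task *ctx)
{
	struct task_struct *task = ctx->task;
	struct bpf_iter_bits it;
	int nr = 0, *bit;

	if (!task)
		return 0;

	/* nr_words is counted in 8-byte units and, per the patch, must not
	 * exceed BITS_ITER_NR_WORDS_MAX (511) or _new() fails with -E2BIG.
	 * The return value is deliberately not checked: a failed _new()
	 * leaves the iterator empty, so _next() returns NULL right away,
	 * and _destroy() has to run on every path anyway to satisfy the
	 * verifier's iterator tracking.
	 */
	bpf_iter_bits_new(&it, (const __u64 *)task->cpus_ptr,
			  sizeof(struct cpumask) / 8);
	while ((bit = bpf_iter_bits_next(&it)))
		nr++;
	bpf_iter_bits_destroy(&it);

	bpf_printk("pid %d can run on %d cpus", task->pid, nr);
	return 0;
}

The new()/next()/destroy() shape above is the standard open-coded iterator pattern; the only difference after this patch is that oversized nr_words values are rejected up front instead of being left to the allocator, and results are now consistent on 32-bit big-endian hosts.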