diff options
Diffstat (limited to 'kernel')
44 files changed, 856 insertions, 505 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 279ba5c420a4..561d96affe9f 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2728,8 +2728,8 @@ void __audit_ptrace(struct task_struct *t) context->target_auid = audit_get_loginuid(t); context->target_uid = task_uid(t); context->target_sessionid = audit_get_sessionid(t); - security_task_getlsmprop_obj(t, &context->target_ref); strscpy(context->target_comm, t->comm); + security_task_getlsmprop_obj(t, &context->target_ref); } /** @@ -2755,8 +2755,8 @@ int audit_signal_info_syscall(struct task_struct *t) ctx->target_auid = audit_get_loginuid(t); ctx->target_uid = t_uid; ctx->target_sessionid = audit_get_sessionid(t); - security_task_getlsmprop_obj(t, &ctx->target_ref); strscpy(ctx->target_comm, t->comm); + security_task_getlsmprop_obj(t, &ctx->target_ref); return 0; } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 9762bdddf1de..410028633621 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -53,3 +53,9 @@ obj-$(CONFIG_BPF_SYSCALL) += relo_core.o obj-$(CONFIG_BPF_SYSCALL) += btf_iter.o obj-$(CONFIG_BPF_SYSCALL) += btf_relocate.o obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o + +CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_queue_stack_maps.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_lpm_trie.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_ringbuf.o = $(CC_FLAGS_FTRACE) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 3bc61628ab25..967492b65185 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -375,8 +375,6 @@ BTF_ID(func, bpf_lsm_socket_socketpair) BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) -BTF_ID(func, bpf_lsm_current_getsecid_subj) -BTF_ID(func, bpf_lsm_task_getsecid_obj) BTF_ID(func, bpf_lsm_task_prctl) BTF_ID(func, bpf_lsm_task_setscheduler) BTF_ID(func, bpf_lsm_task_to_inode) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index e7a59e6462a9..e5a5f023cedd 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -6439,6 +6439,101 @@ int btf_ctx_arg_offset(const struct btf *btf, const struct btf_type *func_proto, return off; } +struct bpf_raw_tp_null_args { + const char *func; + u64 mask; +}; + +static const struct bpf_raw_tp_null_args raw_tp_null_args[] = { + /* sched */ + { "sched_pi_setprio", 0x10 }, + /* ... from sched_numa_pair_template event class */ + { "sched_stick_numa", 0x100 }, + { "sched_swap_numa", 0x100 }, + /* afs */ + { "afs_make_fs_call", 0x10 }, + { "afs_make_fs_calli", 0x10 }, + { "afs_make_fs_call1", 0x10 }, + { "afs_make_fs_call2", 0x10 }, + { "afs_protocol_error", 0x1 }, + { "afs_flock_ev", 0x10 }, + /* cachefiles */ + { "cachefiles_lookup", 0x1 | 0x200 }, + { "cachefiles_unlink", 0x1 }, + { "cachefiles_rename", 0x1 }, + { "cachefiles_prep_read", 0x1 }, + { "cachefiles_mark_active", 0x1 }, + { "cachefiles_mark_failed", 0x1 }, + { "cachefiles_mark_inactive", 0x1 }, + { "cachefiles_vfs_error", 0x1 }, + { "cachefiles_io_error", 0x1 }, + { "cachefiles_ondemand_open", 0x1 }, + { "cachefiles_ondemand_copen", 0x1 }, + { "cachefiles_ondemand_close", 0x1 }, + { "cachefiles_ondemand_read", 0x1 }, + { "cachefiles_ondemand_cread", 0x1 }, + { "cachefiles_ondemand_fd_write", 0x1 }, + { "cachefiles_ondemand_fd_release", 0x1 }, + /* ext4, from ext4__mballoc event class */ + { "ext4_mballoc_discard", 0x10 }, + { "ext4_mballoc_free", 0x10 }, + /* fib */ + { "fib_table_lookup", 0x100 }, + /* filelock */ + /* ... from filelock_lock event class */ + { "posix_lock_inode", 0x10 }, + { "fcntl_setlk", 0x10 }, + { "locks_remove_posix", 0x10 }, + { "flock_lock_inode", 0x10 }, + /* ... from filelock_lease event class */ + { "break_lease_noblock", 0x10 }, + { "break_lease_block", 0x10 }, + { "break_lease_unblock", 0x10 }, + { "generic_delete_lease", 0x10 }, + { "time_out_leases", 0x10 }, + /* host1x */ + { "host1x_cdma_push_gather", 0x10000 }, + /* huge_memory */ + { "mm_khugepaged_scan_pmd", 0x10 }, + { "mm_collapse_huge_page_isolate", 0x1 }, + { "mm_khugepaged_scan_file", 0x10 }, + { "mm_khugepaged_collapse_file", 0x10 }, + /* kmem */ + { "mm_page_alloc", 0x1 }, + { "mm_page_pcpu_drain", 0x1 }, + /* .. from mm_page event class */ + { "mm_page_alloc_zone_locked", 0x1 }, + /* netfs */ + { "netfs_failure", 0x10 }, + /* power */ + { "device_pm_callback_start", 0x10 }, + /* qdisc */ + { "qdisc_dequeue", 0x1000 }, + /* rxrpc */ + { "rxrpc_recvdata", 0x1 }, + { "rxrpc_resend", 0x10 }, + /* sunrpc */ + { "xs_stream_read_data", 0x1 }, + /* ... from xprt_cong_event event class */ + { "xprt_reserve_cong", 0x10 }, + { "xprt_release_cong", 0x10 }, + { "xprt_get_cong", 0x10 }, + { "xprt_put_cong", 0x10 }, + /* tcp */ + { "tcp_send_reset", 0x11 }, + /* tegra_apb_dma */ + { "tegra_dma_tx_status", 0x100 }, + /* timer_migration */ + { "tmigr_update_events", 0x1 }, + /* writeback, from writeback_folio_template event class */ + { "writeback_dirty_folio", 0x10 }, + { "folio_wait_writeback", 0x10 }, + /* rdma */ + { "mr_integ_alloc", 0x2000 }, + /* bpf_testmod */ + { "bpf_testmod_test_read", 0x0 }, +}; + bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) @@ -6449,6 +6544,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, const char *tname = prog->aux->attach_func_name; struct bpf_verifier_log *log = info->log; const struct btf_param *args; + bool ptr_err_raw_tp = false; const char *tag_value; u32 nr_args, arg; int i, ret; @@ -6543,6 +6639,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, return false; } + if (size != sizeof(u64)) { + bpf_log(log, "func '%s' size %d must be 8\n", + tname, size); + return false; + } + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; @@ -6588,12 +6690,42 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (prog_args_trusted(prog)) info->reg_type |= PTR_TRUSTED; - /* Raw tracepoint arguments always get marked as maybe NULL */ - if (bpf_prog_is_raw_tp(prog)) - info->reg_type |= PTR_MAYBE_NULL; - else if (btf_param_match_suffix(btf, &args[arg], "__nullable")) + if (btf_param_match_suffix(btf, &args[arg], "__nullable")) info->reg_type |= PTR_MAYBE_NULL; + if (prog->expected_attach_type == BPF_TRACE_RAW_TP) { + struct btf *btf = prog->aux->attach_btf; + const struct btf_type *t; + const char *tname; + + /* BTF lookups cannot fail, return false on error */ + t = btf_type_by_id(btf, prog->aux->attach_btf_id); + if (!t) + return false; + tname = btf_name_by_offset(btf, t->name_off); + if (!tname) + return false; + /* Checked by bpf_check_attach_target */ + tname += sizeof("btf_trace_") - 1; + for (i = 0; i < ARRAY_SIZE(raw_tp_null_args); i++) { + /* Is this a func with potential NULL args? */ + if (strcmp(tname, raw_tp_null_args[i].func)) + continue; + if (raw_tp_null_args[i].mask & (0x1 << (arg * 4))) + info->reg_type |= PTR_MAYBE_NULL; + /* Is the current arg IS_ERR? */ + if (raw_tp_null_args[i].mask & (0x2 << (arg * 4))) + ptr_err_raw_tp = true; + break; + } + /* If we don't know NULL-ness specification and the tracepoint + * is coming from a loadable module, be conservative and mark + * argument as PTR_MAYBE_NULL. + */ + if (i == ARRAY_SIZE(raw_tp_null_args) && btf_is_module(btf)) + info->reg_type |= PTR_MAYBE_NULL; + } + if (tgt_prog) { enum bpf_prog_type tgt_type; @@ -6638,6 +6770,15 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, bpf_log(log, "func '%s' arg%d has btf_id %d type %s '%s'\n", tname, arg, info->btf_id, btf_type_str(t), __btf_name_by_offset(btf, t->name_off)); + + /* Perform all checks on the validity of type for this argument, but if + * we know it can be IS_ERR at runtime, scrub pointer type and mark as + * scalar. + */ + if (ptr_err_raw_tp) { + bpf_log(log, "marking pointer arg%d as scalar as it may encode error", arg); + info->reg_type = SCALAR_VALUE; + } return true; } EXPORT_SYMBOL_GPL(btf_ctx_access); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a2327c4fdc8b..da729cbbaeb9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -539,6 +539,8 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) { + int err; + /* Branch offsets can't overflow when program is shrinking, no need * to call bpf_adj_branches(..., true) here */ @@ -546,7 +548,9 @@ int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt) sizeof(struct bpf_insn) * (prog->len - off - cnt)); prog->len -= cnt; - return WARN_ON_ONCE(bpf_adj_branches(prog, off, off + cnt, off, false)); + err = bpf_adj_branches(prog, off, off + cnt, off, false); + WARN_ON_ONCE(err); + return err; } static void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp) @@ -2936,7 +2940,7 @@ void __weak bpf_jit_compile(struct bpf_prog *prog) { } -bool __weak bpf_helper_changes_pkt_data(void *func) +bool __weak bpf_helper_changes_pkt_data(enum bpf_func_id func_id) { return false; } diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 7878be18e9d2..3aa002a47a96 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -184,7 +184,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) static void dev_map_free(struct bpf_map *map) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); - int i; + u32 i; /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, * so the programs (can be more than one that used this map) were @@ -821,7 +821,7 @@ static long dev_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; + u32 k = *(u32 *)key; if (k >= map->max_entries) return -EINVAL; @@ -838,7 +838,7 @@ static long dev_map_hash_delete_elem(struct bpf_map *map, void *key) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); struct bpf_dtab_netdev *old_dev; - int k = *(u32 *)key; + u32 k = *(u32 *)key; unsigned long flags; int ret = -ENOENT; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 9b60eda0f727..f8bc1e096182 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -15,6 +15,7 @@ #include <net/ipv6.h> #include <uapi/linux/btf.h> #include <linux/btf_ids.h> +#include <linux/bpf_mem_alloc.h> /* Intermediate node */ #define LPM_TREE_NODE_FLAG_IM BIT(0) @@ -22,7 +23,6 @@ struct lpm_trie_node; struct lpm_trie_node { - struct rcu_head rcu; struct lpm_trie_node __rcu *child[2]; u32 prefixlen; u32 flags; @@ -32,10 +32,11 @@ struct lpm_trie_node { struct lpm_trie { struct bpf_map map; struct lpm_trie_node __rcu *root; + struct bpf_mem_alloc ma; size_t n_entries; size_t max_prefixlen; size_t data_size; - spinlock_t lock; + raw_spinlock_t lock; }; /* This trie implements a longest prefix match algorithm that can be used to @@ -287,17 +288,18 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key) return found->data + trie->data_size; } -static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, - const void *value) +static struct lpm_trie_node *lpm_trie_node_alloc(struct lpm_trie *trie, + const void *value, + bool disable_migration) { struct lpm_trie_node *node; - size_t size = sizeof(struct lpm_trie_node) + trie->data_size; - if (value) - size += trie->map.value_size; + if (disable_migration) + migrate_disable(); + node = bpf_mem_cache_alloc(&trie->ma); + if (disable_migration) + migrate_enable(); - node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN, - trie->map.numa_node); if (!node) return NULL; @@ -310,12 +312,22 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, return node; } +static int trie_check_add_elem(struct lpm_trie *trie, u64 flags) +{ + if (flags == BPF_EXIST) + return -ENOENT; + if (trie->n_entries == trie->map.max_entries) + return -ENOSPC; + trie->n_entries++; + return 0; +} + /* Called from syscall or from eBPF program */ static long trie_update_elem(struct bpf_map *map, void *_key, void *value, u64 flags) { struct lpm_trie *trie = container_of(map, struct lpm_trie, map); - struct lpm_trie_node *node, *im_node = NULL, *new_node = NULL; + struct lpm_trie_node *node, *im_node, *new_node; struct lpm_trie_node *free_node = NULL; struct lpm_trie_node __rcu **slot; struct bpf_lpm_trie_key_u8 *key = _key; @@ -330,22 +342,14 @@ static long trie_update_elem(struct bpf_map *map, if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - spin_lock_irqsave(&trie->lock, irq_flags); - - /* Allocate and fill a new node */ - - if (trie->n_entries == trie->map.max_entries) { - ret = -ENOSPC; - goto out; - } - - new_node = lpm_trie_node_alloc(trie, value); - if (!new_node) { - ret = -ENOMEM; - goto out; - } + /* Allocate and fill a new node. Need to disable migration before + * invoking bpf_mem_cache_alloc(). + */ + new_node = lpm_trie_node_alloc(trie, value, true); + if (!new_node) + return -ENOMEM; - trie->n_entries++; + raw_spin_lock_irqsave(&trie->lock, irq_flags); new_node->prefixlen = key->prefixlen; RCU_INIT_POINTER(new_node->child[0], NULL); @@ -364,8 +368,7 @@ static long trie_update_elem(struct bpf_map *map, matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || - node->prefixlen == key->prefixlen || - node->prefixlen == trie->max_prefixlen) + node->prefixlen == key->prefixlen) break; next_bit = extract_bit(key->data, node->prefixlen); @@ -376,6 +379,10 @@ static long trie_update_elem(struct bpf_map *map, * simply assign the @new_node to that slot and be done. */ if (!node) { + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + rcu_assign_pointer(*slot, new_node); goto out; } @@ -384,18 +391,30 @@ static long trie_update_elem(struct bpf_map *map, * which already has the correct data array set. */ if (node->prefixlen == matchlen) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) { + if (flags == BPF_NOEXIST) { + ret = -EEXIST; + goto out; + } + } else { + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + } + new_node->child[0] = node->child[0]; new_node->child[1] = node->child[1]; - if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) - trie->n_entries--; - rcu_assign_pointer(*slot, new_node); free_node = node; goto out; } + ret = trie_check_add_elem(trie, flags); + if (ret) + goto out; + /* If the new node matches the prefix completely, it must be inserted * as an ancestor. Simply insert it between @node and *@slot. */ @@ -406,8 +425,10 @@ static long trie_update_elem(struct bpf_map *map, goto out; } - im_node = lpm_trie_node_alloc(trie, NULL); + /* migration is disabled within the locked scope */ + im_node = lpm_trie_node_alloc(trie, NULL, false); if (!im_node) { + trie->n_entries--; ret = -ENOMEM; goto out; } @@ -429,16 +450,13 @@ static long trie_update_elem(struct bpf_map *map, rcu_assign_pointer(*slot, im_node); out: - if (ret) { - if (new_node) - trie->n_entries--; + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree(new_node); - kfree(im_node); - } - - spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree_rcu(free_node, rcu); + migrate_disable(); + if (ret) + bpf_mem_cache_free(&trie->ma, new_node); + bpf_mem_cache_free_rcu(&trie->ma, free_node); + migrate_enable(); return ret; } @@ -459,7 +477,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) if (key->prefixlen > trie->max_prefixlen) return -EINVAL; - spin_lock_irqsave(&trie->lock, irq_flags); + raw_spin_lock_irqsave(&trie->lock, irq_flags); /* Walk the tree looking for an exact key/length match and keeping * track of the path we traverse. We will need to know the node @@ -535,9 +553,12 @@ static long trie_delete_elem(struct bpf_map *map, void *_key) free_node = node; out: - spin_unlock_irqrestore(&trie->lock, irq_flags); - kfree_rcu(free_parent, rcu); - kfree_rcu(free_node, rcu); + raw_spin_unlock_irqrestore(&trie->lock, irq_flags); + + migrate_disable(); + bpf_mem_cache_free_rcu(&trie->ma, free_parent); + bpf_mem_cache_free_rcu(&trie->ma, free_node); + migrate_enable(); return ret; } @@ -559,6 +580,8 @@ out: static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; + size_t leaf_size; + int err; /* check sanity of attributes */ if (attr->max_entries == 0 || @@ -581,9 +604,19 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key_u8, data); trie->max_prefixlen = trie->data_size * 8; - spin_lock_init(&trie->lock); + raw_spin_lock_init(&trie->lock); + /* Allocate intermediate and leaf nodes from the same allocator */ + leaf_size = sizeof(struct lpm_trie_node) + trie->data_size + + trie->map.value_size; + err = bpf_mem_alloc_init(&trie->ma, leaf_size, false); + if (err) + goto free_out; return &trie->map; + +free_out: + bpf_map_area_free(trie); + return ERR_PTR(err); } static void trie_free(struct bpf_map *map) @@ -615,13 +648,17 @@ static void trie_free(struct bpf_map *map) continue; } - kfree(node); + /* No bpf program may access the map, so freeing the + * node without waiting for the extra RCU GP. + */ + bpf_mem_cache_raw_free(node); RCU_INIT_POINTER(*slot, NULL); break; } } out: + bpf_mem_alloc_destroy(&trie->ma); bpf_map_area_free(trie); } @@ -633,7 +670,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) struct lpm_trie_node **node_stack = NULL; int err = 0, stack_ptr = -1; unsigned int next_bit; - size_t matchlen; + size_t matchlen = 0; /* The get_next_key follows postorder. For the 4 node example in * the top of this file, the trie_get_next_key() returns the following @@ -672,7 +709,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) next_bit = extract_bit(key->data, node->prefixlen); node = rcu_dereference(node->child[next_bit]); } - if (!node || node->prefixlen != key->prefixlen || + if (!node || node->prefixlen != matchlen || (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1c4ebb326785..77f56674aaa9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -420,25 +420,6 @@ static struct btf_record *reg_btf_record(const struct bpf_reg_state *reg) return rec; } -static bool mask_raw_tp_reg_cond(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) { - return reg->type == (PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL) && - bpf_prog_is_raw_tp(env->prog) && !reg->ref_obj_id; -} - -static bool mask_raw_tp_reg(const struct bpf_verifier_env *env, struct bpf_reg_state *reg) -{ - if (!mask_raw_tp_reg_cond(env, reg)) - return false; - reg->type &= ~PTR_MAYBE_NULL; - return true; -} - -static void unmask_raw_tp_reg(struct bpf_reg_state *reg, bool result) -{ - if (result) - reg->type |= PTR_MAYBE_NULL; -} - static bool subprog_is_global(const struct bpf_verifier_env *env, int subprog) { struct bpf_func_info_aux *aux = env->prog->aux->func_info_aux; @@ -1202,14 +1183,17 @@ static bool is_spilled_scalar_reg64(const struct bpf_stack_state *stack) /* Mark stack slot as STACK_MISC, unless it is already STACK_INVALID, in which * case they are equivalent, or it's STACK_ZERO, in which case we preserve * more precise STACK_ZERO. - * Note, in uprivileged mode leaving STACK_INVALID is wrong, so we take - * env->allow_ptr_leaks into account and force STACK_MISC, if necessary. + * Regardless of allow_ptr_leaks setting (i.e., privileged or unprivileged + * mode), we won't promote STACK_INVALID to STACK_MISC. In privileged case it is + * unnecessary as both are considered equivalent when loading data and pruning, + * in case of unprivileged mode it will be incorrect to allow reads of invalid + * slots. */ static void mark_stack_slot_misc(struct bpf_verifier_env *env, u8 *stype) { if (*stype == STACK_ZERO) return; - if (env->allow_ptr_leaks && *stype == STACK_INVALID) + if (*stype == STACK_INVALID) return; *stype = STACK_MISC; } @@ -2594,16 +2578,36 @@ static int cmp_subprogs(const void *a, const void *b) ((struct bpf_subprog_info *)b)->start; } +/* Find subprogram that contains instruction at 'off' */ +static struct bpf_subprog_info *find_containing_subprog(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *vals = env->subprog_info; + int l, r, m; + + if (off >= env->prog->len || off < 0 || env->subprog_cnt == 0) + return NULL; + + l = 0; + r = env->subprog_cnt - 1; + while (l < r) { + m = l + (r - l + 1) / 2; + if (vals[m].start <= off) + l = m; + else + r = m - 1; + } + return &vals[l]; +} + +/* Find subprogram that starts exactly at 'off' */ static int find_subprog(struct bpf_verifier_env *env, int off) { struct bpf_subprog_info *p; - p = bsearch(&off, env->subprog_info, env->subprog_cnt, - sizeof(env->subprog_info[0]), cmp_subprogs); - if (!p) + p = find_containing_subprog(env, off); + if (!p || p->start != off) return -ENOENT; return p - env->subprog_info; - } static int add_subprog(struct bpf_verifier_env *env, int off) @@ -4700,6 +4704,7 @@ static int check_stack_write_fixed_off(struct bpf_verifier_env *env, */ if (!env->allow_ptr_leaks && is_spilled_reg(&state->stack[spi]) && + !is_spilled_scalar_reg(&state->stack[spi]) && size != BPF_REG_SIZE) { verbose(env, "attempt to corrupt spilled pointer on stack\n"); return -EACCES; @@ -6777,7 +6782,6 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, const char *field_name = NULL; enum bpf_type_flag flag = 0; u32 btf_id = 0; - bool mask; int ret; if (!env->allow_ptr_leaks) { @@ -6849,21 +6853,7 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, if (ret < 0) return ret; - /* For raw_tp progs, we allow dereference of PTR_MAYBE_NULL - * trusted PTR_TO_BTF_ID, these are the ones that are possibly - * arguments to the raw_tp. Since internal checks in for trusted - * reg in check_ptr_to_btf_access would consider PTR_MAYBE_NULL - * modifier as problematic, mask it out temporarily for the - * check. Don't apply this to pointers with ref_obj_id > 0, as - * those won't be raw_tp args. - * - * We may end up applying this relaxation to other trusted - * PTR_TO_BTF_ID with maybe null flag, since we cannot - * distinguish PTR_MAYBE_NULL tagged for arguments vs normal - * tagging, but that should expand allowed behavior, and not - * cause regression for existing behavior. - */ - mask = mask_raw_tp_reg(env, reg); + if (ret != PTR_TO_BTF_ID) { /* just mark; */ @@ -6924,13 +6914,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, clear_trusted_flags(&flag); } - if (atype == BPF_READ && value_regno >= 0) { + if (atype == BPF_READ && value_regno >= 0) mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id, flag); - /* We've assigned a new type to regno, so don't undo masking. */ - if (regno == value_regno) - mask = false; - } - unmask_raw_tp_reg(reg, mask); return 0; } @@ -7305,7 +7290,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (base_type(reg->type) == PTR_TO_BTF_ID && - (mask_raw_tp_reg_cond(env, reg) || !type_may_be_null(reg->type))) { + !type_may_be_null(reg->type)) { err = check_ptr_to_btf_access(env, regs, regno, off, size, t, value_regno); } else if (reg->type == CONST_PTR_TO_MAP) { @@ -8071,7 +8056,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn if (reg->type != PTR_TO_STACK && reg->type != CONST_PTR_TO_DYNPTR) { verbose(env, "arg#%d expected pointer to stack or const struct bpf_dynptr\n", - regno); + regno - 1); return -EINVAL; } @@ -8125,7 +8110,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn if (!is_dynptr_reg_valid_init(env, reg)) { verbose(env, "Expected an initialized dynptr as arg #%d\n", - regno); + regno - 1); return -EINVAL; } @@ -8133,7 +8118,7 @@ static int process_dynptr_func(struct bpf_verifier_env *env, int regno, int insn if (!is_dynptr_type_expected(env, reg, arg_type & ~MEM_RDONLY)) { verbose(env, "Expected a dynptr of type %s as arg #%d\n", - dynptr_type_str(arg_to_dynptr_type(arg_type)), regno); + dynptr_type_str(arg_to_dynptr_type(arg_type)), regno - 1); return -EINVAL; } @@ -8189,6 +8174,11 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id const struct btf_type *t; int spi, err, i, nr_slots, btf_id; + if (reg->type != PTR_TO_STACK) { + verbose(env, "arg#%d expected pointer to an iterator on stack\n", regno - 1); + return -EINVAL; + } + /* For iter_{new,next,destroy} functions, btf_check_iter_kfuncs() * ensures struct convention, so we wouldn't need to do any BTF * validation here. But given iter state can be passed as a parameter @@ -8197,7 +8187,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id */ btf_id = btf_check_iter_arg(meta->btf, meta->func_proto, regno - 1); if (btf_id < 0) { - verbose(env, "expected valid iter pointer as arg #%d\n", regno); + verbose(env, "expected valid iter pointer as arg #%d\n", regno - 1); return -EINVAL; } t = btf_type_by_id(meta->btf, btf_id); @@ -8207,7 +8197,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id /* bpf_iter_<type>_new() expects pointer to uninit iter state */ if (!is_iter_reg_valid_uninit(env, reg, nr_slots)) { verbose(env, "expected uninitialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno); + iter_type_str(meta->btf, btf_id), regno - 1); return -EINVAL; } @@ -8231,7 +8221,7 @@ static int process_iter_arg(struct bpf_verifier_env *env, int regno, int insn_id break; case -EINVAL: verbose(env, "expected an initialized iter_%s as arg #%d\n", - iter_type_str(meta->btf, btf_id), regno); + iter_type_str(meta->btf, btf_id), regno - 1); return err; case -EPROTO: verbose(env, "expected an RCU CS when using %s\n", meta->func_name); @@ -9003,7 +8993,6 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, enum bpf_reg_type type = reg->type; u32 *arg_btf_id = NULL; int err = 0; - bool mask; if (arg_type == ARG_DONTCARE) return 0; @@ -9044,11 +9033,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg, base_type(arg_type) == ARG_PTR_TO_SPIN_LOCK) arg_btf_id = fn->arg_btf_id[arg]; - mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg_type, arg_btf_id, meta); + if (err) + return err; - err = err ?: check_func_arg_reg_off(env, reg, regno, arg_type); - unmask_raw_tp_reg(reg, mask); + err = check_func_arg_reg_off(env, reg, regno, arg_type); if (err) return err; @@ -9843,17 +9832,14 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, return ret; } else if (base_type(arg->arg_type) == ARG_PTR_TO_BTF_ID) { struct bpf_call_arg_meta meta; - bool mask; int err; if (register_is_null(reg) && type_may_be_null(arg->arg_type)) continue; memset(&meta, 0, sizeof(meta)); /* leave func_id as zero */ - mask = mask_raw_tp_reg(env, reg); err = check_reg_type(env, regno, arg->arg_type, &arg->btf_id, &meta); err = err ?: check_func_arg_reg_off(env, reg, regno, arg->arg_type); - unmask_raw_tp_reg(reg, mask); if (err) return err; } else { @@ -10013,6 +9999,8 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, verbose(env, "Func#%d ('%s') is global and assumed valid.\n", subprog, sub_name); + if (env->subprog_info[subprog].changes_pkt_data) + clear_all_pkt_pointers(env); /* mark global subprog for verifying after main prog */ subprog_aux(env, subprog)->called = true; clear_caller_saved_regs(env, caller->regs); @@ -10699,7 +10687,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn } /* With LD_ABS/IND some JITs save/restore skb from r1. */ - changes_data = bpf_helper_changes_pkt_data(fn->func); + changes_data = bpf_helper_changes_pkt_data(func_id); if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", func_id_name(func_id), func_id); @@ -12174,7 +12162,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ enum bpf_arg_type arg_type = ARG_DONTCARE; u32 regno = i + 1, ref_id, type_size; bool is_ret_buf_sz = false; - bool mask = false; int kf_arg_type; t = btf_type_skip_modifiers(btf, args[i].type, NULL); @@ -12233,15 +12220,12 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ return -EINVAL; } - mask = mask_raw_tp_reg(env, reg); if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) && (register_is_null(reg) || type_may_be_null(reg->type)) && !is_kfunc_arg_nullable(meta->btf, &args[i])) { verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); - unmask_raw_tp_reg(reg, mask); return -EACCES; } - unmask_raw_tp_reg(reg, mask); if (reg->ref_obj_id) { if (is_kfunc_release(meta) && meta->ref_obj_id) { @@ -12299,24 +12283,16 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (!is_kfunc_trusted_args(meta) && !is_kfunc_rcu(meta)) break; - /* Allow passing maybe NULL raw_tp arguments to - * kfuncs for compatibility. Don't apply this to - * arguments with ref_obj_id > 0. - */ - mask = mask_raw_tp_reg(env, reg); if (!is_trusted_reg(reg)) { if (!is_kfunc_rcu(meta)) { verbose(env, "R%d must be referenced or trusted\n", regno); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } if (!is_rcu_reg(reg)) { verbose(env, "R%d must be a rcu pointer\n", regno); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } } - unmask_raw_tp_reg(reg, mask); fallthrough; case KF_ARG_PTR_TO_CTX: case KF_ARG_PTR_TO_DYNPTR: @@ -12339,9 +12315,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ if (is_kfunc_release(meta) && reg->ref_obj_id) arg_type |= OBJ_RELEASE; - mask = mask_raw_tp_reg(env, reg); ret = check_func_arg_reg_off(env, reg, regno, arg_type); - unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; @@ -12518,7 +12492,6 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ref_tname = btf_name_by_offset(btf, ref_t->name_off); fallthrough; case KF_ARG_PTR_TO_BTF_ID: - mask = mask_raw_tp_reg(env, reg); /* Only base_type is checked, further checks are done here */ if ((base_type(reg->type) != PTR_TO_BTF_ID || (bpf_type_has_unsafe_modifiers(reg->type) && !is_rcu_reg(reg))) && @@ -12527,11 +12500,9 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ verbose(env, "expected %s or socket\n", reg_type_str(env, base_type(reg->type) | (type_flag(reg->type) & BPF_REG_TRUSTED_MODIFIERS))); - unmask_raw_tp_reg(reg, mask); return -EINVAL; } ret = process_kf_arg_ptr_to_btf_id(env, reg, ref_t, ref_tname, ref_id, meta, i); - unmask_raw_tp_reg(reg, mask); if (ret < 0) return ret; break; @@ -13504,7 +13475,7 @@ static int sanitize_check_bounds(struct bpf_verifier_env *env, */ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, - struct bpf_reg_state *ptr_reg, + const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { struct bpf_verifier_state *vstate = env->cur_state; @@ -13518,7 +13489,6 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; - bool mask; int ret; dst_reg = ®s[dst]; @@ -13545,14 +13515,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; } - mask = mask_raw_tp_reg(env, ptr_reg); if (ptr_reg->type & PTR_MAYBE_NULL) { verbose(env, "R%d pointer arithmetic on %s prohibited, null-check it first\n", dst, reg_type_str(env, ptr_reg->type)); - unmask_raw_tp_reg(ptr_reg, mask); return -EACCES; } - unmask_raw_tp_reg(ptr_reg, mask); switch (base_type(ptr_reg->type)) { case PTR_TO_CTX: @@ -16217,6 +16184,29 @@ enforce_retval: return 0; } +static void mark_subprog_changes_pkt_data(struct bpf_verifier_env *env, int off) +{ + struct bpf_subprog_info *subprog; + + subprog = find_containing_subprog(env, off); + subprog->changes_pkt_data = true; +} + +/* 't' is an index of a call-site. + * 'w' is a callee entry point. + * Eventually this function would be called when env->cfg.insn_state[w] == EXPLORED. + * Rely on DFS traversal order and absence of recursive calls to guarantee that + * callee's change_pkt_data marks would be correct at that moment. + */ +static void merge_callee_effects(struct bpf_verifier_env *env, int t, int w) +{ + struct bpf_subprog_info *caller, *callee; + + caller = find_containing_subprog(env, t); + callee = find_containing_subprog(env, w); + caller->changes_pkt_data |= callee->changes_pkt_data; +} + /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 label v as discovered @@ -16350,6 +16340,7 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, bool visit_callee) { int ret, insn_sz; + int w; insn_sz = bpf_is_ldimm64(&insns[t]) ? 2 : 1; ret = push_insn(t, t + insn_sz, FALLTHROUGH, env); @@ -16361,8 +16352,10 @@ static int visit_func_call_insn(int t, struct bpf_insn *insns, mark_jmp_point(env, t + insn_sz); if (visit_callee) { + w = t + insns[t].imm + 1; mark_prune_point(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + merge_callee_effects(env, t, w); + ret = push_insn(t, w, BRANCH, env); } return ret; } @@ -16679,6 +16672,8 @@ static int visit_insn(int t, struct bpf_verifier_env *env) mark_prune_point(env, t); mark_jmp_point(env, t); } + if (bpf_helper_call(insn) && bpf_helper_changes_pkt_data(insn->imm)) + mark_subprog_changes_pkt_data(env, t); if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) { struct bpf_kfunc_call_arg_meta meta; @@ -16813,6 +16808,7 @@ walk_cfg: } } ret = 0; /* cfg looks good */ + env->prog->aux->changes_pkt_data = env->subprog_info[0].changes_pkt_data; err_free: kvfree(insn_state); @@ -20066,7 +20062,6 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) * for this case. */ case PTR_TO_BTF_ID | MEM_ALLOC | PTR_UNTRUSTED: - case PTR_TO_BTF_ID | PTR_TRUSTED | PTR_MAYBE_NULL: if (type == BPF_READ) { if (BPF_MODE(insn->code) == BPF_MEM) insn->code = BPF_LDX | BPF_PROBE_MEM | @@ -20302,6 +20297,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) func[i]->aux->num_exentries = num_exentries; func[i]->aux->tail_call_reachable = env->subprog_info[i].tail_call_reachable; func[i]->aux->exception_cb = env->subprog_info[i].is_exception_cb; + func[i]->aux->changes_pkt_data = env->subprog_info[i].changes_pkt_data; if (!i) func[i]->aux->exception_boundary = env->seen_exception; func[i] = bpf_int_jit_compile(func[i]); @@ -21285,11 +21281,15 @@ patch_map_ops_generic: * changed in some incompatible and hard to support * way, it's fine to back out this inlining logic */ +#ifdef CONFIG_SMP insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number); insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0); insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0); cnt = 3; - +#else + insn_buf[0] = BPF_ALU32_REG(BPF_XOR, BPF_REG_0, BPF_REG_0); + cnt = 1; +#endif new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -22132,6 +22132,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, } if (tgt_prog) { struct bpf_prog_aux *aux = tgt_prog->aux; + bool tgt_changes_pkt_data; if (bpf_prog_is_dev_bound(prog->aux) && !bpf_prog_dev_bound_match(prog, tgt_prog)) { @@ -22166,6 +22167,14 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, "Extension programs should be JITed\n"); return -EINVAL; } + tgt_changes_pkt_data = aux->func + ? aux->func[subprog]->aux->changes_pkt_data + : aux->changes_pkt_data; + if (prog->aux->changes_pkt_data && !tgt_changes_pkt_data) { + bpf_log(log, + "Extension program changes packet data, while original does not\n"); + return -EINVAL; + } } if (!tgt_prog->jited) { bpf_log(log, "Can attach to only JITed progs\n"); @@ -22631,10 +22640,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; - ret = check_attach_btf_id(env); - if (ret) - goto skip_full_check; - ret = resolve_pseudo_ldimm64(env); if (ret < 0) goto skip_full_check; @@ -22649,6 +22654,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, bpfptr_t uattr, __u3 if (ret < 0) goto skip_full_check; + ret = check_attach_btf_id(env); + if (ret) + goto skip_full_check; + ret = mark_fastcall_patterns(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/fork.c b/kernel/fork.c index 1450b461d196..9b301180fd41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -639,11 +639,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); - uprobe_start_dup_mmap(); - if (mmap_write_lock_killable(oldmm)) { - retval = -EINTR; - goto fail_uprobe_end; - } + if (mmap_write_lock_killable(oldmm)) + return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* @@ -782,8 +779,6 @@ out: dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: @@ -1692,9 +1687,11 @@ static struct mm_struct *dup_mm(struct task_struct *tsk, if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; + uprobe_start_dup_mmap(); err = dup_mmap(mm, oldmm); if (err) goto free_pt; + uprobe_end_dup_mmap(); mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; @@ -1709,6 +1706,8 @@ free_pt: mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); + if (err) + uprobe_end_dup_mmap(); fail_nomem: return NULL; diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h index 618ce1fe870e..99b32e728c4a 100644 --- a/kernel/futex/futex.h +++ b/kernel/futex/futex.h @@ -265,11 +265,11 @@ static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(val, from, Efault); - user_access_end(); + user_read_access_end(); *dest = val; return 0; Efault: - user_access_end(); + user_read_access_end(); return -EFAULT; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index f36c33bd2da4..8e29809de38d 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -501,17 +501,18 @@ int show_interrupts(struct seq_file *p, void *v) seq_put_decimal_ull_width(p, " ", cnt, 10); } + seq_putc(p, ' '); raw_spin_lock_irqsave(&desc->lock, flags); if (desc->irq_data.chip) { if (desc->irq_data.chip->irq_print_chip) desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); else if (desc->irq_data.chip->name) - seq_printf(p, " %8s", desc->irq_data.chip->name); + seq_printf(p, "%8s", desc->irq_data.chip->name); else - seq_printf(p, " %8s", "-"); + seq_printf(p, "%8s", "-"); } else { - seq_printf(p, " %8s", "None"); + seq_printf(p, "%8s", "None"); } if (desc->irq_data.domain) seq_printf(p, " %*lu", prec, desc->irq_data.hwirq); diff --git a/kernel/kcov.c b/kernel/kcov.c index 28a6be6e64fd..187ba1b80bda 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -166,7 +166,7 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, * Unlike in_serving_softirq(), this function returns false when called during * a hardirq or an NMI that happened in the softirq context. */ -static inline bool in_softirq_really(void) +static __always_inline bool in_softirq_really(void) { return in_serving_softirq() && !in_hardirq() && !in_nmi(); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index ac1365afcc4a..697a56d3d949 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1248,10 +1248,7 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, /* Check whether the waiter should back out immediately */ rtm = container_of(lock, struct rt_mutex, rtmutex); - preempt_disable(); res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx, wake_q); - wake_up_q(wake_q); - preempt_enable(); if (res) { raw_spin_lock(&task->pi_lock); rt_mutex_dequeue(lock, waiter); @@ -1295,7 +1292,13 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, */ get_task_struct(owner); + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + /* wake up any tasks on the wake_q before calling rt_mutex_adjust_prio_chain */ + wake_up_q(wake_q); + wake_q_init(wake_q); + preempt_enable(); + res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); @@ -1599,6 +1602,7 @@ static void __sched remove_waiter(struct rt_mutex_base *lock, * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter + * @wake_q: wake_q of tasks to wake when we drop the lock->wait_lock * * Must be called with lock->wait_lock held and interrupts disabled */ @@ -1606,7 +1610,8 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + struct wake_q_head *wake_q) __releases(&lock->wait_lock) __acquires(&lock->wait_lock) { struct rt_mutex *rtm = container_of(lock, struct rt_mutex, rtmutex); @@ -1637,7 +1642,13 @@ static int __sched rt_mutex_slowlock_block(struct rt_mutex_base *lock, owner = rt_mutex_owner(lock); else owner = NULL; + preempt_disable(); raw_spin_unlock_irq(&lock->wait_lock); + if (wake_q) { + wake_up_q(wake_q); + wake_q_init(wake_q); + } + preempt_enable(); if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner)) rt_mutex_schedule(); @@ -1711,7 +1722,7 @@ static int __sched __rt_mutex_slowlock(struct rt_mutex_base *lock, ret = task_blocks_on_rt_mutex(lock, waiter, current, ww_ctx, chwalk, wake_q); if (likely(!ret)) - ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter); + ret = rt_mutex_slowlock_block(lock, ww_ctx, state, NULL, waiter, wake_q); if (likely(!ret)) { /* acquired the lock */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index 33ea31d6a7b3..191e4720e546 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -383,7 +383,7 @@ int __sched rt_mutex_wait_proxy_lock(struct rt_mutex_base *lock, raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ set_current_state(TASK_INTERRUPTIBLE); - ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter); + ret = rt_mutex_slowlock_block(lock, NULL, TASK_INTERRUPTIBLE, to, waiter, NULL); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. diff --git a/kernel/module/Kconfig b/kernel/module/Kconfig index 7c6588148d42..7b329057997a 100644 --- a/kernel/module/Kconfig +++ b/kernel/module/Kconfig @@ -349,7 +349,7 @@ config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS help Symbols exported with EXPORT_SYMBOL_NS*() are considered exported in a namespace. A module that makes use of a symbol exported with such a - namespace is required to import the namespace via MODULE_IMPORT_NS(). + namespace is required to import the namespace via MODULE_IMPORT_NS(""). There is no technical reason to enforce correct namespace imports, but it creates consistency between symbols defining namespaces and users importing namespaces they make use of. This option relaxes this diff --git a/kernel/resource.c b/kernel/resource.c index c9fd26c06345..b7c0e24d9398 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -998,7 +998,7 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) * to use this interface. The former are built-in and only the latter, * CXL, is a module. */ -EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, CXL); +EXPORT_SYMBOL_NS_GPL(insert_resource_expand_to_fit, "CXL"); /** * remove_resource - Remove a resource in the resource tree diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 95e40895a519..3e5a6bf587f9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1283,9 +1283,9 @@ static void nohz_csd_func(void *info) WARN_ON(!(flags & NOHZ_KICK_MASK)); rq->idle_balance = idle_cpu(cpu); - if (rq->idle_balance && !need_resched()) { + if (rq->idle_balance) { rq->nohz_idle_balance = flags; - raise_softirq_irqoff(SCHED_SOFTIRQ); + __raise_softirq_irqoff(SCHED_SOFTIRQ); } } @@ -1341,7 +1341,7 @@ bool sched_can_stop_tick(struct rq *rq) if (scx_enabled() && !scx_can_stop_tick(rq)) return false; - if (rq->cfs.nr_running > 1) + if (rq->cfs.h_nr_running > 1) return false; /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d9d5a702f1a6..d94f2ed6d1f4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -781,7 +781,7 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, * If it is a deferred reservation, and the server * is not handling an starvation case, defer it. */ - if (dl_se->dl_defer & !dl_se->dl_defer_running) { + if (dl_se->dl_defer && !dl_se->dl_defer_running) { dl_se->dl_throttled = 1; dl_se->dl_defer_armed = 1; } @@ -1647,6 +1647,7 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (!dl_se->dl_runtime) return; + dl_se->dl_server_active = 1; enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) resched_curr(dl_se->rq); @@ -1661,6 +1662,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_server_active = 0; } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, @@ -2042,6 +2044,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && + !is_dl_boosted(dl_se) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_se(dl_se)))) { setup_new_dl_entity(dl_se); } @@ -2420,8 +2423,10 @@ again: if (dl_server(dl_se)) { p = dl_se->server_pick_task(dl_se); if (!p) { - dl_se->dl_yielded = 1; - update_curr_dl_se(rq, dl_se, 0); + if (dl_server_active(dl_se)) { + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + } goto again; } rq->dl_server = dl_se; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index a48b2a701ec2..a1be00a988bf 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -845,6 +845,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_delayed", cfs_rq->h_nr_delayed); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", cfs_rq->idle_nr_running); SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running", diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 7fff1d045477..19d2699cf638 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4763,7 +4763,7 @@ static void scx_ops_bypass(bool bypass) * sees scx_rq_bypassing() before moving tasks to SCX. */ if (!scx_enabled()) { - rq_unlock_irqrestore(rq, &rf); + rq_unlock(rq, &rf); continue; } @@ -7013,7 +7013,7 @@ __bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, return -ENOENT; INIT_LIST_HEAD(&kit->cursor.node); - kit->cursor.flags |= SCX_DSQ_LNODE_ITER_CURSOR | flags; + kit->cursor.flags = SCX_DSQ_LNODE_ITER_CURSOR | flags; kit->cursor.priv = READ_ONCE(kit->dsq->seq); return 0; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fbdca89c677f..3e9ca38512de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1159,8 +1159,6 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) trace_sched_stat_runtime(p, delta_exec); account_group_exec_runtime(p, delta_exec); cgroup_account_cputime(p, delta_exec); - if (p->dl_server) - dl_server_update(p->dl_server, delta_exec); } static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) @@ -1237,11 +1235,16 @@ static void update_curr(struct cfs_rq *cfs_rq) update_curr_task(p, delta_exec); /* - * Any fair task that runs outside of fair_server should - * account against fair_server such that it can account for - * this time and possibly avoid running this period. + * If the fair_server is active, we need to account for the + * fair_server time whether or not the task is running on + * behalf of fair_server or not: + * - If the task is running on behalf of fair_server, we need + * to limit its time based on the assigned runtime. + * - Fair task that runs outside of fair_server should account + * against fair_server such that it can account for this time + * and possibly avoid running this period. */ - if (p->dl_server != &rq->fair_server) + if (dl_server_active(&rq->fair_server)) dl_server_update(&rq->fair_server, delta_exec); } @@ -3399,10 +3402,16 @@ retry_pids: /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { - vma->numab_state = kzalloc(sizeof(struct vma_numab_state), - GFP_KERNEL); - if (!vma->numab_state) + struct vma_numab_state *ptr; + + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + if (!ptr) + continue; + + if (cmpxchg(&vma->numab_state, NULL, ptr)) { + kfree(ptr); continue; + } vma->numab_state->start_scan_seq = mm->numa_scan_seq; @@ -5465,9 +5474,33 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); -static inline void finish_delayed_dequeue_entity(struct sched_entity *se) +static void set_delayed(struct sched_entity *se) +{ + se->sched_delayed = 1; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->h_nr_delayed++; + if (cfs_rq_throttled(cfs_rq)) + break; + } +} + +static void clear_delayed(struct sched_entity *se) { se->sched_delayed = 0; + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + cfs_rq->h_nr_delayed--; + if (cfs_rq_throttled(cfs_rq)) + break; + } +} + +static inline void finish_delayed_dequeue_entity(struct sched_entity *se) +{ + clear_delayed(se); if (sched_feat(DELAY_ZERO) && se->vlag > 0) se->vlag = 0; } @@ -5478,6 +5511,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) bool sleep = flags & DEQUEUE_SLEEP; update_curr(cfs_rq); + clear_buddies(cfs_rq, se); if (flags & DEQUEUE_DELAYED) { SCHED_WARN_ON(!se->sched_delayed); @@ -5494,10 +5528,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (sched_feat(DELAY_DEQUEUE) && delay && !entity_eligible(cfs_rq, se)) { - if (cfs_rq->next == se) - cfs_rq->next = NULL; update_load_avg(cfs_rq, se, 0); - se->sched_delayed = 1; + set_delayed(se); return false; } } @@ -5520,8 +5552,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_stats_dequeue_fair(cfs_rq, se, flags); - clear_buddies(cfs_rq, se); - update_entity_lag(cfs_rq, se); if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { se->deadline -= se->vruntime; @@ -5911,7 +5941,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta, dequeue = 1; + long task_delta, idle_task_delta, delayed_delta, dequeue = 1; long rq_h_nr_running = rq->cfs.h_nr_running; raw_spin_lock(&cfs_b->lock); @@ -5944,6 +5974,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; + delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); int flags; @@ -5967,6 +5998,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_delayed -= delayed_delta; if (qcfs_rq->load.weight) { /* Avoid re-evaluating load for this entity: */ @@ -5989,6 +6021,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + qcfs_rq->h_nr_delayed -= delayed_delta; } /* At this point se is NULL and we are at root level*/ @@ -6014,7 +6047,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; - long task_delta, idle_task_delta; + long task_delta, idle_task_delta, delayed_delta; long rq_h_nr_running = rq->cfs.h_nr_running; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -6050,6 +6083,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) task_delta = cfs_rq->h_nr_running; idle_task_delta = cfs_rq->idle_h_nr_running; + delayed_delta = cfs_rq->h_nr_delayed; for_each_sched_entity(se) { struct cfs_rq *qcfs_rq = cfs_rq_of(se); @@ -6067,6 +6101,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_delayed += delayed_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6084,6 +6119,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) qcfs_rq->h_nr_running += task_delta; qcfs_rq->idle_h_nr_running += idle_task_delta; + qcfs_rq->h_nr_delayed += delayed_delta; /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) @@ -6937,7 +6973,7 @@ requeue_delayed_entity(struct sched_entity *se) } update_load_avg(cfs_rq, se, 0); - se->sched_delayed = 0; + clear_delayed(se); } /* @@ -6951,6 +6987,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int h_nr_delayed = 0; int task_new = !(flags & ENQUEUE_WAKEUP); int rq_h_nr_running = rq->cfs.h_nr_running; u64 slice = 0; @@ -6977,6 +7014,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + if (task_new) + h_nr_delayed = !!se->sched_delayed; + for_each_sched_entity(se) { if (se->on_rq) { if (se->sched_delayed) @@ -6999,6 +7039,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + cfs_rq->h_nr_delayed += h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -7022,6 +7063,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + cfs_rq->h_nr_delayed += h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = 1; @@ -7084,6 +7126,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) struct task_struct *p = NULL; int idle_h_nr_running = 0; int h_nr_running = 0; + int h_nr_delayed = 0; struct cfs_rq *cfs_rq; u64 slice = 0; @@ -7091,6 +7134,8 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) p = task_of(se); h_nr_running = 1; idle_h_nr_running = task_has_idle_policy(p); + if (!task_sleep && !task_delayed) + h_nr_delayed = !!se->sched_delayed; } else { cfs_rq = group_cfs_rq(se); slice = cfs_rq_min_slice(cfs_rq); @@ -7108,6 +7153,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) cfs_rq->h_nr_running -= h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = h_nr_running; @@ -7146,6 +7192,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) cfs_rq->h_nr_running -= h_nr_running; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + cfs_rq->h_nr_delayed -= h_nr_delayed; if (cfs_rq_is_idle(cfs_rq)) idle_h_nr_running = h_nr_running; @@ -8774,7 +8821,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) { + if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { set_next_buddy(pse); } @@ -12568,7 +12615,7 @@ static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags) * work being done for other CPUs. Next load * balancing owner will pick it up. */ - if (need_resched()) { + if (!idle_cpu(this_cpu) && need_resched()) { if (flags & NOHZ_STATS_KICK) has_blocked_load = true; if (flags & NOHZ_NEXT_KICK) diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index fc07382361a8..fee75cc2c47b 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -321,7 +321,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) { if (___update_load_sum(now, &cfs_rq->avg, scale_load_down(cfs_rq->load.weight), - cfs_rq->h_nr_running, + cfs_rq->h_nr_running - cfs_rq->h_nr_delayed, cfs_rq->curr != NULL)) { ___update_load_avg(&cfs_rq->avg, 1); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 76f5f53a645f..c5d67a43fe52 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -398,6 +398,11 @@ extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq extern int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init); +static inline bool dl_server_active(struct sched_dl_entity *dl_se) +{ + return dl_se->dl_server_active; +} + #ifdef CONFIG_CGROUP_SCHED extern struct list_head task_groups; @@ -649,6 +654,7 @@ struct cfs_rq { unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */ unsigned int idle_nr_running; /* SCHED_IDLE */ unsigned int idle_h_nr_running; /* SCHED_IDLE */ + unsigned int h_nr_delayed; s64 avg_vruntime; u64 avg_load; @@ -898,8 +904,11 @@ struct dl_rq { static inline void se_update_runnable(struct sched_entity *se) { - if (!entity_is_task(se)) - se->runnable_weight = se->my_q->h_nr_running; + if (!entity_is_task(se)) { + struct cfs_rq *cfs_rq = se->my_q; + + se->runnable_weight = cfs_rq->h_nr_running - cfs_rq->h_nr_delayed; + } } static inline long se_runnable(struct sched_entity *se) diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 0d71fcbaf1e3..ff0e5ab4e37c 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -1200,7 +1200,7 @@ int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) bool empty = !cpumask_and(new_mask, new_mask, ctx->user_mask); - if (WARN_ON_ONCE(empty)) + if (empty) cpumask_copy(new_mask, cpus_allowed); } __set_cpus_allowed_ptr(p, ctx); diff --git a/kernel/softirq.c b/kernel/softirq.c index 8b41bd13cc3d..4dae6ac2e83f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -280,17 +280,24 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +#define SCHED_SOFTIRQ_MASK BIT(SCHED_SOFTIRQ) + /* * flush_smp_call_function_queue() can raise a soft interrupt in a function - * call. On RT kernels this is undesired and the only known functionality - * in the block layer which does this is disabled on RT. If soft interrupts - * get raised which haven't been raised before the flush, warn so it can be + * call. On RT kernels this is undesired and the only known functionalities + * are in the block layer which is disabled on RT, and in the scheduler for + * idle load balancing. If soft interrupts get raised which haven't been + * raised before the flush, warn if it is not a SCHED_SOFTIRQ so it can be * investigated. */ void do_softirq_post_smp_call_flush(unsigned int was_pending) { - if (WARN_ON_ONCE(was_pending != local_softirq_pending())) + unsigned int is_pending = local_softirq_pending(); + + if (unlikely(was_pending != is_pending)) { + WARN_ON_ONCE(was_pending != (is_pending & ~SCHED_SOFTIRQ_MASK)); invoke_softirq(); + } } #else /* CONFIG_PREEMPT_RT */ diff --git a/kernel/static_call_inline.c b/kernel/static_call_inline.c index 5259cda486d0..bb7d066a7c39 100644 --- a/kernel/static_call_inline.c +++ b/kernel/static_call_inline.c @@ -15,7 +15,7 @@ extern struct static_call_site __start_static_call_sites[], extern struct static_call_tramp_key __start_static_call_tramp_key[], __stop_static_call_tramp_key[]; -static int static_call_initialized; +int static_call_initialized; /* * Must be called before early_initcall() to be effective. diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index aab6472853fa..7304d7cf47f2 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -24,7 +24,7 @@ static void clocksource_enqueue(struct clocksource *cs); static noinline u64 cycles_to_nsec_safe(struct clocksource *cs, u64 start, u64 end) { - u64 delta = clocksource_delta(end, start, cs->mask); + u64 delta = clocksource_delta(end, start, cs->mask, cs->max_raw_delta); if (likely(delta < cs->max_cycles)) return clocksource_cyc2ns(delta, cs->mult, cs->shift); @@ -993,6 +993,15 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, cs->mask, &cs->max_cycles); + + /* + * Threshold for detecting negative motion in clocksource_delta(). + * + * Allow for 0.875 of the counter width so that overly long idle + * sleeps, which go slightly over mask/2, do not trigger the + * negative motion detection. + */ + cs->max_raw_delta = (cs->mask >> 1) + (cs->mask >> 2) + (cs->mask >> 3); } static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0ca85ff4fbb4..3d128825d343 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -755,7 +755,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) u64 cycle_now, delta; cycle_now = tk_clock_read(&tk->tkr_mono); - delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); tk->tkr_mono.cycle_last = cycle_now; tk->tkr_raw.cycle_last = cycle_now; @@ -2230,7 +2231,8 @@ static bool timekeeping_advance(enum timekeeping_adv_mode mode) return false; offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), - tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last, tk->tkr_mono.mask, + tk->tkr_mono.clock->max_raw_delta); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 63e600e943a7..8c9079108ffb 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -30,15 +30,15 @@ static inline void timekeeping_inc_mg_floor_swaps(void) #endif -static inline u64 clocksource_delta(u64 now, u64 last, u64 mask) +static inline u64 clocksource_delta(u64 now, u64 last, u64 mask, u64 max_delta) { u64 ret = (now - last) & mask; /* - * Prevent time going backwards by checking the MSB of mask in - * the result. If set, return 0. + * Prevent time going backwards by checking the result against + * @max_delta. If greater, return 0. */ - return ret & ~(mask >> 1) ? 0 : ret; + return ret > max_delta ? 0 : ret; } /* Semi public for serialization of non timekeeper VDSO updates. */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 949a3870946c..1b8db5aee9d3 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2250,6 +2250,9 @@ void perf_event_detach_bpf_prog(struct perf_event *event) goto unlock; old_array = bpf_event_rcu_dereference(event->tp_event->prog_array); + if (!old_array) + goto put; + ret = bpf_prog_array_copy(old_array, event->prog, NULL, 0, &new_array); if (ret < 0) { bpf_prog_array_delete_safe(old_array, event->prog); @@ -2258,6 +2261,14 @@ void perf_event_detach_bpf_prog(struct perf_event *event) bpf_prog_array_free_sleepable(old_array); } +put: + /* + * It could be that the bpf_prog is not sleepable (and will be freed + * via normal RCU), but is called from a point that supports sleepable + * programs and uses tasks-trace-RCU. + */ + synchronize_rcu_tasks_trace(); + bpf_prog_put(event->prog); event->prog = NULL; diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 0bf78517b5d4..30e3ddc8a8a8 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -833,7 +833,7 @@ static unsigned long __ftrace_return_to_handler(struct fgraph_ret_regs *ret_regs #endif { for_each_set_bit(i, &bitmap, sizeof(bitmap) * BITS_PER_BYTE) { - struct fgraph_ops *gops = fgraph_array[i]; + struct fgraph_ops *gops = READ_ONCE(fgraph_array[i]); if (gops == &fgraph_stub) continue; @@ -1215,7 +1215,7 @@ void fgraph_update_pid_func(void) static int start_graph_tracing(void) { unsigned long **ret_stack_list; - int ret; + int ret, cpu; ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, sizeof(*ret_stack_list), GFP_KERNEL); @@ -1223,6 +1223,12 @@ static int start_graph_tracing(void) if (!ret_stack_list) return -ENOMEM; + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + do { ret = alloc_retstack_tasklist(ret_stack_list); } while (ret == -EAGAIN); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9b17efb1a87d..2e113f8b13a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -902,16 +902,13 @@ static void profile_graph_return(struct ftrace_graph_ret *trace, } static struct fgraph_ops fprofiler_ops = { - .ops = { - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(fprofiler_ops.ops) - }, .entryfunc = &profile_graph_entry, .retfunc = &profile_graph_return, }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&fprofiler_ops.ops); return register_ftrace_graph(&fprofiler_ops); } @@ -922,12 +919,11 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(ftrace_profile_ops) }; static int register_ftrace_profiler(void) { + ftrace_ops_set_global_filter(&ftrace_profile_ops); return register_ftrace_function(&ftrace_profile_ops); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7e257e855dd1..60210fb5b211 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7019,7 +7019,11 @@ static int __rb_map_vma(struct ring_buffer_per_cpu *cpu_buffer, lockdep_assert_held(&cpu_buffer->mapping_lock); nr_subbufs = cpu_buffer->nr_pages + 1; /* + reader-subbuf */ - nr_pages = ((nr_subbufs + 1) << subbuf_order) - pgoff; /* + meta-page */ + nr_pages = ((nr_subbufs + 1) << subbuf_order); /* + meta-page */ + if (nr_pages <= pgoff) + return -EINVAL; + + nr_pages -= pgoff; nr_vma_pages = vma_pages(vma); if (!nr_vma_pages || nr_vma_pages > nr_pages) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index be62f0ea1814..f8aebcb01e62 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3611,17 +3611,12 @@ char *trace_iter_expand_format(struct trace_iterator *iter) } /* Returns true if the string is safe to dereference from an event */ -static bool trace_safe_str(struct trace_iterator *iter, const char *str, - bool star, int len) +static bool trace_safe_str(struct trace_iterator *iter, const char *str) { unsigned long addr = (unsigned long)str; struct trace_event *trace_event; struct trace_event_call *event; - /* Ignore strings with no length */ - if (star && !len) - return true; - /* OK if part of the event data */ if ((addr >= (unsigned long)iter->ent) && (addr < (unsigned long)iter->ent + iter->ent_size)) @@ -3661,181 +3656,69 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str, return false; } -static DEFINE_STATIC_KEY_FALSE(trace_no_verify); - -static int test_can_verify_check(const char *fmt, ...) -{ - char buf[16]; - va_list ap; - int ret; - - /* - * The verifier is dependent on vsnprintf() modifies the va_list - * passed to it, where it is sent as a reference. Some architectures - * (like x86_32) passes it by value, which means that vsnprintf() - * does not modify the va_list passed to it, and the verifier - * would then need to be able to understand all the values that - * vsnprintf can use. If it is passed by value, then the verifier - * is disabled. - */ - va_start(ap, fmt); - vsnprintf(buf, 16, "%d", ap); - ret = va_arg(ap, int); - va_end(ap); - - return ret; -} - -static void test_can_verify(void) -{ - if (!test_can_verify_check("%d %d", 0, 1)) { - pr_info("trace event string verifier disabled\n"); - static_branch_inc(&trace_no_verify); - } -} - /** - * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer + * ignore_event - Check dereferenced fields while writing to the seq buffer * @iter: The iterator that holds the seq buffer and the event being printed - * @fmt: The format used to print the event - * @ap: The va_list holding the data to print from @fmt. * - * This writes the data into the @iter->seq buffer using the data from - * @fmt and @ap. If the format has a %s, then the source of the string - * is examined to make sure it is safe to print, otherwise it will - * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string - * pointer. + * At boot up, test_event_printk() will flag any event that dereferences + * a string with "%s" that does exist in the ring buffer. It may still + * be valid, as the string may point to a static string in the kernel + * rodata that never gets freed. But if the string pointer is pointing + * to something that was allocated, there's a chance that it can be freed + * by the time the user reads the trace. This would cause a bad memory + * access by the kernel and possibly crash the system. + * + * This function will check if the event has any fields flagged as needing + * to be checked at runtime and perform those checks. + * + * If it is found that a field is unsafe, it will write into the @iter->seq + * a message stating what was found to be unsafe. + * + * @return: true if the event is unsafe and should be ignored, + * false otherwise. */ -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) +bool ignore_event(struct trace_iterator *iter) { - long text_delta = 0; - long data_delta = 0; - const char *p = fmt; - const char *str; - bool good; - int i, j; + struct ftrace_event_field *field; + struct trace_event *trace_event; + struct trace_event_call *event; + struct list_head *head; + struct trace_seq *seq; + const void *ptr; - if (WARN_ON_ONCE(!fmt)) - return; + trace_event = ftrace_find_event(iter->ent->type); - if (static_branch_unlikely(&trace_no_verify)) - goto print; + seq = &iter->seq; - /* - * When the kernel is booted with the tp_printk command line - * parameter, trace events go directly through to printk(). - * It also is checked by this function, but it does not - * have an associated trace_array (tr) for it. - */ - if (iter->tr) { - text_delta = iter->tr->text_delta; - data_delta = iter->tr->data_delta; + if (!trace_event) { + trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); + return true; } - /* Don't bother checking when doing a ftrace_dump() */ - if (iter->fmt == static_fmt_buf) - goto print; - - while (*p) { - bool star = false; - int len = 0; - - j = 0; - - /* - * We only care about %s and variants - * as well as %p[sS] if delta is non-zero - */ - for (i = 0; p[i]; i++) { - if (i + 1 >= iter->fmt_size) { - /* - * If we can't expand the copy buffer, - * just print it. - */ - if (!trace_iter_expand_format(iter)) - goto print; - } - - if (p[i] == '\\' && p[i+1]) { - i++; - continue; - } - if (p[i] == '%') { - /* Need to test cases like %08.*s */ - for (j = 1; p[i+j]; j++) { - if (isdigit(p[i+j]) || - p[i+j] == '.') - continue; - if (p[i+j] == '*') { - star = true; - continue; - } - break; - } - if (p[i+j] == 's') - break; - - if (text_delta && p[i+1] == 'p' && - ((p[i+2] == 's' || p[i+2] == 'S'))) - break; - - star = false; - } - j = 0; - } - /* If no %s found then just print normally */ - if (!p[i]) - break; - - /* Copy up to the %s, and print that */ - strncpy(iter->fmt, p, i); - iter->fmt[i] = '\0'; - trace_seq_vprintf(&iter->seq, iter->fmt, ap); + event = container_of(trace_event, struct trace_event_call, event); + if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) + return false; - /* Add delta to %pS pointers */ - if (p[i+1] == 'p') { - unsigned long addr; - char fmt[4]; + head = trace_get_fields(event); + if (!head) { + trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", + trace_event_name(event)); + return true; + } - fmt[0] = '%'; - fmt[1] = 'p'; - fmt[2] = p[i+2]; /* Either %ps or %pS */ - fmt[3] = '\0'; + /* Offsets are from the iter->ent that points to the raw event */ + ptr = iter->ent; - addr = va_arg(ap, unsigned long); - addr += text_delta; - trace_seq_printf(&iter->seq, fmt, (void *)addr); + list_for_each_entry(field, head, link) { + const char *str; + bool good; - p += i + 3; + if (!field->needs_test) continue; - } - - /* - * If iter->seq is full, the above call no longer guarantees - * that ap is in sync with fmt processing, and further calls - * to va_arg() can return wrong positional arguments. - * - * Ensure that ap is no longer used in this case. - */ - if (iter->seq.full) { - p = ""; - break; - } - if (star) - len = va_arg(ap, int); + str = *(const char **)(ptr + field->offset); - /* The ap now points to the string data of the %s */ - str = va_arg(ap, const char *); - - good = trace_safe_str(iter, str, star, len); - - /* Could be from the last boot */ - if (data_delta && !good) { - str += data_delta; - good = trace_safe_str(iter, str, star, len); - } + good = trace_safe_str(iter, str); /* * If you hit this warning, it is likely that the @@ -3846,44 +3729,14 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, * instead. See samples/trace_events/trace-events-sample.h * for reference. */ - if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'", - fmt, seq_buf_str(&iter->seq.seq))) { - int ret; - - /* Try to safely read the string */ - if (star) { - if (len + 1 > iter->fmt_size) - len = iter->fmt_size - 1; - if (len < 0) - len = 0; - ret = copy_from_kernel_nofault(iter->fmt, str, len); - iter->fmt[len] = 0; - star = false; - } else { - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); - } - if (ret < 0) - trace_seq_printf(&iter->seq, "(0x%px)", str); - else - trace_seq_printf(&iter->seq, "(0x%px:%s)", - str, iter->fmt); - str = "[UNSAFE-MEMORY]"; - strcpy(iter->fmt, "%s"); - } else { - strncpy(iter->fmt, p + i, j + 1); - iter->fmt[j+1] = '\0'; + if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", + trace_event_name(event), field->name)) { + trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", + trace_event_name(event), field->name); + return true; } - if (star) - trace_seq_printf(&iter->seq, iter->fmt, len, str); - else - trace_seq_printf(&iter->seq, iter->fmt, str); - - p += i + j + 1; } - print: - if (*p) - trace_seq_vprintf(&iter->seq, p, ap); + return false; } const char *trace_event_format(struct trace_iterator *iter, const char *fmt) @@ -4353,6 +4206,15 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) if (event) { if (tr->trace_flags & TRACE_ITER_FIELDS) return print_event_fields(iter, event); + /* + * For TRACE_EVENT() events, the print_fmt is not + * safe to use if the array has delta offsets + * Force printing via the fields. + */ + if ((tr->text_delta || tr->data_delta) && + event->type > __TRACE_LAST_TYPE) + return print_event_fields(iter, event); + return event->funcs->trace(iter, sym_flags, event); } @@ -5225,6 +5087,9 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, cpumask_var_t tracing_cpumask_new; int err; + if (count == 0 || count > KMALLOC_MAX_SIZE) + return -EINVAL; + if (!zalloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -10777,8 +10642,6 @@ __init static int tracer_alloc_buffers(void) register_snapshot_cmd(); - test_can_verify(); - return 0; out_free_pipe_cpumask: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 266740b4e121..9691b47b5f3d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -667,9 +667,8 @@ void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, bool trace_is_tracepoint_string(const char *str); const char *trace_event_format(struct trace_iterator *iter, const char *fmt); -void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, - va_list ap) __printf(2, 0); char *trace_iter_expand_format(struct trace_iterator *iter); +bool ignore_event(struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); @@ -1413,7 +1412,8 @@ struct ftrace_event_field { int filter_type; int offset; int size; - int is_signed; + unsigned int is_signed:1; + unsigned int needs_test:1; int len; }; diff --git a/kernel/trace/trace_eprobe.c b/kernel/trace/trace_eprobe.c index ebda68ee9abf..be8be0c1aaf0 100644 --- a/kernel/trace/trace_eprobe.c +++ b/kernel/trace/trace_eprobe.c @@ -963,6 +963,11 @@ static int __trace_eprobe_create(int argc, const char *argv[]) goto error; } ret = dyn_event_add(&ep->devent, &ep->tp.event->call); + if (ret < 0) { + trace_probe_unregister_event_call(&ep->tp); + mutex_unlock(&event_mutex); + goto error; + } mutex_unlock(&event_mutex); return ret; parse_error: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 77e68efbd43e..770e7ed91716 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -82,7 +82,7 @@ static int system_refcount_dec(struct event_subsystem *system) } static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) +__find_event_field(struct list_head *head, const char *name) { struct ftrace_event_field *field; @@ -114,7 +114,8 @@ trace_find_event_field(struct trace_event_call *call, char *name) static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, - int is_signed, int filter_type, int len) + int is_signed, int filter_type, int len, + int need_test) { struct ftrace_event_field *field; @@ -133,6 +134,7 @@ static int __trace_define_field(struct list_head *head, const char *type, field->offset = offset; field->size = size; field->is_signed = is_signed; + field->needs_test = need_test; field->len = len; list_add(&field->link, head); @@ -151,13 +153,13 @@ int trace_define_field(struct trace_event_call *call, const char *type, head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, 0); + is_signed, filter_type, 0, 0); } EXPORT_SYMBOL_GPL(trace_define_field); static int trace_define_field_ext(struct trace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, - int filter_type, int len) + int filter_type, int len, int need_test) { struct list_head *head; @@ -166,13 +168,13 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ head = trace_get_fields(call); return __trace_define_field(head, type, name, offset, size, - is_signed, filter_type, len); + is_signed, filter_type, len, need_test); } #define __generic_field(type, item, filter_type) \ ret = __trace_define_field(&ftrace_generic_fields, #type, \ #item, 0, 0, is_signed_type(type), \ - filter_type, 0); \ + filter_type, 0, 0); \ if (ret) \ return ret; @@ -181,7 +183,8 @@ static int trace_define_field_ext(struct trace_event_call *call, const char *typ "common_" #item, \ offsetof(typeof(ent), item), \ sizeof(ent.item), \ - is_signed_type(type), FILTER_OTHER, 0); \ + is_signed_type(type), FILTER_OTHER, \ + 0, 0); \ if (ret) \ return ret; @@ -244,19 +247,16 @@ int trace_event_get_offsets(struct trace_event_call *call) return tail->offset + tail->size; } -/* - * Check if the referenced field is an array and return true, - * as arrays are OK to dereference. - */ -static bool test_field(const char *fmt, struct trace_event_call *call) + +static struct trace_event_fields *find_event_field(const char *fmt, + struct trace_event_call *call) { struct trace_event_fields *field = call->class->fields_array; - const char *array_descriptor; const char *p = fmt; int len; if (!(len = str_has_prefix(fmt, "REC->"))) - return false; + return NULL; fmt += len; for (p = fmt; *p; p++) { if (!isalnum(*p) && *p != '_') @@ -265,16 +265,141 @@ static bool test_field(const char *fmt, struct trace_event_call *call) len = p - fmt; for (; field->type; field++) { - if (strncmp(field->name, fmt, len) || - field->name[len]) + if (strncmp(field->name, fmt, len) || field->name[len]) continue; - array_descriptor = strchr(field->type, '['); - /* This is an array and is OK to dereference. */ - return array_descriptor != NULL; + + return field; + } + return NULL; +} + +/* + * Check if the referenced field is an array and return true, + * as arrays are OK to dereference. + */ +static bool test_field(const char *fmt, struct trace_event_call *call) +{ + struct trace_event_fields *field; + + field = find_event_field(fmt, call); + if (!field) + return false; + + /* This is an array and is OK to dereference. */ + return strchr(field->type, '[') != NULL; +} + +/* Look for a string within an argument */ +static bool find_print_string(const char *arg, const char *str, const char *end) +{ + const char *r; + + r = strstr(arg, str); + return r && r < end; +} + +/* Return true if the argument pointer is safe */ +static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) +{ + const char *r, *e, *a; + + e = fmt + len; + + /* Find the REC-> in the argument */ + r = strstr(fmt, "REC->"); + if (r && r < e) { + /* + * Addresses of events on the buffer, or an array on the buffer is + * OK to dereference. There's ways to fool this, but + * this is to catch common mistakes, not malicious code. + */ + a = strchr(fmt, '&'); + if ((a && (a < r)) || test_field(r, call)) + return true; + } else if (find_print_string(fmt, "__get_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) { + return true; + } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) { + return true; + } else if (find_print_string(fmt, "__get_sockaddr(", e)) { + return true; + } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) { + return true; } return false; } +/* Return true if the string is safe */ +static bool process_string(const char *fmt, int len, struct trace_event_call *call) +{ + struct trace_event_fields *field; + const char *r, *e, *s; + + e = fmt + len; + + /* + * There are several helper functions that return strings. + * If the argument contains a function, then assume its field is valid. + * It is considered that the argument has a function if it has: + * alphanumeric or '_' before a parenthesis. + */ + s = fmt; + do { + r = strstr(s, "("); + if (!r || r >= e) + break; + for (int i = 1; r - i >= s; i++) { + char ch = *(r - i); + if (isspace(ch)) + continue; + if (isalnum(ch) || ch == '_') + return true; + /* Anything else, this isn't a function */ + break; + } + /* A function could be wrapped in parethesis, try the next one */ + s = r + 1; + } while (s < e); + + /* + * Check for arrays. If the argument has: foo[REC->val] + * then it is very likely that foo is an array of strings + * that are safe to use. + */ + r = strstr(s, "["); + if (r && r < e) { + r = strstr(r, "REC->"); + if (r && r < e) + return true; + } + + /* + * If there's any strings in the argument consider this arg OK as it + * could be: REC->field ? "foo" : "bar" and we don't want to get into + * verifying that logic here. + */ + if (find_print_string(fmt, "\"", e)) + return true; + + /* Dereferenced strings are also valid like any other pointer */ + if (process_pointer(fmt, len, call)) + return true; + + /* Make sure the field is found */ + field = find_event_field(fmt, call); + if (!field) + return false; + + /* Test this field's string before printing the event */ + call->flags |= TRACE_EVENT_FL_TEST_STR; + field->needs_test = 1; + + return true; +} + /* * Examine the print fmt of the event looking for unsafe dereference * pointers using %p* that could be recorded in the trace event and @@ -284,13 +409,14 @@ static bool test_field(const char *fmt, struct trace_event_call *call) static void test_event_printk(struct trace_event_call *call) { u64 dereference_flags = 0; + u64 string_flags = 0; bool first = true; - const char *fmt, *c, *r, *a; + const char *fmt; int parens = 0; char in_quote = 0; int start_arg = 0; int arg = 0; - int i; + int i, e; fmt = call->print_fmt; @@ -374,8 +500,16 @@ static void test_event_printk(struct trace_event_call *call) star = true; continue; } - if ((fmt[i + j] == 's') && star) - arg++; + if ((fmt[i + j] == 's')) { + if (star) + arg++; + if (WARN_ONCE(arg == 63, + "Too many args for event: %s", + trace_event_name(call))) + return; + dereference_flags |= 1ULL << arg; + string_flags |= 1ULL << arg; + } break; } break; @@ -403,42 +537,47 @@ static void test_event_printk(struct trace_event_call *call) case ',': if (in_quote || parens) continue; + e = i; i++; while (isspace(fmt[i])) i++; - start_arg = i; - if (!(dereference_flags & (1ULL << arg))) - goto next_arg; - /* Find the REC-> in the argument */ - c = strchr(fmt + i, ','); - r = strstr(fmt + i, "REC->"); - if (r && (!c || r < c)) { - /* - * Addresses of events on the buffer, - * or an array on the buffer is - * OK to dereference. - * There's ways to fool this, but - * this is to catch common mistakes, - * not malicious code. - */ - a = strchr(fmt + i, '&'); - if ((a && (a < r)) || test_field(r, call)) + /* + * If start_arg is zero, then this is the start of the + * first argument. The processing of the argument happens + * when the end of the argument is found, as it needs to + * handle paranthesis and such. + */ + if (!start_arg) { + start_arg = i; + /* Balance out the i++ in the for loop */ + i--; + continue; + } + + if (dereference_flags & (1ULL << arg)) { + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, e - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, e - start_arg, call)) dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); - } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && - (!c || r < c)) { - dereference_flags &= ~(1ULL << arg); } - next_arg: - i--; + start_arg = i; arg++; + /* Balance out the i++ in the for loop */ + i--; } } + if (dereference_flags & (1ULL << arg)) { + if (string_flags & (1ULL << arg)) { + if (process_string(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } else if (process_pointer(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } + /* * If you triggered the below warning, the trace event reported * uses an unsafe dereference pointer %p*. As the data stored @@ -2471,7 +2610,7 @@ event_define_fields(struct trace_event_call *call) ret = trace_define_field_ext(call, field->type, field->name, offset, field->size, field->is_signed, field->filter_type, - field->len); + field->len, field->needs_test); if (WARN_ON_ONCE(ret)) { pr_err("error code is %d\n", ret); break; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 74c353164ca1..d358c9935164 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -176,7 +176,8 @@ static void function_trace_start(struct trace_array *tr) tracing_reset_online_cpus(&tr->array_buffer); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* fregs are guaranteed not to be NULL if HAVE_DYNAMIC_FTRACE_WITH_ARGS is set */ +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) && defined(CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS) static __always_inline unsigned long function_get_true_parent_ip(unsigned long parent_ip, struct ftrace_regs *fregs) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 263fac44d3ca..935a886af40c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -725,7 +725,7 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, static struct notifier_block trace_kprobe_module_nb = { .notifier_call = trace_kprobe_module_callback, - .priority = 1 /* Invoked after kprobe module callback */ + .priority = 2 /* Invoked after kprobe and jump_label module callback */ }; static int trace_kprobe_register_module_notifier(void) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index da748b7cbc4d..03d56f711ad1 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep); void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) { + struct trace_seq *s = &iter->seq; va_list ap; + if (ignore_event(iter)) + return; + va_start(ap, fmt); - trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); + trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); va_end(ap); } EXPORT_SYMBOL(trace_event_printf); diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 5c03633316a6..0c42b15c3800 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -10,11 +10,42 @@ #include <linux/module.h> #include <linux/ftrace.h> #include <linux/kprobes.h> +#include <linux/hardirq.h> #include "trace.h" #define CREATE_TRACE_POINTS #include <trace/events/preemptirq.h> +/* + * Use regular trace points on architectures that implement noinstr + * tooling: these calls will only happen with RCU enabled, which can + * use a regular tracepoint. + * + * On older architectures, RCU may not be watching in idle. In that + * case, wake up RCU to watch while calling the tracepoint. These + * aren't NMI-safe - so exclude NMI contexts: + */ +#ifdef CONFIG_ARCH_WANTS_NO_INSTR +#define trace(point, args) trace_##point(args) +#else +#define trace(point, args) \ + do { \ + if (trace_##point##_enabled()) { \ + bool exit_rcu = false; \ + if (in_nmi()) \ + break; \ + if (!IS_ENABLED(CONFIG_TINY_RCU) && \ + is_idle_task(current)) { \ + ct_irq_enter(); \ + exit_rcu = true; \ + } \ + trace_##point(args); \ + if (exit_rcu) \ + ct_irq_exit(); \ + } \ + } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS /* Per-cpu variable to prevent redundant calls when IRQs already off */ static DEFINE_PER_CPU(int, tracing_irq_cpu); @@ -28,7 +59,7 @@ static DEFINE_PER_CPU(int, tracing_irq_cpu); void trace_hardirqs_on_prepare(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -39,7 +70,7 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_prepare); void trace_hardirqs_on(void) { if (this_cpu_read(tracing_irq_cpu)) { - trace_irq_enable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_enable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); tracer_hardirqs_on(CALLER_ADDR0, CALLER_ADDR1); this_cpu_write(tracing_irq_cpu, 0); } @@ -61,7 +92,7 @@ void trace_hardirqs_off_finish(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); } } @@ -75,7 +106,7 @@ void trace_hardirqs_off(void) if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, CALLER_ADDR1); - trace_irq_disable(CALLER_ADDR0, CALLER_ADDR1); + trace(irq_disable, TP_ARGS(CALLER_ADDR0, CALLER_ADDR1)); } } EXPORT_SYMBOL(trace_hardirqs_off); @@ -86,13 +117,13 @@ NOKPROBE_SYMBOL(trace_hardirqs_off); void trace_preempt_on(unsigned long a0, unsigned long a1) { - trace_preempt_enable(a0, a1); + trace(preempt_enable, TP_ARGS(a0, a1)); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { - trace_preempt_disable(a0, a1); + trace(preempt_disable, TP_ARGS(a0, a1)); tracer_preempt_off(a0, a1); } #endif diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index fed382b7881b..4875e7f5de3d 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1402,9 +1402,13 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, #ifdef CONFIG_BPF_EVENTS if (bpf_prog_array_valid(call)) { + const struct bpf_prog_array *array; u32 ret; - ret = bpf_prog_run_array_uprobe(call->prog_array, regs, bpf_prog_run); + rcu_read_lock_trace(); + array = rcu_dereference_check(call->prog_array, rcu_read_lock_trace_held()); + ret = bpf_prog_run_array_uprobe(array, regs, bpf_prog_run); + rcu_read_unlock_trace(); if (!ret) return; } diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 3a56e7c8aa4f..1921ade45be3 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -845,15 +845,11 @@ int tracing_map_init(struct tracing_map *map) static int cmp_entries_dup(const void *A, const void *B) { const struct tracing_map_sort_entry *a, *b; - int ret = 0; a = *(const struct tracing_map_sort_entry **)A; b = *(const struct tracing_map_sort_entry **)B; - if (memcmp(a->key, b->key, a->elt->map->key_size)) - ret = 1; - - return ret; + return memcmp(a->key, b->key, a->elt->map->key_size); } static int cmp_entries_sum(const void *A, const void *B) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8b07576814a5..f7d8fc204579 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3680,23 +3680,27 @@ void workqueue_softirq_dead(unsigned int cpu) * check_flush_dependency - check for flush dependency sanity * @target_wq: workqueue being flushed * @target_work: work item being flushed (NULL for workqueue flushes) + * @from_cancel: are we called from the work cancel path * * %current is trying to flush the whole @target_wq or @target_work on it. - * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not - * reclaiming memory or running on a workqueue which doesn't have - * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to - * a deadlock. + * If this is not the cancel path (which implies work being flushed is either + * already running, or will not be at all), check if @target_wq doesn't have + * %WQ_MEM_RECLAIM and verify that %current is not reclaiming memory or running + * on a workqueue which doesn't have %WQ_MEM_RECLAIM as that can break forward- + * progress guarantee leading to a deadlock. */ static void check_flush_dependency(struct workqueue_struct *target_wq, - struct work_struct *target_work) + struct work_struct *target_work, + bool from_cancel) { - work_func_t target_func = target_work ? target_work->func : NULL; + work_func_t target_func; struct worker *worker; - if (target_wq->flags & WQ_MEM_RECLAIM) + if (from_cancel || target_wq->flags & WQ_MEM_RECLAIM) return; worker = current_wq_worker(); + target_func = target_work ? target_work->func : NULL; WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%ps", @@ -3980,7 +3984,7 @@ void __flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } - check_flush_dependency(wq, NULL); + check_flush_dependency(wq, NULL, false); mutex_unlock(&wq->mutex); @@ -4155,7 +4159,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr, } wq = pwq->wq; - check_flush_dependency(wq, work); + check_flush_dependency(wq, work, from_cancel); insert_wq_barrier(pwq, barr, work, worker); raw_spin_unlock_irq(&pool->lock); @@ -5641,6 +5645,7 @@ static void wq_adjust_max_active(struct workqueue_struct *wq) } while (activated); } +__printf(1, 0) static struct workqueue_struct *__alloc_workqueue(const char *fmt, unsigned int flags, int max_active, va_list args) |