diff options
Diffstat (limited to 'kernel')
39 files changed, 680 insertions, 455 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3f4c99e06c6b..b0799bced518 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -28,11 +28,17 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) attr->value_size == 0) return ERR_PTR(-EINVAL); + if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return ERR_PTR(-E2BIG); + elem_size = round_up(attr->value_size, 8); /* check round_up into zero and u32 overflow */ if (elem_size == 0 || - attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + attr->max_entries > (U32_MAX - PAGE_SIZE - sizeof(*array)) / elem_size) return ERR_PTR(-ENOMEM); array_size = sizeof(*array) + attr->max_entries * elem_size; @@ -105,7 +111,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, array->elem_size); + memcpy(array->value + array->elem_size * index, value, map->value_size); return 0; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 19909b22b4f8..34777b3746fa 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -64,12 +64,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - err = -ENOMEM; + if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + goto free_htab; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8) + + htab->map.value_size; + /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) goto free_htab; + if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + (u64) htab->elem_size * htab->map.max_entries >= + U32_MAX - PAGE_SIZE) + /* make sure page count doesn't overflow */ + goto free_htab; + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->elem_size * htab->map.max_entries, + PAGE_SIZE) >> PAGE_SHIFT; + + err = -ENOMEM; htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), GFP_USER | __GFP_NOWARN); @@ -85,13 +108,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) raw_spin_lock_init(&htab->lock); htab->count = 0; - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; - - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; return &htab->map; free_htab: @@ -222,7 +238,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, WARN_ON_ONCE(!rcu_read_lock_held()); /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC); + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); if (!l_new) return -ENOMEM; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be6d726e31c9..5a8a797d50b7 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); break; case BPF_TYPE_MAP: - atomic_inc(&((struct bpf_map *)raw)->refcnt); + bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -51,7 +51,7 @@ static void bpf_any_put(void *raw, enum bpf_type type) bpf_prog_put(raw); break; case BPF_TYPE_MAP: - bpf_map_put(raw); + bpf_map_put_with_uref(raw); break; default: WARN_ON_ONCE(1); @@ -64,7 +64,7 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) void *raw; *type = BPF_TYPE_MAP; - raw = bpf_map_get(ufd); + raw = bpf_map_get_with_uref(ufd); if (IS_ERR(raw)) { *type = BPF_TYPE_PROG; raw = bpf_prog_get(ufd); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d3313d02a7e..3b39550d8485 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -82,6 +82,14 @@ static void bpf_map_free_deferred(struct work_struct *work) map->ops->map_free(map); } +static void bpf_map_put_uref(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->usercnt)) { + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + bpf_fd_array_map_clear(map); + } +} + /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ @@ -93,17 +101,15 @@ void bpf_map_put(struct bpf_map *map) } } -static int bpf_map_release(struct inode *inode, struct file *filp) +void bpf_map_put_with_uref(struct bpf_map *map) { - struct bpf_map *map = filp->private_data; - - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - /* prog_array stores refcnt-ed bpf_prog pointers - * release them all when user space closes prog_array_fd - */ - bpf_fd_array_map_clear(map); - + bpf_map_put_uref(map); bpf_map_put(map); +} + +static int bpf_map_release(struct inode *inode, struct file *filp) +{ + bpf_map_put_with_uref(filp->private_data); return 0; } @@ -142,6 +148,7 @@ static int map_create(union bpf_attr *attr) return PTR_ERR(map); atomic_set(&map->refcnt, 1); + atomic_set(&map->usercnt, 1); err = bpf_map_charge_memlock(map); if (err) @@ -174,7 +181,14 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -struct bpf_map *bpf_map_get(u32 ufd) +void bpf_map_inc(struct bpf_map *map, bool uref) +{ + atomic_inc(&map->refcnt); + if (uref) + atomic_inc(&map->usercnt); +} + +struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); struct bpf_map *map; @@ -183,7 +197,7 @@ struct bpf_map *bpf_map_get(u32 ufd) if (IS_ERR(map)) return map; - atomic_inc(&map->refcnt); + bpf_map_inc(map, true); fdput(f); return map; @@ -226,7 +240,7 @@ static int map_lookup_elem(union bpf_attr *attr) goto free_key; err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; @@ -285,7 +299,7 @@ static int map_update_elem(union bpf_attr *attr) goto free_key; err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c6073056badf..a7945d10b378 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2021,8 +2021,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - atomic_inc(&map->refcnt); - + bpf_map_inc(map, false); fdput(f); next_insn: insn++; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index f1603c153890..470f6536b9e8 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -98,6 +98,12 @@ static DEFINE_SPINLOCK(css_set_lock); static DEFINE_SPINLOCK(cgroup_idr_lock); /* + * Protects cgroup_file->kn for !self csses. It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + +/* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ @@ -754,9 +760,11 @@ static void put_css_set_locked(struct css_set *cset) if (!atomic_dec_and_test(&cset->refcount)) return; - /* This css_set is dead. unlink it and release cgroup refcounts */ - for_each_subsys(ss, ssid) + /* This css_set is dead. unlink it and release cgroup and css refs */ + for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); + css_put(cset->subsys[ssid]); + } hash_del(&cset->hlist); css_set_count--; @@ -1056,9 +1064,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - for_each_subsys(ss, ssid) + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cset->subsys[ssid]; + list_add_tail(&cset->e_cset_node[ssid], - &cset->subsys[ssid]->cgroup->e_csets[ssid]); + &css->cgroup->e_csets[ssid]); + css_get(css); + } spin_unlock_bh(&css_set_lock); @@ -1393,6 +1405,16 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); + + if (cft->file_offset) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = NULL; + spin_unlock_irq(&cgroup_file_kn_lock); + } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } @@ -1856,7 +1878,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); - INIT_LIST_HEAD(&cgrp->self.files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -2216,6 +2237,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the subsys currently being processed */ + int ssid; + /* * Fields for cgroup_taskset_*() iteration. * @@ -2278,25 +2302,29 @@ static void cgroup_taskset_add(struct task_struct *task, /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; - return cgroup_taskset_next(tset); + return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; @@ -2311,6 +2339,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + return task; } @@ -2346,7 +2386,8 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, /* check that we can legitimately attach to the cgroup */ for_each_e_css(css, i, dst_cgrp) { if (css->ss->can_attach) { - ret = css->ss->can_attach(css, tset); + tset->ssid = i; + ret = css->ss->can_attach(tset); if (ret) { failed_css = css; goto out_cancel_attach; @@ -2379,9 +2420,12 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) - if (css->ss->attach) - css->ss->attach(css, tset); + for_each_e_css(css, i, dst_cgrp) { + if (css->ss->attach) { + tset->ssid = i; + css->ss->attach(tset); + } + } ret = 0; goto out_release_tset; @@ -2390,8 +2434,10 @@ out_cancel_attach: for_each_e_css(css, i, dst_cgrp) { if (css == failed_css) break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, tset); + if (css->ss->cancel_attach) { + tset->ssid = i; + css->ss->cancel_attach(tset); + } } out_release_tset: spin_lock_bh(&css_set_lock); @@ -3313,9 +3359,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; - kernfs_get(kn); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; - list_add(&cfile->node, &css->files); + spin_unlock_irq(&cgroup_file_kn_lock); } return 0; @@ -3553,6 +3599,22 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } /** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +void cgroup_file_notify(struct cgroup_file *cfile) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_file_kn_lock, flags); + if (cfile->kn) + kernfs_notify(cfile->kn); + spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -4613,13 +4675,9 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - struct cgroup_file *cfile; percpu_ref_exit(&css->refcnt); - list_for_each_entry(cfile, &css->files, node) - kernfs_put(cfile->kn); - if (ss) { /* css free path */ int id = css->id; @@ -4724,7 +4782,6 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->files); css->serial_nr = css_serial_nr_next++; if (cgroup_parent(cgrp)) { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f1b30ad5dc6d..2d3df82c54f2 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -155,12 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_taskset *tset) { - struct freezer *freezer = css_freezer(new_css); struct task_struct *task; - bool clear_frozen = false; + struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); @@ -174,22 +172,21 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { freeze_task(task); - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = true; + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } } } - /* propagate FROZEN clearing upwards */ - while (clear_frozen && (freezer = parent_freezer(freezer))) { - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = freezer->state & CGROUP_FREEZING; - } - mutex_unlock(&freezer_mutex); } diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index cdd8df4e991c..b50d5a167fda 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -106,7 +106,7 @@ static void pids_uncharge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; p; p = parent_pids(p)) + for (p = pids; parent_pids(p); p = parent_pids(p)) pids_cancel(p, num); } @@ -123,7 +123,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; p; p = parent_pids(p)) + for (p = pids; parent_pids(p); p = parent_pids(p)) atomic64_add(num, &p->counter); } @@ -140,7 +140,7 @@ static int pids_try_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p, *q; - for (p = pids; p; p = parent_pids(p)) { + for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); /* @@ -162,13 +162,13 @@ revert: return -EAGAIN; } -static int pids_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int pids_can_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css, return 0; } -static void pids_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void pids_cancel_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -205,65 +205,28 @@ static void pids_cancel_attach(struct cgroup_subsys_state *css, } } +/* + * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies + * on threadgroup_change_begin() held by the copy_process(). + */ static int pids_can_fork(struct task_struct *task, void **priv_p) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; - int err; - /* - * Use the "current" task_css for the pids subsystem as the tentative - * css. It is possible we will charge the wrong hierarchy, in which - * case we will forcefully revert/reapply the charge on the right - * hierarchy after it is committed to the task proper. - */ - css = task_get_css(current, pids_cgrp_id); + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - - err = pids_try_charge(pids, 1); - if (err) - goto err_css_put; - - *priv_p = css; - return 0; - -err_css_put: - css_put(css); - return err; + return pids_try_charge(pids, 1); } static void pids_cancel_fork(struct task_struct *task, void *priv) { - struct cgroup_subsys_state *css = priv; - struct pids_cgroup *pids = css_pids(css); - - pids_uncharge(pids, 1); - css_put(css); -} - -static void pids_fork(struct task_struct *task, void *priv) -{ struct cgroup_subsys_state *css; - struct cgroup_subsys_state *old_css = priv; struct pids_cgroup *pids; - struct pids_cgroup *old_pids = css_pids(old_css); - css = task_get_css(task, pids_cgrp_id); + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - - /* - * If the association has changed, we have to revert and reapply the - * charge/uncharge on the wrong hierarchy to the current one. Since - * the association can only change due to an organisation event, its - * okay for us to ignore the limit in this case. - */ - if (pids != old_pids) { - pids_uncharge(old_pids, 1); - pids_charge(pids, 1); - } - - css_put(css); - css_put(old_css); + pids_uncharge(pids, 1); } static void pids_free(struct task_struct *task) @@ -335,6 +298,7 @@ static struct cftype pids_files[] = { { .name = "current", .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, }, { } /* terminate */ }; @@ -346,7 +310,6 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .fork = pids_fork, .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10ae73611d80..02a8ea5c9963 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1429,15 +1429,16 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_taskset *tset) { - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1447,7 +1448,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->cpus_allowed); if (ret) goto out_unlock; @@ -1467,9 +1468,14 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_cancel_attach(struct cgroup_taskset *tset) { + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1482,16 +1488,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1502,7 +1511,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1518,7 +1527,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index d659487254d5..9c418002b8c1 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/core.c b/kernel/events/core.c index 36babfd20648..cfc227ccfceb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING @@ -435,7 +435,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current); + cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ @@ -458,7 +458,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, if (!task || !ctx->nr_cgroups) return; - cgrp = perf_cgroup_from_task(task); + cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; } @@ -489,7 +489,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * we reschedule only in the presence of cgroup * constrained events. */ - rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); @@ -522,8 +521,10 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * set cgrp before ctxsw in to allow * event_filter_match() to not have to pass * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu */ - cpuctx->cgrp = perf_cgroup_from_task(task); + cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); @@ -531,8 +532,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) } } - rcu_read_unlock(); - local_irq_restore(flags); } @@ -542,17 +541,20 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* * next is NULL when called from perf_event_enable_on_exec() * that will systematically cause a cgroup_switch() */ if (next) - cgrp2 = perf_cgroup_from_task(next); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -561,6 +563,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + + rcu_read_unlock(); } static inline void perf_cgroup_sched_in(struct task_struct *prev, @@ -569,13 +573,16 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); + cgrp1 = perf_cgroup_from_task(task, NULL); /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); + cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing @@ -584,6 +591,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWIN); + + rcu_read_unlock(); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -3145,15 +3154,16 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_event_enable_on_exec(struct perf_event_context *ctx) +static void perf_event_enable_on_exec(int ctxn) { - struct perf_event_context *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_event *event; unsigned long flags; int enabled = 0; int ret; local_irq_save(flags); + ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; @@ -3196,17 +3206,11 @@ out: void perf_event_exec(void) { - struct perf_event_context *ctx; int ctxn; rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } + for_each_task_context_nr(ctxn) + perf_event_enable_on_exec(ctxn); rcu_read_unlock(); } @@ -4216,7 +4220,14 @@ retry: goto retry; } - __perf_event_period(&pe); + if (event->attr.freq) { + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } + + local64_set(&event->hw.period_left, 0); raw_spin_unlock_irq(&ctx->lock); return 0; @@ -5667,6 +5678,17 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } static void +perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, + struct perf_event_context *task_ctx) +{ + rcu_read_lock(); + preempt_disable(); + perf_event_aux_ctx(task_ctx, output, data); + preempt_enable(); + rcu_read_unlock(); +} + +static void perf_event_aux(perf_event_aux_output_cb output, void *data, struct perf_event_context *task_ctx) { @@ -5675,14 +5697,23 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, struct pmu *pmu; int ctxn; + /* + * If we have task_ctx != NULL we only notify + * the task context itself. The task_ctx is set + * only for EXIT events before releasing task + * context. + */ + if (task_ctx) { + perf_event_aux_task_ctx(output, data, task_ctx); + return; + } + rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; perf_event_aux_ctx(&cpuctx->ctx, output, data); - if (task_ctx) - goto next; ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto next; @@ -5692,12 +5723,6 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, next: put_cpu_ptr(pmu->pmu_cpu_context); } - - if (task_ctx) { - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); - preempt_enable(); - } rcu_read_unlock(); } @@ -6463,9 +6488,6 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; - - /* Keeps track of cpu being initialized/exited */ - bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -6723,14 +6745,8 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (!head) { - /* - * We can race with cpu hotplug code. Do not - * WARN if the cpu just got unplugged. - */ - WARN_ON_ONCE(swhash->online); + if (WARN_ON_ONCE(!head)) return -EINVAL; - } hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); @@ -6798,7 +6814,6 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; @@ -8787,10 +8802,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) struct perf_event_context *child_ctx, *clone_ctx = NULL; unsigned long flags; - if (likely(!child->perf_event_ctxp[ctxn])) { - perf_event_task(child, NULL, 0); + if (likely(!child->perf_event_ctxp[ctxn])) return; - } local_irq_save(flags); /* @@ -8874,6 +8887,14 @@ void perf_event_exit_task(struct task_struct *child) for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); + + /* + * The perf_event_exit_task_context calls perf_event_task + * with child's task_ctx, which generates EXIT events for + * child contexts and sets child->perf_event_ctxp[] to NULL. + * At this point we need to send EXIT events to cpu contexts. + */ + perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, @@ -9255,7 +9276,6 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - swhash->online = true; if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -9297,14 +9317,7 @@ static void perf_event_exit_cpu_context(int cpu) static void perf_event_exit_cpu(int cpu) { - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - perf_event_exit_cpu_context(cpu); - - mutex_lock(&swhash->hlist_mutex); - swhash->online = false; - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); } #else static inline void perf_event_exit_cpu(int cpu) { } @@ -9452,16 +9465,18 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; + rcu_read_lock(); perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + rcu_read_unlock(); return 0; } -static void perf_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b5d1ea79c595..adfdc0536117 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e5e9798aa0c..7dad84913abf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> diff --git a/kernel/fork.c b/kernel/fork.c index f97f2c449f5c..1155eac61687 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -380,6 +380,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) #endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; + tsk->wake_q.next = NULL; account_kernel_stack(ti, 1); @@ -1368,8 +1369,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - if (clone_flags & CLONE_THREAD) - threadgroup_change_begin(current); + threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1610,8 +1610,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, proc_fork_connector(p); cgroup_post_fork(p, cgrp_ss_priv); - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -1652,8 +1651,7 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0eebaeef317b..6ead200370da 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1434,6 +1434,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* @@ -1447,7 +1448,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); - + chip_bus_sync_unlock(desc); return NULL; } @@ -1475,6 +1476,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); @@ -1553,9 +1555,7 @@ void free_irq(unsigned int irq, void *dev_id) desc->affinity_notify = NULL; #endif - chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); - chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(free_irq); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index cbf9fb899d92..bcf107ce0854 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. diff --git a/kernel/jump_label.c b/kernel/jump_label.c index f7dd15d537f9..05254eeb4b4e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,7 +2,7 @@ * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> - * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b26464061..152da4a48867 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -20,7 +20,7 @@ #include <linux/capability.h> #include <linux/compiler.h> -#include <linux/rcupdate.h> /* rcu_expedited */ +#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +#ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", rcu_expedited); + return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, } KERNEL_ATTR_RW(rcu_expedited); +int rcu_normal; +static ssize_t rcu_normal_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); +} +static ssize_t rcu_normal_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_normal)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_normal); +#endif /* #ifndef CONFIG_TINY_RCU */ + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif +#ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, +#endif NULL }; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index deae3907ac1e..60ace56618f6 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index d83d798bef95..dbb61a302548 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d092a0c9c2d4..05a37857ab55 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -93,10 +93,12 @@ bool osq_lock(struct optimistic_spin_queue *lock) node->cpu = curr; /* - * ACQUIRE semantics, pairs with corresponding RELEASE - * in unlock() uncontended, or fastpath. + * We need both ACQUIRE (pairs with corresponding RELEASE in + * unlock() uncontended, or fastpath) and RELEASE (to publish + * the node fields we just initialised) semantics when updating + * the lock tail. */ - old = atomic_xchg_acquire(&lock->tail, curr); + old = atomic_xchg(&lock->tail, curr); if (old == OSQ_UNLOCKED_VAL) return true; diff --git a/kernel/module.c b/kernel/module.c index 8f051a106676..38c7bd5583ff 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3571,6 +3571,12 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_sched(); mutex_unlock(&module_mutex); free_module: + /* + * Ftrace needs to clean up what it initialized. + * This does nothing if ftrace_module_init() wasn't called, + * but it must be called outside of module_mutex. + */ + ftrace_release_mod(mod); /* Free lock-classes; relies on the preceding sync_rcu() */ lockdep_free_key_range(mod->module_core, mod->core_size); diff --git a/kernel/pid.c b/kernel/pid.c index ca368793808e..78b3d9f80d44 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -467,7 +467,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_lock(); if (type != PIDTYPE_PID) task = task->group_leader; - pid = get_pid(task->pids[type].pid); + pid = get_pid(rcu_dereference(task->pids[type].pid)); rcu_read_unlock(); return pid; } @@ -528,7 +528,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; - nr = pid_nr_ns(task->pids[type].pid, ns); + nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e260df..d2988d047d66 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -162,6 +162,27 @@ static int rcu_torture_writer_state; #define RTWS_SYNC 7 #define RTWS_STUTTER 8 #define RTWS_STOPPING 9 +static const char * const rcu_torture_writer_state_names[] = { + "RTWS_FIXED_DELAY", + "RTWS_DELAY", + "RTWS_REPLACE", + "RTWS_DEF_FREE", + "RTWS_EXP_SYNC", + "RTWS_COND_GET", + "RTWS_COND_SYNC", + "RTWS_SYNC", + "RTWS_STUTTER", + "RTWS_STOPPING", +}; + +static const char *rcu_torture_writer_state_getname(void) +{ + unsigned int i = READ_ONCE(rcu_torture_writer_state); + + if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) + return "???"; + return rcu_torture_writer_state_names[i]; +} #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea5a41b..9b9cdd549caa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_gp_is_expedited() + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..e41dd4131f7a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); /* Data structures. */ -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -246,24 +242,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); - } + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -300,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -340,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -376,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -605,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) * The caller must have disabled interrupts to prevent races with * normal callback registry. */ -static int +static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { int i; if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ + return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ + return true; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ + return false; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ + return true; /* Yes, CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ + return true; /* Yes, CBs for future grace period. */ + return false; /* No grace period needed. */ } /* @@ -740,7 +732,7 @@ void rcu_user_enter(void) * * Exit from an interrupt handler, which might possibly result in entering * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your @@ -753,11 +745,10 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; @@ -768,6 +759,17 @@ void rcu_irq_exit(void) else rcu_eqs_enter_common(oldval, true); rcu_sysidle_enter(1); +} + +/* + * Wrapper for rcu_irq_exit() where interrupts are enabled. + */ +void rcu_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_exit(); local_irq_restore(flags); } @@ -865,7 +867,7 @@ void rcu_user_exit(void) * * Enter an interrupt handler, which might possibly result in exiting * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * Note that the Linux kernel is fully capable of entering an interrupt * handler that it never exits, for example when doing upcalls to @@ -881,11 +883,10 @@ void rcu_user_exit(void) */ void rcu_irq_enter(void) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; @@ -896,6 +897,17 @@ void rcu_irq_enter(void) else rcu_eqs_exit_common(oldval, true); rcu_sysidle_exit(1); +} + +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. + */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_enter(); local_irq_restore(flags); } @@ -1187,6 +1199,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + +/* * Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) @@ -1196,12 +1218,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + if (j - gpa > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, - rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_flags, + gp_state_getname(rsp->gp_state), rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + if (rsp->gp_kthread) + sched_show_task(rsp->gp_kthread); + } } /* @@ -1214,7 +1240,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) @@ -1237,7 +1263,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -1256,7 +1282,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -1327,7 +1353,7 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); @@ -1534,10 +1560,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) { - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); - } + if (rnp != rnp_root) + raw_spin_lock_rcu_node(rnp_root); /* * Get a new grace-period number. If there really is no grace @@ -1786,11 +1810,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && rdp->completed == READ_ONCE(rnp->completed) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - smp_mb__after_unlock_lock(); needwake = __note_gp_changes(rsp, rnp, rdp); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) @@ -1805,21 +1828,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) } /* - * Initialize a new grace period. Return 0 if no grace period required. + * Initialize a new grace period. Return false if no grace period required. */ -static int rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(struct rcu_state *rsp) { unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1829,7 +1851,7 @@ static int rcu_gp_init(struct rcu_state *rsp) * Not supposed to be able to happen. */ raw_spin_unlock_irq(&rnp->lock); - return 0; + return false; } /* Advance to a new grace period and initialize state. */ @@ -1847,8 +1869,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ @@ -1904,8 +1925,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1923,7 +1943,7 @@ static int rcu_gp_init(struct rcu_state *rsp) WRITE_ONCE(rsp->gp_activity, jiffies); } - return 1; + return true; } /* @@ -1973,8 +1993,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); raw_spin_unlock_irq(&rnp->lock); @@ -1993,8 +2012,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -2019,8 +2037,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * grace period is recorded in any of the rcu_node structures. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); @@ -2035,8 +2052,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ @@ -2284,8 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); oldmask = rnp_c->qsmask; } @@ -2332,8 +2347,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } @@ -2355,8 +2369,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || @@ -2582,8 +2595,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (!rnp) break; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); /* GP memory ordering. */ + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { @@ -2611,8 +2623,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rnp->qsmaskinitnext &= ~mask; raw_spin_unlock_irqrestore(&rnp->lock, flags); } @@ -2809,8 +2820,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_for_each_leaf_node(rsp, rnp) { cond_resched_rcu_qs(); mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2881,8 +2891,7 @@ static void force_quiescent_state(struct rcu_state *rsp) /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; @@ -2914,7 +2923,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* Does this CPU require a not-yet-started grace period? */ local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); if (needwake) @@ -3005,8 +3014,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (!rcu_gp_in_progress(rsp)) { struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); raw_spin_unlock(&rnp_root->lock); if (needwake) @@ -3365,7 +3373,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ s = (READ_ONCE(*sp) + 3) & ~0x1; smp_mb(); /* Above access must not bleed into critical section. */ return s; @@ -3392,6 +3399,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + smp_mb(); /* Caller's modifications seen first by other CPUs. */ return rcu_seq_snap(&rsp->expedited_sequence); } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) @@ -3426,8 +3434,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * CPUs for the current rcu_node structure up the rcu_node tree. */ rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; /* No new CPUs, nothing to do. */ @@ -3447,8 +3454,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rnp_up = rnp->parent; done = false; while (rnp_up) { - raw_spin_lock_irqsave(&rnp_up->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; @@ -3472,8 +3478,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) sync_exp_reset_tree_hotplug(rsp); rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; raw_spin_unlock_irqrestore(&rnp->lock, flags); @@ -3531,8 +3536,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } @@ -3549,8 +3553,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); __rcu_report_exp_rnp(rsp, rnp, wake, flags); } @@ -3564,8 +3567,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -3609,7 +3611,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { - struct rcu_data *rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3623,7 +3625,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { if (sync_exp_work_done(rsp, rnp0, NULL, - &rsp->expedited_workdone0, s)) + &rdp->expedited_workdone0, s)) return NULL; return rnp0; } @@ -3637,14 +3639,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone2, s)) + &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) @@ -3654,7 +3655,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone3, s)) + &rdp->expedited_workdone3, s)) return NULL; return rnp1; } @@ -3708,8 +3709,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; @@ -3741,24 +3741,22 @@ retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; - } else { - /* Failed, raced with offline. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, - flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave(&rnp->lock, - flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + schedule_timeout_uninterruptible(1); + if (cpu_online(cpu) && + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3773,6 +3771,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; + int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; @@ -3785,7 +3784,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); - if (ret > 0) + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ @@ -3795,14 +3794,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); + ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - (void)rcu_print_task_exp_stall(rnp); + ndetected = rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; if (!(rnp->expmask & mask)) continue; + ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[cpu_online(cpu)], @@ -3811,8 +3812,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } mask <<= 1; } - pr_cont(" } %lu jiffies s: %lu\n", - jiffies - jiffies_start, rsp->expedited_sequence); + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (!ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } rcu_for_each_leaf_node(rsp, rnp) { mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { @@ -3847,6 +3863,16 @@ void synchronize_sched_expedited(void) struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); @@ -4135,7 +4161,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (rnp == NULL) return; - raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ } @@ -4152,7 +4178,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); @@ -4179,7 +4205,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -4198,8 +4224,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinitnext |= mask; rnp->expmaskinitnext |= mask; if (!rdp->beenonline) @@ -4327,14 +4352,14 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rsp->gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } - wake_up_process(t); raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up_process(t); } rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); @@ -4385,12 +4410,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) +static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4576,8 +4603,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state); + rcu_init_one(&rcu_sched_state); if (dump_tree) rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e238d4dc..83360b4f4352 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -178,6 +178,8 @@ struct rcu_node { /* beginning of each expedited GP. */ unsigned long expmaskinitnext; /* Online CPUs for next expedited GP. */ + /* Any CPU that has ever been online will */ + /* have its bit set. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -384,6 +386,10 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + atomic_long_t expedited_workdone0; /* # done by others #0. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -498,10 +504,6 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ wait_queue_head_t expedited_wq; /* Wait for check-ins. */ @@ -545,6 +547,18 @@ struct rcu_state { #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#ifndef RCU_TREE_NONCORE +static const char * const gp_state_names[] = { + "RCU_GP_IDLE", + "RCU_GP_WAIT_GPS", + "RCU_GP_DONE_GPS", + "RCU_GP_WAIT_FQS", + "RCU_GP_DOING_FQS", + "RCU_GP_CLEANUP", + "RCU_GP_CLEANED", +}; +#endif /* #ifndef RCU_TREE_NONCORE */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ @@ -664,3 +678,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #else /* #ifdef CONFIG_PPC */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_PPC */ + +/* + * Wrappers for the rcu_node::lock acquire. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + */ +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&(rnp)->lock, flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) +{ + bool locked = raw_spin_trylock(&rnp->lock); + + if (locked) + smp_mb__after_unlock_lock(); + return locked; +} diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..9467a8b7e756 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ /* * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. + * messages about anything out of the ordinary. */ static void __init rcu_bootup_announce_oddness(void) { @@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. */ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -286,12 +284,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * Remove this task from the list it blocked on. The task - * now remains queued on the rcu_node corresponding to - * the CPU it first blocked on, so the first attempt to - * acquire the task's rcu_node's ->lock will succeed. - * Keep the loop and add a WARN_ON() out of sheer paranoia. + * now remains queued on the rcu_node corresponding to the + * CPU it first blocked on, so there is no longer any need + * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - if (rnp == t->rcu_blocked_node) - break; - WARN_ON_ONCE(1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } + rnp = t->rcu_blocked_node; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + WARN_ON_ONCE(rnp != t->rcu_blocked_node); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -527,7 +516,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; @@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void) struct rcu_state *rsp = rcu_state_p; unsigned long s; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + s = rcu_exp_gp_seq_snap(rsp); rnp_unlock = exp_funnel_lock(rsp, s); @@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(rcu_state_p, rcu_data_p); + rcu_init_one(rcu_state_p); } /* @@ -989,8 +984,7 @@ static int rcu_boost(struct rcu_node *rnp) READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* * Recheck under the lock: all tasks in need of boosting @@ -1176,8 +1170,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = kthread_prio; @@ -1524,7 +1517,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1538,10 +1532,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - /* * If a non-lazy callback arrived at a CPU having only lazy * callbacks, invoke RCU core for the side-effect of recalculating @@ -1567,8 +1557,7 @@ static void rcu_prepare_for_idle(void) if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ if (needwake) @@ -2068,8 +2057,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); raw_spin_unlock_irqrestore(&rnp->lock, flags); if (needwake) diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093cc9b5c..1088e64f01ad 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update tracing for classic implementation + * Read-Copy Update tracing for hierarchical implementation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 + * Author: Paul E. McKenney * * Papers: http://www.rdrop.com/users/paulmck/RCU * @@ -33,9 +34,7 @@ #include <linux/sched.h> #include <linux/atomic.h> #include <linux/bitops.h> -#include <linux/module.h> #include <linux/completion.h> -#include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = { static int show_rcuexp(struct seq_file *m, void *v) { + int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; - + struct rcu_data *rdp; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->expedited_workdone0); + s1 += atomic_long_read(&rdp->expedited_workdone1); + s2 += atomic_long_read(&rdp->expedited_workdone2); + s3 += atomic_long_read(&rdp->expedited_workdone3); + } seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_workdone0), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_workdone3), + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); @@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) unsigned long gpmax; struct rcu_node *rnp = &rsp->node[0]; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); completed = READ_ONCE(rsp->completed); gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) @@ -487,16 +492,4 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutree_trace_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a40f0..76b94e19430b 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate"); #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); +module_param(rcu_normal, int, 0); +static int rcu_normal_after_boot; +module_param(rcu_normal_after_boot, int, 0); +#endif /* #ifndef CONFIG_TINY_RCU */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -113,6 +118,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); #ifndef CONFIG_TINY_RCU +/* + * Should expedited grace-period primitives always fall back to their + * non-expedited counterparts? Intended for use within RCU. Note + * that if the user specifies both rcu_expedited and rcu_normal, then + * rcu_normal wins. + */ +bool rcu_gp_is_normal(void) +{ + return READ_ONCE(rcu_normal); +} + static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); @@ -157,8 +173,6 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); -#endif /* #ifndef CONFIG_TINY_RCU */ - /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -166,8 +180,12 @@ void rcu_end_inkernel_boot(void) { if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) rcu_unexpedite_gp(); + if (rcu_normal_after_boot) + WRITE_ONCE(rcu_normal, 1); } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PREEMPT_RCU /* diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c0a205101c23..caf4041f5b0a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,7 +1,7 @@ /* * sched_clock for unstable cpu clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 91db75018652..34cb9f7fc2d2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3194,7 +3194,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3213,13 +3212,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -8326,12 +8328,12 @@ static void cpu_cgroup_fork(struct task_struct *task, void *private) sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8344,12 +8346,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f04fda8f669c..cfdc0e61066c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -17,7 +17,7 @@ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ #include <linux/latencytop.h> @@ -2689,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) int decayed, removed = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed = 1; diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index f10bd873e684..f15d6b6a538a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,43 +581,43 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { io_schedule(); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); - if (signal_pending(current)) + if (signal_pending_state(mode, current)) return -EINTR; return 0; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20e1ef1..a3bbaee77c58 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -531,7 +531,7 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE +#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { @@ -631,4 +631,4 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, return ret ?: done.ret; } -#endif /* CONFIG_STOP_MACHINE */ +#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 75f1d05ea82d..9c6045a27ba3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1887,12 +1887,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; - cpu_buffer->reader_page->read = 0; -} - static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -2803,8 +2797,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, event = __rb_reserve_next(cpu_buffer, &info); - if (unlikely(PTR_ERR(event) == -EAGAIN)) + if (unlikely(PTR_ERR(event) == -EAGAIN)) { + if (info.add_timestamp) + info.length -= RB_LEN_TIME_EXTEND; goto again; + } if (!event) goto out_fail; @@ -3626,7 +3623,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; - rb_reset_reader_page(cpu_buffer); + cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; @@ -3636,6 +3633,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: + /* Update the read_stamp on the first event */ + if (reader && reader->read == 0) + cpu_buffer->read_stamp = reader->page->time_stamp; + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index abfc903e741e..cc9f7a9319be 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,7 +1,7 @@ /* * trace event based perf event profiling/tracing * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6bbc5f652355..4f6ef6912e00 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -582,6 +582,12 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); + + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); + list_for_each_entry(file, &tr->events, list) { clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } @@ -1729,6 +1735,16 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); + + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, + tr, 0); } /* diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 1c2b28536feb..060df67dbdd1 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -273,6 +273,7 @@ static const char **find_next(void *v, loff_t *pos) if (*pos < last_index + start_index) return __start___tracepoint_str + (*pos - last_index); + start_index += last_index; return find_next_mod_format(start_index, v, fmt, pos); } |