summaryrefslogtreecommitdiffstats
path: root/kernel/cgroup/cpuset.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r--kernel/cgroup/cpuset.c157
1 files changed, 87 insertions, 70 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a4dd285cdf39..f321ed515f3a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -84,9 +84,19 @@ static bool have_boot_isolcpus;
static struct list_head remote_children;
/*
- * A flag to force sched domain rebuild at the end of an operation while
- * inhibiting it in the intermediate stages when set. Currently it is only
- * set in hotplug code.
+ * A flag to force sched domain rebuild at the end of an operation.
+ * It can be set in
+ * - update_partition_sd_lb()
+ * - remote_partition_check()
+ * - update_cpumasks_hier()
+ * - cpuset_update_flag()
+ * - cpuset_hotplug_update_tasks()
+ * - cpuset_handle_hotplug()
+ *
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ *
+ * Note that update_relax_domain_level() in cpuset-v1.c can still call
+ * rebuild_sched_domains_locked() directly without using this flag.
*/
static bool force_sd_rebuild;
@@ -283,6 +293,12 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
mutex_unlock(&cpuset_mutex);
}
+static inline bool cpuset_v2(void)
+{
+ return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
+ cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+}
+
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -293,7 +309,7 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
*/
static inline bool is_in_v2_mode(void)
{
- return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ return cpuset_v2() ||
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
@@ -565,12 +581,24 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/*
* We can't shrink if we won't have enough room for SCHED_DEADLINE
- * tasks.
+ * tasks. This check is not done when scheduling is disabled as the
+ * users should know what they are doing.
+ *
+ * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
+ * cpus_allowed.
+ *
+ * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
+ * for non-isolated partition root. At this point, the target
+ * effective_cpus isn't computed yet. user_xcpus() is the best
+ * approximation.
+ *
+ * TBD: May need to precompute the real effective_cpus here in case
+ * incorrect scheduling of SCHED_DEADLINE tasks in a partition
+ * becomes an issue.
*/
ret = -EBUSY;
- if (is_cpu_exclusive(cur) &&
- !cpuset_cpumask_can_shrink(cur->cpus_allowed,
- trial->cpus_allowed))
+ if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
+ !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
goto out;
/*
@@ -728,7 +756,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup_subsys_state *pos_css;
bool root_load_balance = is_sched_load_balance(&top_cpuset);
- bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool cgrpv2 = cpuset_v2();
int nslot_update;
doms = NULL;
@@ -990,6 +1018,7 @@ void rebuild_sched_domains_locked(void)
lockdep_assert_cpus_held();
lockdep_assert_held(&cpuset_mutex);
+ force_sd_rebuild = false;
/*
* If we have raced with CPU hotplug, return early to avoid
@@ -1164,8 +1193,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
- if (rebuild_domains && !force_sd_rebuild)
- rebuild_sched_domains_locked();
+ if (rebuild_domains)
+ cpuset_force_rebuild();
}
/*
@@ -1187,7 +1216,7 @@ static void reset_partition_data(struct cpuset *cs)
{
struct cpuset *parent = parent_cs(cs);
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (!cpuset_v2())
return;
lockdep_assert_held(&callback_lock);
@@ -1339,7 +1368,7 @@ static inline bool is_local_partition(struct cpuset *cs)
* remote_partition_enable - Enable current cpuset as a remote partition root
* @cs: the cpuset to update
* @new_prs: new partition_root_state
- * @tmp: temparary masks
+ * @tmp: temporary masks
* Return: 0 if successful, errcode if error
*
* Enable the current cpuset to become a remote partition root taking CPUs
@@ -1377,7 +1406,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
- * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1387,7 +1416,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
/*
* remote_partition_disable - Remove current cpuset from remote partition list
* @cs: the cpuset to update
- * @tmp: temparary masks
+ * @tmp: temporary masks
*
* The effective_cpus is also updated.
*
@@ -1413,7 +1442,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
- * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1423,7 +1452,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
* remote_cpus_update - cpus_exclusive change of remote partition
* @cs: the cpuset to be updated
* @newmask: the new effective_xcpus mask
- * @tmp: temparary masks
+ * @tmp: temporary masks
*
* top_cpuset and subpartitions_cpus will be updated or partition can be
* invalidated.
@@ -1465,7 +1494,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
update_unbound_workqueue_cpumask(isolcpus_updated);
/*
- * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+ * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
*/
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1480,7 +1509,7 @@ invalidate:
* @cs: the cpuset to be updated
* @newmask: the new effective_xcpus mask
* @delmask: temporary mask for deletion (not in tmp)
- * @tmp: temparary masks
+ * @tmp: temporary masks
*
* This should be called before the given cs has updated its cpus_allowed
* and/or effective_xcpus.
@@ -1512,8 +1541,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
remote_partition_disable(child, tmp);
disable_cnt++;
}
- if (disable_cnt && !force_sd_rebuild)
- rebuild_sched_domains_locked();
+ if (disable_cnt)
+ cpuset_force_rebuild();
}
/*
@@ -1923,12 +1952,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
}
/*
- * update_cpumasks_hier() flags
- */
-#define HIER_CHECKALL 0x01 /* Check all cpusets with no skipping */
-#define HIER_NO_SD_REBUILD 0x02 /* Don't rebuild sched domains */
-
-/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
@@ -1942,7 +1965,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
* Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
- int flags)
+ bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
@@ -2007,12 +2030,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
* Skip the whole subtree if
* 1) the cpumask remains the same,
* 2) has no partition root state,
- * 3) HIER_CHECKALL flag not set, and
+ * 3) force flag not set, and
* 4) for v2 load balance state same as its parent.
*/
- if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
+ if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
- (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (!cpuset_v2() ||
(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
@@ -2086,8 +2109,7 @@ get_css:
* from parent if current cpuset isn't a valid partition root
* and their load balance states differ.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_partition_valid(cp) &&
+ if (cpuset_v2() && !is_partition_valid(cp) &&
(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
if (is_sched_load_balance(parent))
set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
@@ -2103,8 +2125,7 @@ get_css:
*/
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
- (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_valid(cp)))
+ (!cpuset_v2() || is_partition_valid(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -2112,9 +2133,8 @@ get_css:
}
rcu_read_unlock();
- if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) &&
- !force_sd_rebuild)
- rebuild_sched_domains_locked();
+ if (need_rebuild_sched_domains)
+ cpuset_force_rebuild();
}
/**
@@ -2141,9 +2161,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
* directly.
*
* The update_cpumasks_hier() function may sleep. So we have to
- * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
- * flag is used to suppress rebuild of sched domains as the callers
- * will take care of that.
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -2159,7 +2177,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
rcu_read_unlock();
- update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
+ update_cpumasks_hier(sibling, tmp, false);
rcu_read_lock();
css_put(&sibling->css);
}
@@ -2179,7 +2197,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
struct tmpmasks tmp;
struct cpuset *parent = parent_cs(cs);
bool invalidate = false;
- int hier_flags = 0;
+ bool force = false;
int old_prs = cs->partition_root_state;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -2206,7 +2224,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
return -EINVAL;
/*
- * When exclusive_cpus isn't explicitly set, it is constrainted
+ * When exclusive_cpus isn't explicitly set, it is constrained
* by cpus_allowed and parent's effective_xcpus. Otherwise,
* trialcs->effective_xcpus is used as a temporary cpumask
* for checking validity of the partition root.
@@ -2240,12 +2258,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Check all the descendants in update_cpumasks_hier() if
* effective_xcpus is to be changed.
*/
- if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
- hier_flags = HIER_CHECKALL;
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
retval = validate_change(cs, trialcs);
- if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if ((retval == -EINVAL) && cpuset_v2()) {
struct cgroup_subsys_state *css;
struct cpuset *cp;
@@ -2309,7 +2326,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
spin_unlock_irq(&callback_lock);
/* effective_cpus/effective_xcpus will be updated here */
- update_cpumasks_hier(cs, &tmp, hier_flags);
+ update_cpumasks_hier(cs, &tmp, force);
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
@@ -2334,7 +2351,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
struct tmpmasks tmp;
struct cpuset *parent = parent_cs(cs);
bool invalidate = false;
- int hier_flags = 0;
+ bool force = false;
int old_prs = cs->partition_root_state;
if (!*buf) {
@@ -2357,8 +2374,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Check all the descendants in update_cpumasks_hier() if
* effective_xcpus is to be changed.
*/
- if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
- hier_flags = HIER_CHECKALL;
+ force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
retval = validate_change(cs, trialcs);
if (retval)
@@ -2411,8 +2427,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* of the subtree when it is a valid partition root or effective_xcpus
* is updated.
*/
- if (is_partition_valid(cs) || hier_flags)
- update_cpumasks_hier(cs, &tmp, hier_flags);
+ if (is_partition_valid(cs) || force)
+ update_cpumasks_hier(cs, &tmp, force);
/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
if (cs->partition_root_state)
@@ -2737,9 +2753,12 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
cs->flags = trialcs->flags;
spin_unlock_irq(&callback_lock);
- if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed &&
- !force_sd_rebuild)
- rebuild_sched_domains_locked();
+ if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
+ if (cpuset_v2())
+ cpuset_force_rebuild();
+ else
+ rebuild_sched_domains_locked();
+ }
if (spread_flag_changed)
cpuset1_update_tasks_flags(cs);
@@ -2853,12 +2872,14 @@ out:
update_unbound_workqueue_cpumask(new_xcpus_state);
/* Force update if switching back to member */
- update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);
/* Update sched domains and load balance flag */
update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
free_cpumasks(NULL, &tmpmask);
return 0;
}
@@ -2919,8 +2940,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
* migration permission derives from hierarchy ownership in
* cgroup_procs_write_permission()).
*/
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- (cpus_updated || mems_updated)) {
+ if (!cpuset_v2() || (cpus_updated || mems_updated)) {
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
@@ -3034,8 +3054,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* in effective cpus and mems. In that case, we can optimize out
* by skipping the task iteration and update.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !cpus_updated && !mems_updated) {
+ if (cpuset_v2() && !cpus_updated && !mems_updated) {
cpuset_attach_nodemask_to = cs->effective_mems;
goto out;
}
@@ -3152,6 +3171,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
}
free_cpuset(trialcs);
+ if (force_sd_rebuild)
+ rebuild_sched_domains_locked();
out_unlock:
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
@@ -3383,7 +3404,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
INIT_LIST_HEAD(&cs->remote_sibling);
/* Set CS_MEMORY_MIGRATE for default hierarchy */
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (cpuset_v2())
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
return &cs->css;
@@ -3410,8 +3431,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_sched_load_balance(parent))
+ if (cpuset_v2() && !is_sched_load_balance(parent))
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset_inc();
@@ -3481,8 +3501,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
if (is_partition_valid(cs))
update_prstate(cs, 0);
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- is_sched_load_balance(cs))
+ if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
@@ -3896,11 +3915,9 @@ static void cpuset_handle_hotplug(void)
rcu_read_unlock();
}
- /* rebuild sched domains if cpus_allowed has changed */
- if (force_sd_rebuild) {
- force_sd_rebuild = false;
+ /* rebuild sched domains if necessary */
+ if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
- }
free_cpumasks(NULL, ptmp);
}