Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r-- | kernel/cgroup/cpuset.c | 157 |
1 file changed, 87 insertions, 70 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index a4dd285cdf39..f321ed515f3a 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -84,9 +84,19 @@ static bool have_boot_isolcpus;
 static struct list_head remote_children;
 
 /*
- * A flag to force sched domain rebuild at the end of an operation while
- * inhibiting it in the intermediate stages when set. Currently it is only
- * set in hotplug code.
+ * A flag to force sched domain rebuild at the end of an operation.
+ * It can be set in
+ *  - update_partition_sd_lb()
+ *  - remote_partition_check()
+ *  - update_cpumasks_hier()
+ *  - cpuset_update_flag()
+ *  - cpuset_hotplug_update_tasks()
+ *  - cpuset_handle_hotplug()
+ *
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ *
+ * Note that update_relax_domain_level() in cpuset-v1.c can still call
+ * rebuild_sched_domains_locked() directly without using this flag.
  */
 static bool force_sd_rebuild;
 
@@ -283,6 +293,12 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
 	mutex_unlock(&cpuset_mutex);
 }
 
+static inline bool cpuset_v2(void)
+{
+	return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
+		cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+}
+
 /*
  * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
  * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@@ -293,7 +309,7 @@ static inline void dec_attach_in_progress(struct cpuset *cs)
  */
 static inline bool is_in_v2_mode(void)
 {
-	return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+	return cpuset_v2() ||
 	      (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
 }
 
@@ -565,12 +581,24 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 
 	/*
 	 * We can't shrink if we won't have enough room for SCHED_DEADLINE
-	 * tasks.
+	 * tasks. This check is not done when scheduling is disabled as the
+	 * users should know what they are doing.
+	 *
+	 * For v1, effective_cpus == cpus_allowed & user_xcpus() returns
+	 * cpus_allowed.
+	 *
+	 * For v2, is_cpu_exclusive() & is_sched_load_balance() are true only
+	 * for non-isolated partition root. At this point, the target
+	 * effective_cpus isn't computed yet. user_xcpus() is the best
+	 * approximation.
+	 *
+	 * TBD: May need to precompute the real effective_cpus here in case
+	 * incorrect scheduling of SCHED_DEADLINE tasks in a partition
+	 * becomes an issue.
 	 */
 	ret = -EBUSY;
-	if (is_cpu_exclusive(cur) &&
-	    !cpuset_cpumask_can_shrink(cur->cpus_allowed,
-				       trial->cpus_allowed))
+	if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
+	    !cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
 		goto out;
 
 	/*
@@ -728,7 +756,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 	int nslot;		/* next empty doms[] struct cpumask slot */
 	struct cgroup_subsys_state *pos_css;
 	bool root_load_balance = is_sched_load_balance(&top_cpuset);
-	bool cgrpv2 = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+	bool cgrpv2 = cpuset_v2();
 	int nslot_update;
 
 	doms = NULL;
@@ -990,6 +1018,7 @@ void rebuild_sched_domains_locked(void)
 
 	lockdep_assert_cpus_held();
 	lockdep_assert_held(&cpuset_mutex);
+	force_sd_rebuild = false;
 
 	/*
 	 * If we have raced with CPU hotplug, return early to avoid
@@ -1164,8 +1193,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
 			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	}
 
-	if (rebuild_domains && !force_sd_rebuild)
-		rebuild_sched_domains_locked();
+	if (rebuild_domains)
+		cpuset_force_rebuild();
 }
 
 /*
@@ -1187,7 +1216,7 @@ static void reset_partition_data(struct cpuset *cs)
 {
 	struct cpuset *parent = parent_cs(cs);
 
-	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+	if (!cpuset_v2())
 		return;
 
 	lockdep_assert_held(&callback_lock);
@@ -1339,7 +1368,7 @@ static inline bool is_local_partition(struct cpuset *cs)
  * remote_partition_enable - Enable current cpuset as a remote partition root
  * @cs: the cpuset to update
  * @new_prs: new partition_root_state
- * @tmp: temparary masks
+ * @tmp: temporary masks
  * Return: 0 if successful, errcode if error
  *
  * Enable the current cpuset to become a remote partition root taking CPUs
@@ -1377,7 +1406,7 @@ static int remote_partition_enable(struct cpuset *cs, int new_prs,
 	update_unbound_workqueue_cpumask(isolcpus_updated);
 
 	/*
-	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
 	 */
 	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
 	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1387,7 +1416,7 @@
 /*
  * remote_partition_disable - Remove current cpuset from remote partition list
  * @cs: the cpuset to update
- * @tmp: temparary masks
+ * @tmp: temporary masks
  *
  * The effective_cpus is also updated.
  *
@@ -1413,7 +1442,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
 	update_unbound_workqueue_cpumask(isolcpus_updated);
 
 	/*
-	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
 	 */
 	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
 	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1423,7 +1452,7 @@ static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
  * remote_cpus_update - cpus_exclusive change of remote partition
  * @cs: the cpuset to be updated
  * @newmask: the new effective_xcpus mask
- * @tmp: temparary masks
+ * @tmp: temporary masks
  *
  * top_cpuset and subpartitions_cpus will be updated or partition can be
  * invalidated.
@@ -1465,7 +1494,7 @@ static void remote_cpus_update(struct cpuset *cs, struct cpumask *newmask,
 	update_unbound_workqueue_cpumask(isolcpus_updated);
 
 	/*
-	 * Proprogate changes in top_cpuset's effective_cpus down the hierarchy.
+	 * Propagate changes in top_cpuset's effective_cpus down the hierarchy.
 	 */
 	cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
 	update_sibling_cpumasks(&top_cpuset, NULL, tmp);
@@ -1480,7 +1509,7 @@ invalidate:
  * @cs: the cpuset to be updated
  * @newmask: the new effective_xcpus mask
  * @delmask: temporary mask for deletion (not in tmp)
- * @tmp: temparary masks
+ * @tmp: temporary masks
  *
 * This should be called before the given cs has updated its cpus_allowed
 * and/or effective_xcpus.
@@ -1512,8 +1541,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
 			remote_partition_disable(child, tmp);
 			disable_cnt++;
 		}
-	if (disable_cnt && !force_sd_rebuild)
-		rebuild_sched_domains_locked();
+	if (disable_cnt)
+		cpuset_force_rebuild();
 }
 
 /*
@@ -1923,12 +1952,6 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
 }
 
 /*
- * update_cpumasks_hier() flags
- */
-#define HIER_CHECKALL		0x01	/* Check all cpusets with no skipping */
-#define HIER_NO_SD_REBUILD	0x02	/* Don't rebuild sched domains */
-
-/*
  * update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
  * @cs: the cpuset to consider
  * @tmp: temp variables for calculating effective_cpus & partition setup
@@ -1942,7 +1965,7 @@ static void compute_partition_effective_cpumask(struct cpuset *cs,
  * Called with cpuset_mutex held
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
-				 int flags)
+				 bool force)
 {
 	struct cpuset *cp;
 	struct cgroup_subsys_state *pos_css;
@@ -2007,12 +2030,12 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		 * Skip the whole subtree if
 		 * 1) the cpumask remains the same,
 		 * 2) has no partition root state,
-		 * 3) HIER_CHECKALL flag not set, and
+		 * 3) force flag not set, and
 		 * 4) for v2 load balance state same as its parent.
 		 */
-		if (!cp->partition_root_state && !(flags & HIER_CHECKALL) &&
+		if (!cp->partition_root_state && !force &&
 		    cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
-		    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+		    (!cpuset_v2() ||
 		    (is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
 			pos_css = css_rightmost_descendant(pos_css);
 			continue;
 		}
@@ -2086,8 +2109,7 @@ get_css:
 		 * from parent if current cpuset isn't a valid partition root
 		 * and their load balance states differ.
 		 */
-		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-		    !is_partition_valid(cp) &&
+		if (cpuset_v2() && !is_partition_valid(cp) &&
 		    (is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
 			if (is_sched_load_balance(parent))
 				set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
@@ -2103,8 +2125,7 @@ get_css:
 		 */
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    is_sched_load_balance(cp) &&
-		   (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
-		    is_partition_valid(cp)))
+		   (!cpuset_v2() || is_partition_valid(cp)))
 			need_rebuild_sched_domains = true;
 
 		rcu_read_lock();
@@ -2112,9 +2133,8 @@ get_css:
 	}
 	rcu_read_unlock();
 
-	if (need_rebuild_sched_domains && !(flags & HIER_NO_SD_REBUILD) &&
-	    !force_sd_rebuild)
-		rebuild_sched_domains_locked();
+	if (need_rebuild_sched_domains)
+		cpuset_force_rebuild();
 }
 
 /**
@@ -2141,9 +2161,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 	 * directly.
 	 *
 	 * The update_cpumasks_hier() function may sleep. So we have to
-	 * release the RCU read lock before calling it. HIER_NO_SD_REBUILD
-	 * flag is used to suppress rebuild of sched domains as the callers
-	 * will take care of that.
+	 * release the RCU read lock before calling it.
 	 */
 	rcu_read_lock();
 	cpuset_for_each_child(sibling, pos_css, parent) {
@@ -2159,7 +2177,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 			continue;
 
 		rcu_read_unlock();
-		update_cpumasks_hier(sibling, tmp, HIER_NO_SD_REBUILD);
+		update_cpumasks_hier(sibling, tmp, false);
 		rcu_read_lock();
 		css_put(&sibling->css);
 	}
@@ -2179,7 +2197,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	struct tmpmasks tmp;
 	struct cpuset *parent = parent_cs(cs);
 	bool invalidate = false;
-	int hier_flags = 0;
+	bool force = false;
 	int old_prs = cs->partition_root_state;
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
@@ -2206,7 +2224,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 			return -EINVAL;
 
 		/*
-		 * When exclusive_cpus isn't explicitly set, it is constrainted
+		 * When exclusive_cpus isn't explicitly set, it is constrained
		 * by cpus_allowed and parent's effective_xcpus. Otherwise,
		 * trialcs->effective_xcpus is used as a temporary cpumask
		 * for checking validity of the partition root.
@@ -2240,12 +2258,11 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	 * Check all the descendants in update_cpumasks_hier() if
 	 * effective_xcpus is to be changed.
 	 */
-	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
-		hier_flags = HIER_CHECKALL;
+	force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
 
 	retval = validate_change(cs, trialcs);
 
-	if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+	if ((retval == -EINVAL) && cpuset_v2()) {
 		struct cgroup_subsys_state *css;
 		struct cpuset *cp;
 
@@ -2309,7 +2326,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	spin_unlock_irq(&callback_lock);
 
 	/* effective_cpus/effective_xcpus will be updated here */
-	update_cpumasks_hier(cs, &tmp, hier_flags);
+	update_cpumasks_hier(cs, &tmp, force);
 
 	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
 	if (cs->partition_root_state)
@@ -2334,7 +2351,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	struct tmpmasks tmp;
 	struct cpuset *parent = parent_cs(cs);
 	bool invalidate = false;
-	int hier_flags = 0;
+	bool force = false;
 	int old_prs = cs->partition_root_state;
 
 	if (!*buf) {
@@ -2357,8 +2374,7 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	 * Check all the descendants in update_cpumasks_hier() if
 	 * effective_xcpus is to be changed.
 	 */
-	if (!cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus))
-		hier_flags = HIER_CHECKALL;
+	force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
 
 	retval = validate_change(cs, trialcs);
 	if (retval)
@@ -2411,8 +2427,8 @@ static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	 * of the subtree when it is a valid partition root or effective_xcpus
 	 * is updated.
 	 */
-	if (is_partition_valid(cs) || hier_flags)
-		update_cpumasks_hier(cs, &tmp, hier_flags);
+	if (is_partition_valid(cs) || force)
+		update_cpumasks_hier(cs, &tmp, force);
 
 	/* Update CS_SCHED_LOAD_BALANCE and/or sched_domains, if necessary */
 	if (cs->partition_root_state)
@@ -2737,9 +2753,12 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	cs->flags = trialcs->flags;
 	spin_unlock_irq(&callback_lock);
 
-	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed &&
-	    !force_sd_rebuild)
-		rebuild_sched_domains_locked();
+	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
+		if (cpuset_v2())
+			cpuset_force_rebuild();
+		else
+			rebuild_sched_domains_locked();
+	}
 
 	if (spread_flag_changed)
 		cpuset1_update_tasks_flags(cs);
@@ -2853,12 +2872,14 @@ out:
 	update_unbound_workqueue_cpumask(new_xcpus_state);
 
 	/* Force update if switching back to member */
-	update_cpumasks_hier(cs, &tmpmask, !new_prs ? HIER_CHECKALL : 0);
+	update_cpumasks_hier(cs, &tmpmask, !new_prs);
 
 	/* Update sched domains and load balance flag */
 	update_partition_sd_lb(cs, old_prs);
 
 	notify_partition_change(cs, old_prs);
+	if (force_sd_rebuild)
+		rebuild_sched_domains_locked();
 	free_cpumasks(NULL, &tmpmask);
 	return 0;
 }
@@ -2919,8 +2940,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
 		 * migration permission derives from hierarchy ownership in
 		 * cgroup_procs_write_permission()).
 		 */
-		if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
-		    (cpus_updated || mems_updated)) {
+		if (!cpuset_v2() || (cpus_updated || mems_updated)) {
 			ret = security_task_setscheduler(task);
 			if (ret)
 				goto out_unlock;
@@ -3034,8 +3054,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	 * in effective cpus and mems. In that case, we can optimize out
 	 * by skipping the task iteration and update.
 	 */
-	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-	    !cpus_updated && !mems_updated) {
+	if (cpuset_v2() && !cpus_updated && !mems_updated) {
 		cpuset_attach_nodemask_to = cs->effective_mems;
 		goto out;
 	}
@@ -3152,6 +3171,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	}
 
 	free_cpuset(trialcs);
+	if (force_sd_rebuild)
+		rebuild_sched_domains_locked();
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	cpus_read_unlock();
@@ -3383,7 +3404,7 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 	INIT_LIST_HEAD(&cs->remote_sibling);
 
 	/* Set CS_MEMORY_MIGRATE for default hierarchy */
-	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+	if (cpuset_v2())
 		__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
 
 	return &cs->css;
@@ -3410,8 +3431,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	/*
 	 * For v2, clear CS_SCHED_LOAD_BALANCE if parent is isolated
 	 */
-	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-	    !is_sched_load_balance(parent))
+	if (cpuset_v2() && !is_sched_load_balance(parent))
 		clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 
 	cpuset_inc();
@@ -3481,8 +3501,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
 	if (is_partition_valid(cs))
 		update_prstate(cs, 0);
 
-	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
-	    is_sched_load_balance(cs))
+	if (!cpuset_v2() && is_sched_load_balance(cs))
 		cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
 
 	cpuset_dec();
@@ -3896,11 +3915,9 @@ static void cpuset_handle_hotplug(void)
 		rcu_read_unlock();
 	}
 
-	/* rebuild sched domains if cpus_allowed has changed */
-	if (force_sd_rebuild) {
-		force_sd_rebuild = false;
+	/* rebuild sched domains if necessary */
+	if (force_sd_rebuild)
 		rebuild_sched_domains_cpuslocked();
-	}
 
 	free_cpumasks(NULL, ptmp);
 }
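The net effect of the force_sd_rebuild hunks is a deferred-rebuild pattern: intermediate helpers such as update_partition_sd_lb() and update_cpumasks_hier() now only call cpuset_force_rebuild(), and rebuild_sched_domains_locked() runs at most once at the end of the operation (update_prstate(), cpuset_write_resmask(), or hotplug handling), clearing the flag itself. The sketch below is a minimal user-space model of that pattern for illustration only; the function names mirror the kernel's, but the bodies are simplified stand-ins, not the actual cpuset implementation.

/*
 * Standalone illustration (not kernel code) of the deferred-rebuild
 * pattern used by this diff: intermediate steps only mark a rebuild
 * as pending, and the expensive rebuild runs once per operation.
 */
#include <stdbool.h>
#include <stdio.h>

static bool force_sd_rebuild;	/* pending-rebuild flag set by helpers */
static int  rebuild_count;	/* counts how often the "rebuild" runs */

static void cpuset_force_rebuild(void)
{
	force_sd_rebuild = true;	/* just record that a rebuild is needed */
}

static void rebuild_sched_domains_locked(void)
{
	force_sd_rebuild = false;	/* clear the flag, as the patched code does */
	rebuild_count++;
	printf("rebuild #%d\n", rebuild_count);
}

/* Two intermediate steps that previously rebuilt domains directly. */
static void update_partition_sd_lb(void) { cpuset_force_rebuild(); }
static void update_cpumasks_hier(void)   { cpuset_force_rebuild(); }

/* Model of one write operation, e.g. cpuset_write_resmask(). */
static void cpuset_write_operation(void)
{
	update_cpumasks_hier();
	update_partition_sd_lb();

	/* At most one rebuild, at the end of the operation. */
	if (force_sd_rebuild)
		rebuild_sched_domains_locked();
}

int main(void)
{
	cpuset_write_operation();	/* prints "rebuild #1" exactly once */
	return 0;
}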