diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-21 02:41:08 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-02-21 02:41:08 +0100 |
commit | 1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f (patch) | |
tree | a5dabaa924d50867cbe347e20a7643b2850f11c0 /kernel/sched | |
parent | Merge tag 'perf-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel... (diff) | |
parent | sched/rt: pick_next_rt_entity(): check list_entry (diff) | |
download | linux-1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f.tar.xz linux-1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f.zip |
Merge tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Improve the scalability of the CFS bandwidth unthrottling logic with
large number of CPUs.
- Fix & rework various cpuidle routines, simplify interaction with the
generic scheduler code. Add __cpuidle methods as noinstr to objtool's
noinstr detection and fix boatloads of cpuidle bugs & quirks.
- Add new ABI: introduce MEMBARRIER_CMD_GET_REGISTRATIONS, to query
previously issued registrations.
- Limit scheduler slice duration to the sysctl_sched_latency period, to
improve scheduling granularity with a large number of SCHED_IDLE
tasks.
- Debuggability enhancement on sys_exit(): warn about disabled IRQs,
but also enable them to prevent a cascade of followup problems and
repeat warnings.
- Fix the rescheduling logic in prio_changed_dl().
- Micro-optimize cpufreq and sched-util methods.
- Micro-optimize ttwu_runnable()
- Micro-optimize the idle-scanning in update_numa_stats(),
select_idle_capacity() and steal_cookie_task().
- Update the RSEQ code & self-tests
- Constify various scheduler methods
- Remove unused methods
- Refine __init tags
- Documentation updates
- Misc other cleanups, fixes
* tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (110 commits)
sched/rt: pick_next_rt_entity(): check list_entry
sched/deadline: Add more reschedule cases to prio_changed_dl()
sched/fair: sanitize vruntime of entity being placed
sched/fair: Remove capacity inversion detection
sched/fair: unlink misfit task from cpu overutilized
objtool: mem*() are not uaccess safe
cpuidle: Fix poll_idle() noinstr annotation
sched/clock: Make local_clock() noinstr
sched/clock/x86: Mark sched_clock() noinstr
x86/pvclock: Improve atomic update of last_value in pvclock_clocksource_read()
x86/atomics: Always inline arch_atomic64*()
cpuidle: tracing, preempt: Squash _rcuidle tracing
cpuidle: tracing: Warn about !rcu_is_watching()
cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG
cpuidle: drivers: firmware: psci: Dont instrument suspend code
KVM: selftests: Fix build of rseq test
exit: Detect and fix irq disabled state in oops
cpuidle, arm64: Fix the ARM64 cpuidle logic
cpuidle: mvebu: Fix duplicate flags assignment
sched/fair: Limit sched slice duration
...
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/clock.c | 27 | ||||
-rw-r--r-- | kernel/sched/core.c | 134 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 43 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 4 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 42 | ||||
-rw-r--r-- | kernel/sched/fair.c | 389 | ||||
-rw-r--r-- | kernel/sched/idle.c | 47 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 39 | ||||
-rw-r--r-- | kernel/sched/rt.c | 5 | ||||
-rw-r--r-- | kernel/sched/sched.h | 107 | ||||
-rw-r--r-- | kernel/sched/topology.c | 4 |
11 files changed, 565 insertions, 276 deletions
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e374c0c923da..5732fa75ebab 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -93,7 +93,7 @@ struct sched_clock_data { static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); -notrace static inline struct sched_clock_data *this_scd(void) +static __always_inline struct sched_clock_data *this_scd(void) { return this_cpu_ptr(&sched_clock_data); } @@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late); * min, max except they take wrapping into account */ -notrace static inline u64 wrap_min(u64 x, u64 y) +static __always_inline u64 wrap_min(u64 x, u64 y) { return (s64)(x - y) < 0 ? x : y; } -notrace static inline u64 wrap_max(u64 x, u64 y) +static __always_inline u64 wrap_max(u64 x, u64 y) { return (s64)(x - y) > 0 ? x : y; } @@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y) * - filter out backward motion * - use the GTOD tick value to create a window to filter crazy TSC values */ -notrace static u64 sched_clock_local(struct sched_clock_data *scd) +static __always_inline u64 sched_clock_local(struct sched_clock_data *scd) { u64 now, clock, old_clock, min_clock, max_clock, gtod; s64 delta; @@ -287,13 +287,28 @@ again: clock = wrap_max(clock, min_clock); clock = wrap_min(clock, max_clock); - if (!try_cmpxchg64(&scd->clock, &old_clock, clock)) + if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock)) goto again; return clock; } -notrace static u64 sched_clock_remote(struct sched_clock_data *scd) +noinstr u64 local_clock(void) +{ + u64 clock; + + if (static_branch_likely(&__sched_clock_stable)) + return sched_clock() + __sched_clock_offset; + + preempt_disable_notrace(); + clock = sched_clock_local(this_scd()); + preempt_enable_notrace(); + + return clock; +} +EXPORT_SYMBOL_GPL(local_clock); + +static notrace u64 sched_clock_remote(struct sched_clock_data *scd) { struct sched_clock_data *my_scd = this_scd(); u64 this_clock, remote_clock; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a4918a1faa9..fb49dbf61273 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -152,7 +152,7 @@ __read_mostly int scheduler_running; DEFINE_STATIC_KEY_FALSE(__sched_core_enabled); /* kernel prio, less is more */ -static inline int __task_prio(struct task_struct *p) +static inline int __task_prio(const struct task_struct *p) { if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2; @@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p) */ /* real prio, less is less */ -static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +static inline bool prio_less(const struct task_struct *a, + const struct task_struct *b, bool in_fi) { int pa = __task_prio(a), pb = __task_prio(b); @@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool return false; } -static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b) +static inline bool __sched_core_less(const struct task_struct *a, + const struct task_struct *b) { if (a->core_cookie < b->core_cookie) return true; @@ -3675,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } /* - * Mark the task runnable and perform wakeup-preemption. + * Mark the task runnable. */ -static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) +static inline void ttwu_do_wakeup(struct task_struct *p) { - check_preempt_curr(rq, p, wake_flags); WRITE_ONCE(p->__state, TASK_RUNNING); trace_sched_wakeup(p); +} + +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + struct rq_flags *rf) +{ + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; + + lockdep_assert_rq_held(rq); + + if (p->sched_contributes_to_load) + rq->nr_uninterruptible--; + +#ifdef CONFIG_SMP + if (wake_flags & WF_MIGRATED) + en_flags |= ENQUEUE_MIGRATED; + else +#endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + + activate_task(rq, p, en_flags); + check_preempt_curr(rq, p, wake_flags); + + ttwu_do_wakeup(p); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { @@ -3712,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, #endif } -static void -ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct rq_flags *rf) -{ - int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; - - lockdep_assert_rq_held(rq); - - if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; - -#ifdef CONFIG_SMP - if (wake_flags & WF_MIGRATED) - en_flags |= ENQUEUE_MIGRATED; - else -#endif - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - - activate_task(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, rf); -} - /* * Consider @p being inside a wait loop: * @@ -3770,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) rq = __task_rq_lock(p, &rf); if (task_on_rq_queued(p)) { - /* check_preempt_curr() may use rq clock */ - update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, &rf); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ + update_rq_clock(rq); + check_preempt_curr(rq, p, wake_flags); + } + ttwu_do_wakeup(p); ret = 1; } __task_rq_unlock(rq, &rf); @@ -4138,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) goto out; trace_sched_waking(p); - WRITE_ONCE(p->__state, TASK_RUNNING); - trace_sched_wakeup(p); + ttwu_do_wakeup(p); goto out; } @@ -5104,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); rseq_preempt(prev); + switch_mm_cid(prev, next); fire_sched_out_preempt_notifiers(prev, next); kmap_local_sched_out(); prepare_task(next); @@ -6260,7 +6268,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd) { int i; - for_each_cpu_wrap(i, sched_domain_span(sd), cpu) { + for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) { if (i == cpu) continue; @@ -11365,3 +11373,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count) { trace_sched_update_nr_running_tp(rq, count); } + +#ifdef CONFIG_SCHED_MM_CID +void sched_mm_cid_exit_signals(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_before_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + mm_cid_put(mm, t->mm_cid); + t->mm_cid = -1; + t->mm_cid_active = 0; + local_irq_restore(flags); +} + +void sched_mm_cid_after_execve(struct task_struct *t) +{ + struct mm_struct *mm = t->mm; + unsigned long flags; + + if (!mm) + return; + local_irq_save(flags); + t->mm_cid = mm_cid_get(mm); + t->mm_cid_active = 1; + local_irq_restore(flags); + rseq_set_notify_resume(t); +} + +void sched_mm_cid_fork(struct task_struct *t) +{ + WARN_ON_ONCE(!t->mm || t->mm_cid != -1); + t->mm_cid_active = 1; +} +#endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1207c78f85c1..5c840151f3bb 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -48,7 +48,6 @@ struct sugov_cpu { unsigned long util; unsigned long bw_dl; - unsigned long max; /* The field below is for single-CPU policies only: */ #ifdef CONFIG_NO_HZ_COMMON @@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->bw_dl = cpu_bw_dl(rq); sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu), FREQUENCY_UTIL, NULL); @@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller + * @max_cap: the max CPU capacity * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long max_cap) { unsigned long boost; @@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. */ - boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT; boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL); if (sg_cpu->util < boost) sg_cpu->util = boost; @@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu) } static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, - u64 time, unsigned int flags) + u64 time, unsigned long max_cap, + unsigned int flags) { sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, return false; sugov_get_util(sg_cpu); - sugov_iowait_apply(sg_cpu, time); + sugov_iowait_apply(sg_cpu, time, max_cap); return true; } @@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned long max_cap; unsigned int next_f; - if (!sugov_update_single_common(sg_cpu, time, flags)) + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; - next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); + next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. @@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); unsigned long prev_util = sg_cpu->util; + unsigned long max_cap; /* * Fall back to the "frequency" path if frequency invariance is not @@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, return; } - if (!sugov_update_single_common(sg_cpu, time, flags)) + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + + if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) return; /* @@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, sg_cpu->util = prev_util; cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), - map_util_perf(sg_cpu->util), sg_cpu->max); + map_util_perf(sg_cpu->util), max_cap); sg_cpu->sg_policy->last_freq_update_time = time; } @@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned long util = 0, max = 1; + unsigned long util = 0, max_cap; unsigned int j; + max_cap = arch_scale_cpu_capacity(sg_cpu->cpu); + for_each_cpu(j, policy->cpus) { struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); - unsigned long j_util, j_max; sugov_get_util(j_sg_cpu); - sugov_iowait_apply(j_sg_cpu, time); - j_util = j_sg_cpu->util; - j_max = j_sg_cpu->max; + sugov_iowait_apply(j_sg_cpu, time, max_cap); - if (j_util * max > j_max * util) { - util = j_util; - max = j_max; - } + util = max(j_sg_cpu->util, util); } - return get_next_freq(sg_policy, util, max); + return get_next_freq(sg_policy, util, max_cap); } static void diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 95fc77853743..af7952f12e6c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -3,6 +3,10 @@ * Simple CPU accounting cgroup controller */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + #include <asm/cputime.h> +#endif + #ifdef CONFIG_IRQ_TIME_ACCOUNTING /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0d97d54276cc..71b24371a6f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || task_current(rq, p)) { + if (!task_on_rq_queued(p)) + return; + #ifdef CONFIG_SMP - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) - deadline_queue_pull_task(rq); + /* + * This might be too much, but unfortunately + * we don't have the old deadline value, and + * we can't argue if the task is increasing + * or lowering its prio, so... + */ + if (!rq->dl.overloaded) + deadline_queue_pull_task(rq); + if (task_current(rq, p)) { /* * If we now have a earlier deadline task than p, * then reschedule, provided p is still on this @@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline)) resched_curr(rq); -#else + } else { /* - * Again, we don't know if p has a earlier - * or later deadline, so let's blindly set a - * (maybe not needed) rescheduling point. + * Current may not be deadline in case p was throttled but we + * have just replenished it (e.g. rt_mutex_setprio()). + * + * Otherwise, if p was given an earlier deadline, reschedule. */ - resched_curr(rq); -#endif /* CONFIG_SMP */ + if (!dl_task(rq->curr) || + dl_time_before(p->dl.deadline, rq->curr->dl.deadline)) + resched_curr(rq); } +#else + /* + * We don't know if p has a earlier or later deadline, so let's blindly + * set a (maybe not needed) rescheduling point. + */ + resched_curr(rq); +#endif } DEFINE_SCHED_CLASS(dl) = { diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0f8736991427..ff4dbbae3b10 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -468,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse) return NULL; } -static inline struct sched_entity *parent_entity(struct sched_entity *se) +static inline struct sched_entity *parent_entity(const struct sched_entity *se) { return se->parent; } @@ -595,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline bool entity_before(struct sched_entity *a, - struct sched_entity *b) +static inline bool entity_before(const struct sched_entity *a, + const struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } @@ -1804,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env, ns->nr_running += rq->cfs.h_nr_running; ns->compute_capacity += capacity_of(cpu); - if (find_idle && !rq->nr_running && idle_cpu(cpu)) { + if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) { if (READ_ONCE(rq->numa_migrate_on) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) continue; @@ -1836,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env, int start = env->dst_cpu; /* Find alternative idle CPU. */ - for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) { + for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) { if (cpu == env->best_cpu || !idle_cpu(cpu) || !cpumask_test_cpu(cpu, env->p->cpus_ptr)) { continue; @@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util, * * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less. - * - * In case of capacity inversion we should honour the inverted capacity - * for both uclamp_min and uclamp_max all the time. */ - capacity_orig = cpu_in_capacity_inversion(cpu); - if (capacity_orig) { - capacity_orig_thermal = capacity_orig; - } else { - capacity_orig = capacity_orig_of(cpu); - capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); - } + capacity_orig = capacity_orig_of(cpu); + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* * We want to force a task to fit a cpu as implied by uclamp_max. @@ -4561,8 +4553,8 @@ static inline int util_fits_cpu(unsigned long util, * handle the case uclamp_min > uclamp_max. */ uclamp_min = min(uclamp_min, uclamp_max); - if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) - fits = fits && (uclamp_min <= capacity_orig_thermal); + if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal)) + return -1; return fits; } @@ -4572,7 +4564,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu) unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); unsigned long util = task_util_est(p); - return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); + /* + * Return true only if the cpu fully fits the task requirements, which + * include the utilization but also the performance hints. + */ + return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4656,6 +4652,7 @@ static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime = cfs_rq->min_vruntime; + u64 sleep_time; /* * The 'current' period is already promised to the current tasks, @@ -4685,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime -= thresh; } - /* ensure we never gain time by being placed backwards. */ - se->vruntime = max_vruntime(se->vruntime, vruntime); + /* + * Pull vruntime of the entity being placed to the base level of + * cfs_rq, to prevent boosting it if placed backwards. If the entity + * slept for a long time, don't even try to compare its vruntime with + * the base as it may be too far off and the comparison may get + * inversed due to s64 overflow. + */ + sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; + if ((s64)sleep_time > 60LL * NSEC_PER_SEC) + se->vruntime = vruntime; + else + se->vruntime = max_vruntime(se->vruntime, vruntime); } static void check_enqueue_throttle(struct cfs_rq *cfs_rq); @@ -4896,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) struct sched_entity *se; s64 delta; - ideal_runtime = sched_slice(cfs_rq, curr); + /* + * When many tasks blow up the sched_period; it is possible that + * sched_slice() reports unusually large results (when many tasks are + * very light for example). Therefore impose a maximum. + */ + ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { resched_curr(rq_of(cfs_rq)); @@ -5461,22 +5474,105 @@ unthrottle_throttle: resched_curr(rq); } -static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +#ifdef CONFIG_SMP +static void __cfsb_csd_unthrottle(void *arg) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cursor, *tmp; + struct rq *rq = arg; + struct rq_flags rf; + + rq_lock(rq, &rf); + + /* + * Since we hold rq lock we're safe from concurrent manipulation of + * the CSD list. However, this RCU critical section annotates the + * fact that we pair with sched_free_group_rcu(), so that we cannot + * race with group being freed in the window between removing it + * from the list and advancing to the next entry in the list. + */ + rcu_read_lock(); + + list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list, + throttled_csd_list) { + list_del_init(&cursor->throttled_csd_list); + + if (cfs_rq_throttled(cursor)) + unthrottle_cfs_rq(cursor); + } + + rcu_read_unlock(); + + rq_unlock(rq, &rf); +} + +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + bool first; + + if (rq == this_rq()) { + unthrottle_cfs_rq(cfs_rq); + return; + } + + /* Already enqueued */ + if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list))) + return; + + first = list_empty(&rq->cfsb_csd_list); + list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list); + if (first) + smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd); +} +#else +static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + unthrottle_cfs_rq(cfs_rq); +} +#endif + +static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) +{ + lockdep_assert_rq_held(rq_of(cfs_rq)); + + if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) || + cfs_rq->runtime_remaining <= 0)) + return; + + __unthrottle_cfs_rq_async(cfs_rq); +} + +static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) +{ + struct cfs_rq *local_unthrottle = NULL; + int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; + bool throttled = false; + struct cfs_rq *cfs_rq; + struct rq_flags rf; + struct rq *rq; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { - struct rq *rq = rq_of(cfs_rq); - struct rq_flags rf; + rq = rq_of(cfs_rq); + + if (!remaining) { + throttled = true; + break; + } rq_lock_irqsave(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; - /* By the above check, this should never be true */ +#ifdef CONFIG_SMP + /* Already queued for async unthrottle */ + if (!list_empty(&cfs_rq->throttled_csd_list)) + goto next; +#endif + + /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); raw_spin_lock(&cfs_b->lock); @@ -5490,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) cfs_rq->runtime_remaining += runtime; /* we check whether we're throttled above */ - if (cfs_rq->runtime_remaining > 0) - unthrottle_cfs_rq(cfs_rq); + if (cfs_rq->runtime_remaining > 0) { + if (cpu_of(rq) != this_cpu || + SCHED_WARN_ON(local_unthrottle)) + unthrottle_cfs_rq_async(cfs_rq); + else + local_unthrottle = cfs_rq; + } else { + throttled = true; + } next: rq_unlock_irqrestore(rq, &rf); - - if (!remaining) - break; } rcu_read_unlock(); + + if (local_unthrottle) { + rq = cpu_rq(this_cpu); + rq_lock_irqsave(rq, &rf); + if (cfs_rq_throttled(local_unthrottle)) + unthrottle_cfs_rq(local_unthrottle); + rq_unlock_irqrestore(rq, &rf); + } + + return throttled; } /* @@ -5544,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u while (throttled && cfs_b->runtime > 0) { raw_spin_unlock_irqrestore(&cfs_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ - distribute_cfs_runtime(cfs_b); + throttled = distribute_cfs_runtime(cfs_b); raw_spin_lock_irqsave(&cfs_b->lock, flags); - - throttled = !list_empty(&cfs_b->throttled_cfs_rq); } /* @@ -5824,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); +#ifdef CONFIG_SMP + INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); +#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5840,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { + int __maybe_unused i; + /* init_cfs_bandwidth() was not called */ if (!cfs_b->throttled_cfs_rq.next) return; hrtimer_cancel(&cfs_b->period_timer); hrtimer_cancel(&cfs_b->slack_timer); + + /* + * It is possible that we still have some cfs_rq's pending on a CSD + * list, though this race is very rare. In order for this to occur, we + * must have raced with the last task leaving the group while there + * exist throttled cfs_rq(s), and the period_timer must have queued the + * CSD item but the remote cpu has not yet processed it. To handle this, + * we can simply flush all pending CSD work inline here. We're + * guaranteed at this point that no additional cfs_rq of this group can + * join a CSD list. + */ +#ifdef CONFIG_SMP + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + unsigned long flags; + + if (list_empty(&rq->cfsb_csd_list)) + continue; + + local_irq_save(flags); + __cfsb_csd_unthrottle(rq); + local_irq_restore(flags); + } +#endif } /* @@ -6008,6 +6145,7 @@ static inline bool cpu_overutilized(int cpu) unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + /* Return true only if the utilization doesn't fit CPU's capacity */ return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } @@ -6801,6 +6939,7 @@ static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { unsigned long task_util, util_min, util_max, best_cap = 0; + int fits, best_fits = 0; int cpu, best_cpu = -1; struct cpumask *cpus; @@ -6811,17 +6950,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) util_min = uclamp_eff_value(p, UCLAMP_MIN); util_max = uclamp_eff_value(p, UCLAMP_MAX); - for_each_cpu_wrap(cpu, cpus, target) { + for_each_cpu_wrap(cpu, cpus, target + 1) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (util_fits_cpu(task_util, util_min, util_max, cpu)) + + fits = util_fits_cpu(task_util, util_min, util_max, cpu); + + /* This CPU fits with all requirements */ + if (fits > 0) return cpu; + /* + * Only the min performance hint (i.e. uclamp_min) doesn't fit. + * Look for the CPU with best capacity. + */ + else if (fits < 0) + cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); - if (cpu_cap > best_cap) { + /* + * First, select CPU which fits better (-1 being better than 0). + * Then, select the one with best capacity at same level. + */ + if ((fits < best_fits) || + ((fits == best_fits) && (cpu_cap > best_cap))) { best_cap = cpu_cap; best_cpu = cpu; + best_fits = fits; } } @@ -6834,7 +6989,11 @@ static inline bool asym_fits_cpu(unsigned long util, int cpu) { if (sched_asym_cpucap_active()) - return util_fits_cpu(util, util_min, util_max, cpu); + /* + * Return true only if the cpu fully fits the task requirements + * which include the utilization and the performance hints. + */ + return (util_fits_cpu(util, util_min, util_max, cpu) > 0); return true; } @@ -7201,6 +7360,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; + int prev_fits = -1, best_fits = -1; + unsigned long best_thermal_cap = 0; + unsigned long prev_thermal_cap = 0; struct sched_domain *sd; struct perf_domain *pd; struct energy_env eenv; @@ -7236,6 +7398,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) unsigned long prev_spare_cap = 0; int max_spare_cap_cpu = -1; unsigned long base_energy; + int fits, max_fits = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7285,7 +7448,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) util_min = max(rq_util_min, p_util_min); util_max = max(rq_util_max, p_util_max); } - if (!util_fits_cpu(util, util_min, util_max, cpu)) + + fits = util_fits_cpu(util, util_min, util_max, cpu); + if (!fits) continue; lsub_positive(&cpu_cap, util); @@ -7293,7 +7458,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (cpu == prev_cpu) { /* Always use prev_cpu as a candidate. */ prev_spare_cap = cpu_cap; - } else if (cpu_cap > max_spare_cap) { + prev_fits = fits; + } else if ((fits > max_fits) || + ((fits == max_fits) && (cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7301,6 +7468,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) */ max_spare_cap = cpu_cap; max_spare_cap_cpu = cpu; + max_fits = fits; } } @@ -7319,26 +7487,50 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) if (prev_delta < base_energy) goto unlock; prev_delta -= base_energy; + prev_thermal_cap = cpu_thermal_cap; best_delta = min(best_delta, prev_delta); } /* Evaluate the energy impact of using max_spare_cap_cpu. */ if (max_spare_cap_cpu >= 0 && max_spare_cap > prev_spare_cap) { + /* Current best energy cpu fits better */ + if (max_fits < best_fits) + continue; + + /* + * Both don't fit performance hint (i.e. uclamp_min) + * but best energy cpu has better capacity. + */ + if ((max_fits < 0) && + (cpu_thermal_cap <= best_thermal_cap)) + continue; + cur_delta = compute_energy(&eenv, pd, cpus, p, max_spare_cap_cpu); /* CPU utilization has changed */ if (cur_delta < base_energy) goto unlock; cur_delta -= base_energy; - if (cur_delta < best_delta) { - best_delta = cur_delta; - best_energy_cpu = max_spare_cap_cpu; - } + + /* + * Both fit for the task but best energy cpu has lower + * energy impact. + */ + if ((max_fits > 0) && (best_fits > 0) && + (cur_delta >= best_delta)) + continue; + + best_delta = cur_delta; + best_energy_cpu = max_spare_cap_cpu; + best_fits = max_fits; + best_thermal_cap = cpu_thermal_cap; } } rcu_read_unlock(); - if (best_delta < prev_delta) + if ((best_fits > prev_fits) || + ((best_fits > 0) && (best_delta < prev_delta)) || + ((best_fits < 0) && (best_thermal_cap > prev_thermal_cap))) target = best_energy_cpu; return target; @@ -8838,82 +9030,16 @@ static unsigned long scale_rt_capacity(int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu) { - unsigned long capacity_orig = arch_scale_cpu_capacity(cpu); unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - struct rq *rq = cpu_rq(cpu); - rq->cpu_capacity_orig = capacity_orig; + cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); if (!capacity) capacity = 1; - rq->cpu_capacity = capacity; - - /* - * Detect if the performance domain is in capacity inversion state. - * - * Capacity inversion happens when another perf domain with equal or - * lower capacity_orig_of() ends up having higher capacity than this - * domain after subtracting thermal pressure. - * - * We only take into account thermal pressure in this detection as it's - * the only metric that actually results in *real* reduction of - * capacity due to performance points (OPPs) being dropped/become - * unreachable due to thermal throttling. - * - * We assume: - * * That all cpus in a perf domain have the same capacity_orig - * (same uArch). - * * Thermal pressure will impact all cpus in this perf domain - * equally. - */ - if (sched_energy_enabled()) { - unsigned long inv_cap = capacity_orig - thermal_load_avg(rq); - struct perf_domain *pd; - - rcu_read_lock(); - - pd = rcu_dereference(rq->rd->pd); - rq->cpu_capacity_inverted = 0; - - for (; pd; pd = pd->next) { - struct cpumask *pd_span = perf_domain_span(pd); - unsigned long pd_cap_orig, pd_cap; - - /* We can't be inverted against our own pd */ - if (cpumask_test_cpu(cpu_of(rq), pd_span)) - continue; - - cpu = cpumask_any(pd_span); - pd_cap_orig = arch_scale_cpu_capacity(cpu); - - if (capacity_orig < pd_cap_orig) - continue; - - /* - * handle the case of multiple perf domains have the - * same capacity_orig but one of them is under higher - * thermal pressure. We record it as capacity - * inversion. - */ - if (capacity_orig == pd_cap_orig) { - pd_cap = pd_cap_orig - thermal_load_avg(cpu_rq(cpu)); - - if (pd_cap > inv_cap) { - rq->cpu_capacity_inverted = inv_cap; - break; - } - } else if (pd_cap_orig > inv_cap) { - rq->cpu_capacity_inverted = inv_cap; - break; - } - } - - rcu_read_unlock(); - } - - trace_sched_cpu_capacity_tp(rq); + cpu_rq(cpu)->cpu_capacity = capacity; + trace_sched_cpu_capacity_tp(cpu_rq(cpu)); sdg->sgc->capacity = capacity; sdg->sgc->min_capacity = capacity; @@ -10141,24 +10267,23 @@ static struct sched_group *find_busiest_group(struct lb_env *env) */ update_sd_lb_stats(env, &sds); - if (sched_energy_enabled()) { - struct root_domain *rd = env->dst_rq->rd; - - if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) - goto out_balanced; - } - - local = &sds.local_stat; - busiest = &sds.busiest_stat; - /* There is no busy sibling group to pull tasks from */ if (!sds.busiest) goto out_balanced; + busiest = &sds.busiest_stat; + /* Misfit tasks should be dealt with regardless of the avg load */ if (busiest->group_type == group_misfit_task) goto force_balance; + if (sched_energy_enabled()) { + struct root_domain *rd = env->dst_rq->rd; + + if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) + goto out_balanced; + } + /* ASYM feature bypasses nice load balance check */ if (busiest->group_type == group_asym_packing) goto force_balance; @@ -10171,6 +10296,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; + local = &sds.local_stat; /* * If the local group is busier than the selected busiest group * don't try and pull any tasks. @@ -11734,7 +11860,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) /* * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. */ -static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle) +static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, + bool forceidle) { for_each_sched_entity(se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -11759,11 +11886,12 @@ void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi) se_fi_update(se, rq->core->core_forceidle_seq, in_fi); } -bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi) +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool in_fi) { struct rq *rq = task_rq(a); - struct sched_entity *sea = &a->se; - struct sched_entity *seb = &b->se; + const struct sched_entity *sea = &a->se; + const struct sched_entity *seb = &b->se; struct cfs_rq *cfs_rqa; struct cfs_rq *cfs_rqb; s64 delta; @@ -12480,6 +12608,11 @@ __init void init_sched_fair_class(void) for_each_possible_cpu(i) { zalloc_cpumask_var_node(&per_cpu(load_balance_mask, i), GFP_KERNEL, cpu_to_node(i)); zalloc_cpumask_var_node(&per_cpu(select_rq_mask, i), GFP_KERNEL, cpu_to_node(i)); + +#ifdef CONFIG_CFS_BANDWIDTH + INIT_CSD(&cpu_rq(i)->cfsb_csd, __cfsb_csd_unthrottle, cpu_rq(i)); + INIT_LIST_HEAD(&cpu_rq(i)->cfsb_csd_list); +#endif } open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f26ab2675f7d..e9ef66be2870 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -51,18 +51,22 @@ __setup("hlt", cpu_idle_nopoll_setup); static noinline int __cpuidle cpu_idle_poll(void) { + instrumentation_begin(); trace_cpu_idle(0, smp_processor_id()); stop_critical_timings(); - ct_idle_enter(); - local_irq_enable(); + ct_cpuidle_enter(); + raw_local_irq_enable(); while (!tif_need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired())) cpu_relax(); + raw_local_irq_disable(); - ct_idle_exit(); + ct_cpuidle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); + local_irq_enable(); + instrumentation_end(); return 1; } @@ -75,7 +79,6 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - raw_local_irq_enable(); } /** @@ -85,44 +88,20 @@ void __weak arch_cpu_idle(void) */ void __cpuidle default_idle_call(void) { - if (current_clr_polling_and_test()) { - local_irq_enable(); - } else { - + instrumentation_begin(); + if (!current_clr_polling_and_test()) { trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); - /* - * arch_cpu_idle() is supposed to enable IRQs, however - * we can't do that because of RCU and tracing. - * - * Trace IRQs enable here, then switch off RCU, and have - * arch_cpu_idle() use raw_local_irq_enable(). Note that - * ct_idle_enter() relies on lockdep IRQ state, so switch that - * last -- this is very similar to the entry code. - */ - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(); - ct_idle_enter(); - lockdep_hardirqs_on(_THIS_IP_); - + ct_cpuidle_enter(); arch_cpu_idle(); - - /* - * OK, so IRQs are enabled here, but RCU needs them disabled to - * turn itself back on.. funny thing is that disabling IRQs - * will cause tracing, which needs RCU. Jump through hoops to - * make it 'work'. - */ - raw_local_irq_disable(); - lockdep_hardirqs_off(_THIS_IP_); - ct_idle_exit(); - lockdep_hardirqs_on(_THIS_IP_); - raw_local_irq_enable(); + ct_cpuidle_exit(); start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } + local_irq_enable(); + instrumentation_end(); } static int call_cpuidle_s2idle(struct cpuidle_driver *drv, diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 0c5be7ebb1dc..2ad881d07752 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -159,7 +159,8 @@ | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ - | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK) + | MEMBARRIER_PRIVATE_EXPEDITED_RSEQ_BITMASK \ + | MEMBARRIER_CMD_GET_REGISTRATIONS) static void ipi_mb(void *info) { @@ -540,6 +541,40 @@ static int membarrier_register_private_expedited(int flags) return 0; } +static int membarrier_get_registrations(void) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + int registrations_mask = 0, membarrier_state, i; + static const int states[] = { + MEMBARRIER_STATE_GLOBAL_EXPEDITED | + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY, + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ | + MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY + }; + static const int registration_cmds[] = { + MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, + MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ + }; + BUILD_BUG_ON(ARRAY_SIZE(states) != ARRAY_SIZE(registration_cmds)); + + membarrier_state = atomic_read(&mm->membarrier_state); + for (i = 0; i < ARRAY_SIZE(states); ++i) { + if (membarrier_state & states[i]) { + registrations_mask |= registration_cmds[i]; + membarrier_state &= ~states[i]; + } + } + WARN_ON_ONCE(membarrier_state != 0); + return registrations_mask; +} + /** * sys_membarrier - issue memory barriers on a set of threads * @cmd: Takes command values defined in enum membarrier_cmd. @@ -623,6 +658,8 @@ SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id) return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ: return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ); + case MEMBARRIER_CMD_GET_REGISTRATIONS: + return membarrier_get_registrations(); default: return -EINVAL; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ed2a47e4ddae..0a11f44adee5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1777,6 +1777,8 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq) BUG_ON(idx >= MAX_RT_PRIO); queue = array->queue + idx; + if (SCHED_WARN_ON(list_empty(queue))) + return NULL; next = list_entry(queue->next, struct sched_rt_entity, run_list); return next; @@ -1789,7 +1791,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) do { rt_se = pick_next_rt_entity(rt_rq); - BUG_ON(!rt_se); + if (unlikely(!rt_se)) + return NULL; rt_rq = group_rt_rq(rt_se); } while (rt_rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 771f8ddb7053..3e8df6d31c1e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -248,7 +248,7 @@ static inline void update_avg(u64 *avg, u64 sample) #define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV) -static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +static inline bool dl_entity_is_special(const struct sched_dl_entity *dl_se) { #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); @@ -260,8 +260,8 @@ static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) /* * Tells if entity @a should preempt entity @b. */ -static inline bool -dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) +static inline bool dl_entity_preempt(const struct sched_dl_entity *a, + const struct sched_dl_entity *b) { return dl_entity_is_special(a) || dl_time_before(a->deadline, b->deadline); @@ -645,6 +645,9 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; +#ifdef CONFIG_SMP + struct list_head throttled_csd_list; +#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -1041,7 +1044,6 @@ struct rq { unsigned long cpu_capacity; unsigned long cpu_capacity_orig; - unsigned long cpu_capacity_inverted; struct balance_callback *balance_callback; @@ -1154,6 +1156,11 @@ struct rq { /* Scratch cpumask to be temporarily used under rq_lock */ cpumask_var_t scratch_mask; + +#if defined(CONFIG_CFS_BANDWIDTH) && defined(CONFIG_SMP) + call_single_data_t cfsb_csd; + struct list_head cfsb_csd_list; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1236,7 +1243,8 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq) return &rq->__lock; } -bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool fi); +bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, + bool fi); /* * Helpers to check if the CPU's core cookie matches with the task's cookie @@ -1415,7 +1423,7 @@ static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) } /* runqueue on which this entity is (to be) queued */ -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se) { return se->cfs_rq; } @@ -1428,19 +1436,16 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) #else -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} +#define task_of(_se) container_of(_se, struct task_struct, se) -static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +static inline struct cfs_rq *task_cfs_rq(const struct task_struct *p) { return &task_rq(p)->cfs; } -static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +static inline struct cfs_rq *cfs_rq_of(const struct sched_entity *se) { - struct task_struct *p = task_of(se); + const struct task_struct *p = task_of(se); struct rq *rq = task_rq(p); return &rq->cfs; @@ -2893,24 +2898,6 @@ static inline unsigned long capacity_orig_of(int cpu) return cpu_rq(cpu)->cpu_capacity_orig; } -/* - * Returns inverted capacity if the CPU is in capacity inversion state. - * 0 otherwise. - * - * Capacity inversion detection only considers thermal impact where actual - * performance points (OPPs) gets dropped. - * - * Capacity inversion state happens when another performance domain that has - * equal or lower capacity_orig_of() becomes effectively larger than the perf - * domain this CPU belongs to due to thermal pressure throttling it hard. - * - * See comment in update_cpu_capacity(). - */ -static inline unsigned long cpu_in_capacity_inversion(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_inverted; -} - /** * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -3261,4 +3248,62 @@ static inline void update_current_exec_runtime(struct task_struct *curr, cgroup_account_cputime(curr, delta_exec); } +#ifdef CONFIG_SCHED_MM_CID +static inline int __mm_cid_get(struct mm_struct *mm) +{ + struct cpumask *cpumask; + int cid; + + cpumask = mm_cidmask(mm); + cid = cpumask_first_zero(cpumask); + if (cid >= nr_cpu_ids) + return -1; + __cpumask_set_cpu(cid, cpumask); + return cid; +} + +static inline void mm_cid_put(struct mm_struct *mm, int cid) +{ + lockdep_assert_irqs_disabled(); + if (cid < 0) + return; + raw_spin_lock(&mm->cid_lock); + __cpumask_clear_cpu(cid, mm_cidmask(mm)); + raw_spin_unlock(&mm->cid_lock); +} + +static inline int mm_cid_get(struct mm_struct *mm) +{ + int ret; + + lockdep_assert_irqs_disabled(); + raw_spin_lock(&mm->cid_lock); + ret = __mm_cid_get(mm); + raw_spin_unlock(&mm->cid_lock); + return ret; +} + +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) +{ + if (prev->mm_cid_active) { + if (next->mm_cid_active && next->mm == prev->mm) { + /* + * Context switch between threads in same mm, hand over + * the mm_cid from prev to next. + */ + next->mm_cid = prev->mm_cid; + prev->mm_cid = -1; + return; + } + mm_cid_put(prev->mm, prev->mm_cid); + prev->mm_cid = -1; + } + if (next->mm_cid_active) + next->mm_cid = mm_cid_get(next->mm); +} + +#else +static inline void switch_mm_cid(struct task_struct *prev, struct task_struct *next) { } +#endif + #endif /* _KERNEL_SCHED_SCHED_H */ diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8739c2a5a54e..d93c3379e901 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -578,7 +578,7 @@ out: */ struct root_domain def_root_domain; -void init_defrootdomain(void) +void __init init_defrootdomain(void) { init_rootdomain(&def_root_domain); @@ -2451,7 +2451,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) * Set up scheduler domains and groups. For now this just excludes isolated * CPUs, but could be used to exclude other special cases in the future. */ -int sched_init_domains(const struct cpumask *cpu_map) +int __init sched_init_domains(const struct cpumask *cpu_map) { int err; |