author     Linus Torvalds <torvalds@linux-foundation.org>  2024-11-19 23:16:06 +0100
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-11-19 23:16:06 +0100
commit     3f020399e4f1c690ce87b4c472f75b1fc89e07d5 (patch)
tree       d6946d266768a5c95b1c82c060ca38af4782ab03 /kernel/sched
parent     Merge tag 'perf-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel... (diff)
parent     sched, x86: Update the comment for TIF_NEED_RESCHED_LAZY. (diff)
download   linux-3f020399e4f1c690ce87b4c472f75b1fc89e07d5.tar.xz
           linux-3f020399e4f1c690ce87b4c472f75b1fc89e07d5.zip
Merge tag 'sched-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 "Core facilities:

   - Add the "Lazy preemption" model (CONFIG_PREEMPT_LAZY=y), which
     optimizes fair-class preemption by delaying preemption requests to
     the tick boundary, while working as full preemption for
     RR/FIFO/DEADLINE classes. (Peter Zijlstra)

       - x86: Enable Lazy preemption (Peter Zijlstra)
       - riscv: Enable Lazy preemption (Jisheng Zhang)

   - Initialize idle tasks only once (Thomas Gleixner)

   - sched/ext: Remove sched_fork() hack (Thomas Gleixner)

  Fair scheduler:

   - Optimize the PLACE_LAG when se->vlag is zero (Huang Shijie)

  Idle loop:

   - Optimize the generic idle loop by removing unnecessary memory
     barrier (Zhongqiu Han)

  RSEQ:

   - Improve cache locality of RSEQ concurrency IDs for intermittent
     workloads (Mathieu Desnoyers)

  Waitqueues:

   - Make wake_up_{bit,var} less fragile (Neil Brown)

  PSI:

   - Pass enqueue/dequeue flags to psi callbacks directly (Johannes
     Weiner)

  Preparatory patches for proxy execution:

   - Add move_queued_task_locked helper (Connor O'Brien)
   - Consolidate pick_*_task to task_is_pushable helper (Connor O'Brien)
   - Split out __schedule() deactivate task logic into a helper (John Stultz)
   - Split scheduler and execution contexts (Peter Zijlstra)
   - Make mutex::wait_lock irq safe (Juri Lelli)
   - Expose __mutex_owner() (Juri Lelli)
   - Remove wakeups from under mutex::wait_lock (Peter Zijlstra)

  Misc fixes and cleanups:

   - Remove unused __HAVE_THREAD_FUNCTIONS hook support (David Disseldorp)
   - Update the comment for TIF_NEED_RESCHED_LAZY (Sebastian Andrzej Siewior)
   - Remove unused bit_wait_io_timeout (Dr. David Alan Gilbert)
   - remove the DOUBLE_TICK feature (Huang Shijie)
   - fix the comment for PREEMPT_SHORT (Huang Shijie)
   - Fix unnused variable warning (Christian Loehle)
   - No PREEMPT_RT=y for all{yes,mod}config"

* tag 'sched-core-2024-11-18' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  sched, x86: Update the comment for TIF_NEED_RESCHED_LAZY.
  sched: No PREEMPT_RT=y for all{yes,mod}config
  riscv: add PREEMPT_LAZY support
  sched, x86: Enable Lazy preemption
  sched: Enable PREEMPT_DYNAMIC for PREEMPT_RT
  sched: Add Lazy preemption model
  sched: Add TIF_NEED_RESCHED_LAZY infrastructure
  sched/ext: Remove sched_fork() hack
  sched: Initialize idle tasks only once
  sched: psi: pass enqueue/dequeue flags to psi callbacks directly
  sched/uclamp: Fix unnused variable warning
  sched: Split scheduler and execution contexts
  sched: Split out __schedule() deactivate task logic into a helper
  sched: Consolidate pick_*_task to task_is_pushable helper
  sched: Add move_queued_task_locked helper
  locking/mutex: Expose __mutex_owner()
  locking/mutex: Make mutex::wait_lock irq safe
  locking/mutex: Remove wakeups from under mutex::wait_lock
  sched: Improve cache locality of RSEQ concurrency IDs for intermittent workloads
  sched: idle: Optimize the generic idle loop by removing needless memory barrier
  ...
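Editor's illustration, not part of the pull request: the lazy model keeps RR/FIFO/DEADLINE preemption immediate, while fair-class preemption requests are parked in TIF_NEED_RESCHED_LAZY and only promoted to a real TIF_NEED_RESCHED at the next tick. The small user-space sketch below models that flow; the flag names mirror the series, but the task structure, resched/tick helpers and main() are invented scaffolding purely for illustration.

/*
 * Toy user-space model of the lazy-preemption request flow.  Only the
 * TIF_NEED_RESCHED / TIF_NEED_RESCHED_LAZY names come from the series;
 * everything else here is made up for illustration.
 */
#include <stdbool.h>
#include <stdio.h>

enum { TIF_NEED_RESCHED, TIF_NEED_RESCHED_LAZY };

struct toy_task {
	const char *name;
	bool rt;		/* RR/FIFO/DEADLINE-like class */
	unsigned int flags;
};

/* Fair tasks only get the lazy bit; RT-like tasks are preempted at once. */
static void toy_resched(struct toy_task *t)
{
	t->flags |= 1u << (t->rt ? TIF_NEED_RESCHED : TIF_NEED_RESCHED_LAZY);
}

/* The tick promotes a pending lazy request into a real reschedule. */
static void toy_tick(struct toy_task *t)
{
	if (t->flags & (1u << TIF_NEED_RESCHED_LAZY))
		t->flags |= 1u << TIF_NEED_RESCHED;
}

static bool toy_should_switch(const struct toy_task *t)
{
	return t->flags & (1u << TIF_NEED_RESCHED);
}

int main(void)
{
	struct toy_task fair = { "fair", false, 0 };
	struct toy_task rt   = { "rt",   true,  0 };

	toy_resched(&fair);
	toy_resched(&rt);
	printf("before tick: fair=%d rt=%d\n",
	       toy_should_switch(&fair), toy_should_switch(&rt));

	toy_tick(&fair);
	printf("after tick:  fair=%d rt=%d\n",
	       toy_should_switch(&fair), toy_should_switch(&rt));
	return 0;
}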
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c      | 289
-rw-r--r--  kernel/sched/deadline.c  |  57
-rw-r--r--  kernel/sched/debug.c     |   7
-rw-r--r--  kernel/sched/ext.c       |   7
-rw-r--r--  kernel/sched/fair.c      |  42
-rw-r--r--  kernel/sched/features.h  |   3
-rw-r--r--  kernel/sched/idle.c      |   1
-rw-r--r--  kernel/sched/pelt.c      |   2
-rw-r--r--  kernel/sched/rt.c        |  67
-rw-r--r--  kernel/sched/sched.h     | 155
-rw-r--r--  kernel/sched/stats.h     |  29
-rw-r--r--  kernel/sched/syscalls.c  |   4
-rw-r--r--  kernel/sched/wait_bit.c  |  90
13 files changed, 460 insertions, 293 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a1c353a62c56..95e40895a519 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -832,7 +832,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
rq_lock(rq, &rf);
update_rq_clock(rq);
- rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ rq->donor->sched_class->task_tick(rq, rq->curr, 1);
rq_unlock(rq, &rf);
return HRTIMER_NORESTART;
@@ -941,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq)
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- struct thread_info *ti = task_thread_info(p);
- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
}
/*
@@ -969,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p)
}
#else
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- set_tsk_need_resched(p);
+ set_ti_thread_flag(ti, tif);
return true;
}
@@ -1076,28 +1075,70 @@ void wake_up_q(struct wake_q_head *head)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
{
struct task_struct *curr = rq->curr;
+ struct thread_info *cti = task_thread_info(curr);
int cpu;
lockdep_assert_rq_held(rq);
- if (test_tsk_need_resched(curr))
+ /*
+ * Always immediately preempt the idle task; no point in delaying doing
+ * actual work.
+ */
+ if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY)
+ tif = TIF_NEED_RESCHED;
+
+ if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
return;
cpu = cpu_of(rq);
if (cpu == smp_processor_id()) {
- set_tsk_need_resched(curr);
- set_preempt_need_resched();
+ set_ti_thread_flag(cti, tif);
+ if (tif == TIF_NEED_RESCHED)
+ set_preempt_need_resched();
return;
}
- if (set_nr_and_not_polling(curr))
- smp_send_reschedule(cpu);
- else
+ if (set_nr_and_not_polling(cti, tif)) {
+ if (tif == TIF_NEED_RESCHED)
+ smp_send_reschedule(cpu);
+ } else {
trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void resched_curr(struct rq *rq)
+{
+ __resched_curr(rq, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy);
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return static_branch_unlikely(&sk_dynamic_preempt_lazy);
+}
+#else
+static __always_inline bool dynamic_preempt_lazy(void)
+{
+ return IS_ENABLED(CONFIG_PREEMPT_LAZY);
+}
+#endif
+
+static __always_inline int get_lazy_tif_bit(void)
+{
+ if (dynamic_preempt_lazy())
+ return TIF_NEED_RESCHED_LAZY;
+
+ return TIF_NEED_RESCHED;
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ __resched_curr(rq, get_lazy_tif_bit());
}
void resched_cpu(int cpu)
@@ -1192,7 +1233,7 @@ static void wake_up_idle_cpu(int cpu)
* and testing of the above solutions didn't appear to report
* much benefits.
*/
- if (set_nr_and_not_polling(rq->idle))
+ if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -1399,7 +1440,7 @@ void set_load_weight(struct task_struct *p, bool update_load)
* requests are serialized using a mutex to reduce the risk of conflicting
* updates or API abuses.
*/
-static DEFINE_MUTEX(uclamp_mutex);
+static __maybe_unused DEFINE_MUTEX(uclamp_mutex);
/* Max allowed minimum utilization */
static unsigned int __maybe_unused sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
@@ -2024,10 +2065,10 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p);
- if (!(flags & ENQUEUE_RESTORE)) {
+ psi_enqueue(p, flags);
+
+ if (!(flags & ENQUEUE_RESTORE))
sched_info_enqueue(rq, p);
- psi_enqueue(p, flags & ENQUEUE_MIGRATED);
- }
if (sched_core_enabled(rq))
sched_core_enqueue(rq, p);
@@ -2044,10 +2085,10 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
if (!(flags & DEQUEUE_NOCLOCK))
update_rq_clock(rq);
- if (!(flags & DEQUEUE_SAVE)) {
+ if (!(flags & DEQUEUE_SAVE))
sched_info_dequeue(rq, p);
- psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
- }
+
+ psi_dequeue(p, flags);
/*
* Must be before ->dequeue_task() because ->dequeue_task() can 'fail'
@@ -2135,16 +2176,18 @@ void check_class_changed(struct rq *rq, struct task_struct *p,
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
- if (p->sched_class == rq->curr->sched_class)
- rq->curr->sched_class->wakeup_preempt(rq, p, flags);
- else if (sched_class_above(p->sched_class, rq->curr->sched_class))
+ struct task_struct *donor = rq->donor;
+
+ if (p->sched_class == donor->sched_class)
+ donor->sched_class->wakeup_preempt(rq, p, flags);
+ else if (sched_class_above(p->sched_class, donor->sched_class))
resched_curr(rq);
/*
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
+ if (task_on_rq_queued(donor) && test_tsk_need_resched(rq->curr))
rq_clock_skip_update(rq);
}
@@ -2620,9 +2663,7 @@ int push_cpu_stop(void *arg)
// XXX validate p is still the highest prio task
if (task_rq(p) == rq) {
- deactivate_task(rq, p, 0);
- set_task_cpu(p, lowest_rq->cpu);
- activate_task(lowest_rq, p, 0);
+ move_queued_task_locked(rq, lowest_rq, p);
resched_curr(lowest_rq);
}
@@ -2682,7 +2723,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued) {
/*
@@ -2696,6 +2737,7 @@ __do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
put_prev_task(rq, p);
p->sched_class->set_cpus_allowed(p, ctx);
+ mm_set_cpus_allowed(p->mm, ctx->new_mask);
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
@@ -3308,9 +3350,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
rq_pin_lock(src_rq, &srf);
rq_pin_lock(dst_rq, &drf);
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, cpu);
- activate_task(dst_rq, p, 0);
+ move_queued_task_locked(src_rq, dst_rq, p);
wakeup_preempt(dst_rq, p, 0);
rq_unpin_lock(dst_rq, &drf);
@@ -4424,7 +4464,8 @@ int wake_up_state(struct task_struct *p, unsigned int state)
* Perform scheduler related setup for a newly forked process p.
* p is forked by current.
*
- * __sched_fork() is basic setup used by init_idle() too:
+ * __sched_fork() is basic setup which is also used by sched_init() to
+ * initialize the boot CPU's idle task.
*/
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
@@ -5517,7 +5558,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
- if (task_current(rq, p) && task_on_rq_queued(p)) {
+ if (task_current_donor(rq, p) && task_on_rq_queued(p)) {
prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
@@ -5585,7 +5626,8 @@ void sched_tick(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
- struct task_struct *curr;
+ /* accounting goes to the donor task */
+ struct task_struct *donor;
struct rq_flags rf;
unsigned long hw_pressure;
u64 resched_latency;
@@ -5596,19 +5638,23 @@ void sched_tick(void)
sched_clock_tick();
rq_lock(rq, &rf);
+ donor = rq->donor;
- curr = rq->curr;
- psi_account_irqtime(rq, curr, NULL);
+ psi_account_irqtime(rq, donor, NULL);
update_rq_clock(rq);
hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure);
- curr->sched_class->task_tick(rq, curr, 0);
+
+ if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY))
+ resched_curr(rq);
+
+ donor->sched_class->task_tick(rq, donor, 0);
if (sched_feat(LATENCY_WARN))
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- task_tick_mm_cid(rq, curr);
+ task_tick_mm_cid(rq, donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -5618,8 +5664,8 @@ void sched_tick(void)
perf_event_task_tick();
- if (curr->flags & PF_WQ_WORKER)
- wq_worker_tick(curr);
+ if (donor->flags & PF_WQ_WORKER)
+ wq_worker_tick(donor);
#ifdef CONFIG_SMP
if (!scx_switched_all()) {
@@ -5686,6 +5732,12 @@ static void sched_tick_remote(struct work_struct *work)
struct task_struct *curr = rq->curr;
if (cpu_online(cpu)) {
+ /*
+ * Since this is a remote tick for full dynticks mode,
+ * we are always sure that there is no proxy (only a
+ * single task is running).
+ */
+ SCHED_WARN_ON(rq->curr != rq->donor);
update_rq_clock(rq);
if (!is_idle_task(curr)) {
@@ -6309,10 +6361,7 @@ static bool try_steal_cookie(int this, int that)
if (sched_task_is_throttled(p, this))
goto next;
- deactivate_task(src, p, 0);
- set_task_cpu(p, this);
- activate_task(dst, p, 0);
-
+ move_queued_task_locked(src, dst, p);
resched_curr(dst);
success = true;
@@ -6507,6 +6556,45 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
#define SM_RTLOCK_WAIT 2
/*
+ * Helper function for __schedule()
+ *
+ * If a task does not have signals pending, deactivate it
+ * Otherwise marks the task's __state as RUNNING
+ */
+static bool try_to_block_task(struct rq *rq, struct task_struct *p,
+ unsigned long task_state)
+{
+ int flags = DEQUEUE_NOCLOCK;
+
+ if (signal_pending_state(task_state, p)) {
+ WRITE_ONCE(p->__state, TASK_RUNNING);
+ return false;
+ }
+
+ p->sched_contributes_to_load =
+ (task_state & TASK_UNINTERRUPTIBLE) &&
+ !(task_state & TASK_NOLOAD) &&
+ !(task_state & TASK_FROZEN);
+
+ if (unlikely(is_special_task_state(task_state)))
+ flags |= DEQUEUE_SPECIAL;
+
+ /*
+ * __schedule() ttwu()
+ * prev_state = prev->state; if (p->on_rq && ...)
+ * if (prev_state) goto out;
+ * p->on_rq = 0; smp_acquire__after_ctrl_dep();
+ * p->state = TASK_WAKING
+ *
+ * Where __schedule() and ttwu() have matching control dependencies.
+ *
+ * After this, schedule() must not care about p->state any more.
+ */
+ block_task(rq, p, flags);
+ return true;
+}
+
+/*
* __schedule() is the main scheduler function.
*
* The main means of driving the scheduler and thus entering this function are:
@@ -6614,37 +6702,12 @@ static void __sched notrace __schedule(int sched_mode)
goto picked;
}
} else if (!preempt && prev_state) {
- if (signal_pending_state(prev_state, prev)) {
- WRITE_ONCE(prev->__state, TASK_RUNNING);
- } else {
- int flags = DEQUEUE_NOCLOCK;
-
- prev->sched_contributes_to_load =
- (prev_state & TASK_UNINTERRUPTIBLE) &&
- !(prev_state & TASK_NOLOAD) &&
- !(prev_state & TASK_FROZEN);
-
- if (unlikely(is_special_task_state(prev_state)))
- flags |= DEQUEUE_SPECIAL;
-
- /*
- * __schedule() ttwu()
- * prev_state = prev->state; if (p->on_rq && ...)
- * if (prev_state) goto out;
- * p->on_rq = 0; smp_acquire__after_ctrl_dep();
- * p->state = TASK_WAKING
- *
- * Where __schedule() and ttwu() have matching control dependencies.
- *
- * After this, schedule() must not care about p->state any more.
- */
- block_task(rq, prev, flags);
- block = true;
- }
+ block = try_to_block_task(rq, prev, prev_state);
switch_count = &prev->nvcsw;
}
next = pick_next_task(rq, prev, &rf);
+ rq_set_donor(rq, next);
picked:
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -7151,7 +7214,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, queue_flag);
if (running)
@@ -7351,6 +7414,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- NOP
* preempt_schedule_notrace <- NOP
* irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
*
* VOLUNTARY:
* cond_resched <- __cond_resched
@@ -7358,6 +7422,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- NOP
* preempt_schedule_notrace <- NOP
* irqentry_exit_cond_resched <- NOP
+ * dynamic_preempt_lazy <- false
*
* FULL:
* cond_resched <- RET0
@@ -7365,6 +7430,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write);
* preempt_schedule <- preempt_schedule
* preempt_schedule_notrace <- preempt_schedule_notrace
* irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- false
+ *
+ * LAZY:
+ * cond_resched <- RET0
+ * might_resched <- RET0
+ * preempt_schedule <- preempt_schedule
+ * preempt_schedule_notrace <- preempt_schedule_notrace
+ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched
+ * dynamic_preempt_lazy <- true
*/
enum {
@@ -7372,30 +7446,41 @@ enum {
preempt_dynamic_none,
preempt_dynamic_voluntary,
preempt_dynamic_full,
+ preempt_dynamic_lazy,
};
int preempt_dynamic_mode = preempt_dynamic_undefined;
int sched_dynamic_mode(const char *str)
{
+#ifndef CONFIG_PREEMPT_RT
if (!strcmp(str, "none"))
return preempt_dynamic_none;
if (!strcmp(str, "voluntary"))
return preempt_dynamic_voluntary;
+#endif
if (!strcmp(str, "full"))
return preempt_dynamic_full;
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+ if (!strcmp(str, "lazy"))
+ return preempt_dynamic_lazy;
+#endif
+
return -EINVAL;
}
+#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key)
+#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key)
+
#if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
#define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled)
#define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled)
#elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
-#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key)
-#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key)
+#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f)
+#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f)
#else
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
@@ -7415,6 +7500,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
switch (mode) {
case preempt_dynamic_none:
@@ -7424,6 +7510,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: none\n");
break;
@@ -7435,6 +7522,7 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: voluntary\n");
break;
@@ -7446,9 +7534,22 @@ static void __sched_dynamic_update(int mode)
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_disable(preempt_lazy);
if (mode != preempt_dynamic_mode)
pr_info("Dynamic Preempt: full\n");
break;
+
+ case preempt_dynamic_lazy:
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
+ preempt_dynamic_disable(might_resched);
+ preempt_dynamic_enable(preempt_schedule);
+ preempt_dynamic_enable(preempt_schedule_notrace);
+ preempt_dynamic_enable(irqentry_exit_cond_resched);
+ preempt_dynamic_key_enable(preempt_lazy);
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: lazy\n");
+ break;
}
preempt_dynamic_mode = mode;
@@ -7511,6 +7612,8 @@ static void __init preempt_dynamic_init(void)
sched_dynamic_update(preempt_dynamic_none);
} else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
sched_dynamic_update(preempt_dynamic_voluntary);
+ } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) {
+ sched_dynamic_update(preempt_dynamic_lazy);
} else {
/* Default static call setting, nothing to do */
WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
@@ -7531,6 +7634,7 @@ static void __init preempt_dynamic_init(void)
PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
PREEMPT_MODEL_ACCESSOR(full);
+PREEMPT_MODEL_ACCESSOR(lazy);
#else /* !CONFIG_PREEMPT_DYNAMIC: */
@@ -7683,8 +7787,6 @@ void __init init_idle(struct task_struct *idle, int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- __sched_fork(0, idle);
-
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_rq_lock(rq);
@@ -7699,10 +7801,8 @@ void __init init_idle(struct task_struct *idle, int cpu)
#ifdef CONFIG_SMP
/*
- * It's possible that init_idle() gets called multiple times on a task,
- * in that case do_set_cpus_allowed() will not do the right thing.
- *
- * And since this is boot we can forgo the serialization.
+ * No validation and serialization required at boot time and for
+ * setting up the idle tasks of not yet online CPUs.
*/
set_cpus_allowed_common(idle, &ac);
#endif
@@ -7721,6 +7821,7 @@ void __init init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->idle = idle;
+ rq_set_donor(rq, idle);
rcu_assign_pointer(rq->curr, idle);
idle->on_rq = TASK_ON_RQ_QUEUED;
#ifdef CONFIG_SMP
@@ -7810,7 +7911,7 @@ void sched_setnuma(struct task_struct *p, int nid)
rq = task_rq_lock(p, &rf);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
@@ -8546,6 +8647,7 @@ void __init sched_init(void)
* but because we are the idle thread, we just pick up running again
* when this runqueue becomes "idle".
*/
+ __sched_fork(0, current);
init_idle(current, smp_processor_id());
calc_load_update = jiffies + LOAD_FREQ;
@@ -8960,7 +9062,7 @@ void sched_move_task(struct task_struct *tsk)
update_rq_clock(rq);
- running = task_current(rq, tsk);
+ running = task_current_donor(rq, tsk);
queued = task_on_rq_queued(tsk);
if (queued)
@@ -10253,6 +10355,7 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
*/
if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
return -1;
+ WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
return src_cid;
}
@@ -10265,7 +10368,8 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
{
struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
struct mm_struct *mm = t->mm;
- int src_cid, dst_cid, src_cpu;
+ int src_cid, src_cpu;
+ bool dst_cid_is_set;
struct rq *src_rq;
lockdep_assert_rq_held(dst_rq);
@@ -10282,9 +10386,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
* allocation closest to 0 in cases where few threads migrate around
* many CPUs.
*
- * If destination cid is already set, we may have to just clear
- * the src cid to ensure compactness in frequent migrations
- * scenarios.
+ * If destination cid or recent cid is already set, we may have
+ * to just clear the src cid to ensure compactness in frequent
+ * migrations scenarios.
*
* It is not useful to clear the src cid when the number of threads is
* greater or equal to the number of allowed CPUs, because user-space
@@ -10292,9 +10396,9 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
* allowed CPUs.
*/
dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
- dst_cid = READ_ONCE(dst_pcpu_cid->cid);
- if (!mm_cid_is_unset(dst_cid) &&
- atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+ dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
+ !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
+ if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
return;
src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
src_rq = cpu_rq(src_cpu);
@@ -10305,13 +10409,14 @@ void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
src_cid);
if (src_cid == -1)
return;
- if (!mm_cid_is_unset(dst_cid)) {
+ if (dst_cid_is_set) {
__mm_cid_put(mm, src_cid);
return;
}
/* Move src_cid to dst cpu. */
mm_cid_snapshot_time(dst_rq, mm);
WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+ WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
}
static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
@@ -10550,7 +10655,7 @@ void sched_mm_cid_after_execve(struct task_struct *t)
* Matches barrier in sched_mm_cid_remote_clear_old().
*/
smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm);
+ t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
}
rseq_set_notify_resume(t);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index be1b917dc8ce..d9d5a702f1a6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1339,7 +1339,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
#endif
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (dl_task(rq->curr))
+ if (dl_task(rq->donor))
wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
@@ -1736,11 +1736,11 @@ int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 perio
*/
static void update_curr_dl(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
- struct sched_dl_entity *dl_se = &curr->dl;
+ struct task_struct *donor = rq->donor;
+ struct sched_dl_entity *dl_se = &donor->dl;
s64 delta_exec;
- if (!dl_task(curr) || !on_dl_rq(dl_se))
+ if (!dl_task(donor) || !on_dl_rq(dl_se))
return;
/*
@@ -2213,7 +2213,7 @@ static int find_later_rq(struct task_struct *task);
static int
select_task_rq_dl(struct task_struct *p, int cpu, int flags)
{
- struct task_struct *curr;
+ struct task_struct *curr, *donor;
bool select_rq;
struct rq *rq;
@@ -2224,6 +2224,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
+ donor = READ_ONCE(rq->donor);
/*
* If we are dealing with a -deadline task, we must
@@ -2234,9 +2235,9 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags)
* other hand, if it has a shorter deadline, we
* try to make it stay here, it might be important.
*/
- select_rq = unlikely(dl_task(curr)) &&
+ select_rq = unlikely(dl_task(donor)) &&
(curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &curr->dl)) &&
+ !dl_entity_preempt(&p->dl, &donor->dl)) &&
p->nr_cpus_allowed > 1;
/*
@@ -2299,7 +2300,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
+ !cpudl_find(&rq->rd->cpudl, rq->donor, NULL))
return;
/*
@@ -2338,7 +2339,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
int flags)
{
- if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
+ if (dl_entity_preempt(&p->dl, &rq->donor->dl)) {
resched_curr(rq);
return;
}
@@ -2348,7 +2349,7 @@ static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p,
* In the unlikely case current and p have the same deadline
* let us try to decide what's the best thing to do...
*/
- if ((p->dl.deadline == rq->curr->dl.deadline) &&
+ if ((p->dl.deadline == rq->donor->dl.deadline) &&
!test_tsk_need_resched(rq->curr))
check_preempt_equal_dl(rq, p);
#endif /* CONFIG_SMP */
@@ -2380,7 +2381,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
if (!first)
return;
- if (rq->curr->sched_class != &dl_sched_class)
+ if (rq->donor->sched_class != &dl_sched_class)
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
deadline_queue_push_tasks(rq);
@@ -2487,14 +2488,6 @@ static void task_fork_dl(struct task_struct *p)
/* Only try algorithms three times */
#define DL_MAX_TRIES 3
-static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
-{
- if (!task_on_cpu(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_mask))
- return 1;
- return 0;
-}
-
/*
* Return the earliest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise:
@@ -2513,7 +2506,7 @@ next_node:
if (next_node) {
p = __node_2_pdl(next_node);
- if (pick_dl_task(rq, p, cpu))
+ if (task_is_pushable(rq, p, cpu))
return p;
next_node = rb_next(next_node);
@@ -2707,8 +2700,8 @@ retry:
* can move away, it makes sense to just reschedule
* without going further in pushing next_task.
*/
- if (dl_task(rq->curr) &&
- dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
+ if (dl_task(rq->donor) &&
+ dl_time_before(next_task->dl.deadline, rq->donor->dl.deadline) &&
rq->curr->nr_cpus_allowed > 1) {
resched_curr(rq);
return 0;
@@ -2751,9 +2744,7 @@ retry:
goto retry;
}
- deactivate_task(rq, next_task, 0);
- set_task_cpu(next_task, later_rq->cpu);
- activate_task(later_rq, next_task, 0);
+ move_queued_task_locked(rq, later_rq, next_task);
ret = 1;
resched_curr(later_rq);
@@ -2833,15 +2824,13 @@ static void pull_dl_task(struct rq *this_rq)
* deadline than the current task of its runqueue.
*/
if (dl_time_before(p->dl.deadline,
- src_rq->curr->dl.deadline))
+ src_rq->donor->dl.deadline))
goto skip;
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ move_queued_task_locked(src_rq, this_rq, p);
dmin = p->dl.deadline;
resched = true;
}
@@ -2874,9 +2863,9 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
if (!task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
- dl_task(rq->curr) &&
+ dl_task(rq->donor) &&
(rq->curr->nr_cpus_allowed < 2 ||
- !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->donor->dl))) {
push_dl_tasks(rq);
}
}
@@ -3051,12 +3040,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
- if (rq->curr != p) {
+ if (rq->donor != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
deadline_queue_push_tasks(rq);
#endif
- if (dl_task(rq->curr))
+ if (dl_task(rq->donor))
wakeup_preempt_dl(rq, p, 0);
else
resched_curr(rq);
@@ -3085,7 +3074,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
if (!rq->dl.overloaded)
deadline_queue_pull_task(rq);
- if (task_current(rq, p)) {
+ if (task_current_donor(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index f4035c7a0fa1..a48b2a701ec2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
static int sched_dynamic_show(struct seq_file *m, void *v)
{
static const char * preempt_modes[] = {
- "none", "voluntary", "full"
+ "none", "voluntary", "full", "lazy",
};
- int i;
+ int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY);
+ int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2;
- for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
+ for (; i < j; i++) {
if (preempt_dynamic_mode == i)
seq_puts(m, "(");
seq_puts(m, preempt_modes[i]);
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 751d73d500e5..ecb88c528544 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -3567,12 +3567,7 @@ static void scx_ops_exit_task(struct task_struct *p)
void init_scx_entity(struct sched_ext_entity *scx)
{
- /*
- * init_idle() calls this function again after fork sequence is
- * complete. Don't touch ->tasks_node as it's already linked.
- */
- memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node));
-
+ memset(scx, 0, sizeof(*scx));
INIT_LIST_HEAD(&scx->dsq_list.node);
RB_CLEAR_NODE(&scx->dsq_priq);
scx->sticky_cpu = -1;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2d16c8545c71..fbdca89c677f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1200,12 +1200,12 @@ static inline bool do_preempt_short(struct cfs_rq *cfs_rq,
*/
s64 update_curr_common(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *donor = rq->donor;
s64 delta_exec;
- delta_exec = update_curr_se(rq, &curr->se);
+ delta_exec = update_curr_se(rq, &donor->se);
if (likely(delta_exec > 0))
- update_curr_task(curr, delta_exec);
+ update_curr_task(donor, delta_exec);
return delta_exec;
}
@@ -1251,14 +1251,14 @@ static void update_curr(struct cfs_rq *cfs_rq)
return;
if (resched || did_preempt_short(cfs_rq, curr)) {
- resched_curr(rq);
+ resched_curr_lazy(rq);
clear_buddies(cfs_rq, curr);
}
}
static void update_curr_fair(struct rq *rq)
{
- update_curr(cfs_rq_of(&rq->curr->se));
+ update_curr(cfs_rq_of(&rq->donor->se));
}
static inline void
@@ -5280,7 +5280,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*
* EEVDF: placement strategy #1 / #2
*/
- if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) {
+ if (sched_feat(PLACE_LAG) && cfs_rq->nr_running && se->vlag) {
struct sched_entity *curr = cfs_rq->curr;
unsigned long load;
@@ -5678,15 +5678,9 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
- /*
- * don't let the period tick interfere with the hrtick preemption
- */
- if (!sched_feat(DOUBLE_TICK) &&
- hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
- return;
#endif
}
@@ -6822,7 +6816,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
s64 delta = slice - ran;
if (delta < 0) {
- if (task_current(rq, p))
+ if (task_current_donor(rq, p))
resched_curr(rq);
return;
}
@@ -6837,12 +6831,12 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
*/
static void hrtick_update(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *donor = rq->donor;
- if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
+ if (!hrtick_enabled_fair(rq) || donor->sched_class != &fair_sched_class)
return;
- hrtick_start_fair(rq, curr);
+ hrtick_start_fair(rq, donor);
}
#else /* !CONFIG_SCHED_HRTICK */
static inline void
@@ -8763,9 +8757,9 @@ static void set_next_buddy(struct sched_entity *se)
*/
static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags)
{
- struct task_struct *curr = rq->curr;
- struct sched_entity *se = &curr->se, *pse = &p->se;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct task_struct *donor = rq->donor;
+ struct sched_entity *se = &donor->se, *pse = &p->se;
+ struct cfs_rq *cfs_rq = task_cfs_rq(donor);
int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
@@ -8794,7 +8788,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
* prevents us from potentially nominating it as a false LAST_BUDDY
* below.
*/
- if (test_tsk_need_resched(curr))
+ if (test_tsk_need_resched(rq->curr))
return;
if (!sched_feat(WAKEUP_PREEMPTION))
@@ -8842,7 +8836,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int
return;
preempt:
- resched_curr(rq);
+ resched_curr_lazy(rq);
}
static struct task_struct *pick_task_fair(struct rq *rq)
@@ -13093,7 +13087,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (task_current(rq, p)) {
+ if (task_current_donor(rq, p)) {
if (p->prio > oldprio)
resched_curr(rq);
} else
@@ -13200,7 +13194,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (task_current(rq, p))
+ if (task_current_donor(rq, p))
resched_curr(rq);
else
wakeup_preempt(rq, p, 0);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 290874079f60..a3d331dd2d8f 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -19,7 +19,7 @@ SCHED_FEAT(PLACE_REL_DEADLINE, true)
*/
SCHED_FEAT(RUN_TO_PARITY, true)
/*
- * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for
+ * Allow wakeup of tasks with a shorter slice to cancel RUN_TO_PARITY for
* current.
*/
SCHED_FEAT(PREEMPT_SHORT, true)
@@ -56,7 +56,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(HRTICK_DL, false)
-SCHED_FEAT(DOUBLE_TICK, false)
/*
* Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index d2f096bb274c..ab911d1335ba 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -271,7 +271,6 @@ static void do_idle(void)
tick_nohz_idle_enter();
while (!need_resched()) {
- rmb();
/*
* Interrupts shouldn't be re-enabled from that point on until
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index a9c65d97b3ca..fc07382361a8 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -476,7 +476,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
bool update_other_load_avgs(struct rq *rq)
{
u64 now = rq_clock_pelt(rq);
- const struct sched_class *curr_class = rq->curr->sched_class;
+ const struct sched_class *curr_class = rq->donor->sched_class;
unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq));
lockdep_assert_rq_held(rq);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 172c588de542..bd66a46b06ac 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -528,7 +528,7 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+ struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor;
struct rq *rq = rq_of_rt_rq(rt_rq);
struct sched_rt_entity *rt_se;
@@ -542,7 +542,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, 0);
- if (rt_rq->highest_prio.curr < curr->prio)
+ if (rt_rq->highest_prio.curr < donor->prio)
resched_curr(rq);
}
}
@@ -988,10 +988,10 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
*/
static void update_curr_rt(struct rq *rq)
{
- struct task_struct *curr = rq->curr;
+ struct task_struct *donor = rq->donor;
s64 delta_exec;
- if (curr->sched_class != &rt_sched_class)
+ if (donor->sched_class != &rt_sched_class)
return;
delta_exec = update_curr_common(rq);
@@ -999,7 +999,7 @@ static void update_curr_rt(struct rq *rq)
return;
#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity *rt_se = &curr->rt;
+ struct sched_rt_entity *rt_se = &donor->rt;
if (!rt_bandwidth_enabled())
return;
@@ -1535,7 +1535,7 @@ static int find_lowest_rq(struct task_struct *task);
static int
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
- struct task_struct *curr;
+ struct task_struct *curr, *donor;
struct rq *rq;
bool test;
@@ -1547,6 +1547,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
+ donor = READ_ONCE(rq->donor);
/*
* If the current task on @p's runqueue is an RT task, then
@@ -1575,8 +1576,8 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
* systems like big.LITTLE.
*/
test = curr &&
- unlikely(rt_task(curr)) &&
- (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio);
+ unlikely(rt_task(donor)) &&
+ (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio);
if (test || !rt_task_fits_capacity(p, cpu)) {
int target = find_lowest_rq(p);
@@ -1606,12 +1607,8 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- /*
- * Current can't be migrated, useless to reschedule,
- * let's hope p can move out.
- */
if (rq->curr->nr_cpus_allowed == 1 ||
- !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+ !cpupri_find(&rq->rd->cpupri, rq->donor, NULL))
return;
/*
@@ -1654,7 +1651,9 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
*/
static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
- if (p->prio < rq->curr->prio) {
+ struct task_struct *donor = rq->donor;
+
+ if (p->prio < donor->prio) {
resched_curr(rq);
return;
}
@@ -1672,7 +1671,7 @@ static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
* to move current somewhere else, making room for our non-migratable
* task.
*/
- if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
+ if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
check_preempt_equal_prio(rq, p);
#endif
}
@@ -1697,7 +1696,7 @@ static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool f
* utilization. We only care of the case where we start to schedule a
* rt task
*/
- if (rq->curr->sched_class != &rt_sched_class)
+ if (rq->donor->sched_class != &rt_sched_class)
update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
rt_queue_push_tasks(rq);
@@ -1773,15 +1772,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_s
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
-static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
-{
- if (!task_on_cpu(rq, p) &&
- cpumask_test_cpu(cpu, &p->cpus_mask))
- return 1;
-
- return 0;
-}
-
/*
* Return the highest pushable rq's task, which is suitable to be executed
* on the CPU, NULL otherwise
@@ -1795,7 +1785,7 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
return NULL;
plist_for_each_entry(p, head, pushable_tasks) {
- if (pick_rt_task(rq, p, cpu))
+ if (task_is_pushable(rq, p, cpu))
return p;
}
@@ -1968,6 +1958,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(rq->cpu != task_cpu(p));
BUG_ON(task_current(rq, p));
+ BUG_ON(task_current_donor(rq, p));
BUG_ON(p->nr_cpus_allowed <= 1);
BUG_ON(!task_on_rq_queued(p));
@@ -2000,7 +1991,7 @@ retry:
* higher priority than current. If that's the case
* just reschedule current.
*/
- if (unlikely(next_task->prio < rq->curr->prio)) {
+ if (unlikely(next_task->prio < rq->donor->prio)) {
resched_curr(rq);
return 0;
}
@@ -2021,7 +2012,7 @@ retry:
* Note that the stoppers are masqueraded as SCHED_FIFO
* (cf. sched_set_stop_task()), so we can't rely on rt_task().
*/
- if (rq->curr->sched_class != &rt_sched_class)
+ if (rq->donor->sched_class != &rt_sched_class)
return 0;
cpu = find_lowest_rq(rq->curr);
@@ -2088,9 +2079,7 @@ retry:
goto retry;
}
- deactivate_task(rq, next_task, 0);
- set_task_cpu(next_task, lowest_rq->cpu);
- activate_task(lowest_rq, next_task, 0);
+ move_queued_task_locked(rq, lowest_rq, next_task);
resched_curr(lowest_rq);
ret = 1;
@@ -2355,15 +2344,13 @@ static void pull_rt_task(struct rq *this_rq)
* p if it is lower in priority than the
* current task on the run queue
*/
- if (p->prio < src_rq->curr->prio)
+ if (p->prio < src_rq->donor->prio)
goto skip;
if (is_migration_disabled(p)) {
push_task = get_push_task(src_rq);
} else {
- deactivate_task(src_rq, p, 0);
- set_task_cpu(p, this_cpu);
- activate_task(this_rq, p, 0);
+ move_queued_task_locked(src_rq, this_rq, p);
resched = true;
}
/*
@@ -2399,9 +2386,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
bool need_to_push = !task_on_cpu(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
- (dl_task(rq->curr) || rt_task(rq->curr)) &&
+ (dl_task(rq->donor) || rt_task(rq->donor)) &&
(rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio);
+ rq->donor->prio <= p->prio);
if (need_to_push)
push_rt_tasks(rq);
@@ -2485,7 +2472,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
rt_queue_push_tasks(rq);
#endif /* CONFIG_SMP */
- if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
+ if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
}
}
@@ -2500,7 +2487,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
if (!task_on_rq_queued(p))
return;
- if (task_current(rq, p)) {
+ if (task_current_donor(rq, p)) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
@@ -2526,7 +2513,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* greater than the current running task
* then reschedule.
*/
- if (p->prio < rq->curr->prio)
+ if (p->prio < rq->donor->prio)
resched_curr(rq);
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c03b3d7b320e..76f5f53a645f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1148,7 +1148,10 @@ struct rq {
*/
unsigned int nr_uninterruptible;
- struct task_struct __rcu *curr;
+ union {
+ struct task_struct __rcu *donor; /* Scheduler context */
+ struct task_struct __rcu *curr; /* Execution context */
+ };
struct sched_dl_entity *dl_server;
struct task_struct *idle;
struct task_struct *stop;
@@ -1345,6 +1348,11 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#define raw_rq() raw_cpu_ptr(&runqueues)
+static inline void rq_set_donor(struct rq *rq, struct task_struct *t)
+{
+ /* Do nothing */
+}
+
#ifdef CONFIG_SCHED_CORE
static inline struct cpumask *sched_group_span(struct sched_group *sg);
@@ -2086,34 +2094,6 @@ static inline const struct cpumask *task_user_cpus(struct task_struct *p)
#endif /* CONFIG_SMP */
-#include "stats.h"
-
-#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
-
-extern void __sched_core_account_forceidle(struct rq *rq);
-
-static inline void sched_core_account_forceidle(struct rq *rq)
-{
- if (schedstat_enabled())
- __sched_core_account_forceidle(rq);
-}
-
-extern void __sched_core_tick(struct rq *rq);
-
-static inline void sched_core_tick(struct rq *rq)
-{
- if (sched_core_enabled(rq) && schedstat_enabled())
- __sched_core_tick(rq);
-}
-
-#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
-
-static inline void sched_core_account_forceidle(struct rq *rq) { }
-
-static inline void sched_core_tick(struct rq *rq) { }
-
-#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
-
#ifdef CONFIG_CGROUP_SCHED
/*
@@ -2261,11 +2241,25 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
+/*
+ * Is p the current execution context?
+ */
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
}
+/*
+ * Is p the current scheduling context?
+ *
+ * Note that it might be the current execution context at the same time if
+ * rq->curr == rq->donor == p.
+ */
+static inline int task_current_donor(struct rq *rq, struct task_struct *p)
+{
+ return rq->donor == p;
+}
+
static inline int task_on_cpu(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
@@ -2452,7 +2446,7 @@ struct sched_class {
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- WARN_ON_ONCE(rq->curr != prev);
+ WARN_ON_ONCE(rq->donor != prev);
prev->sched_class->put_prev_task(rq, prev, NULL);
}
@@ -2616,7 +2610,7 @@ static inline cpumask_t *alloc_user_cpus_ptr(int node)
static inline struct task_struct *get_push_task(struct rq *rq)
{
- struct task_struct *p = rq->curr;
+ struct task_struct *p = rq->donor;
lockdep_assert_rq_held(rq);
@@ -2696,6 +2690,7 @@ extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
extern void resched_cpu(int cpu);
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
@@ -3200,6 +3195,34 @@ extern void nohz_run_idle_balance(int cpu);
static inline void nohz_run_idle_balance(int cpu) { }
#endif
+#include "stats.h"
+
+#if defined(CONFIG_SCHED_CORE) && defined(CONFIG_SCHEDSTATS)
+
+extern void __sched_core_account_forceidle(struct rq *rq);
+
+static inline void sched_core_account_forceidle(struct rq *rq)
+{
+ if (schedstat_enabled())
+ __sched_core_account_forceidle(rq);
+}
+
+extern void __sched_core_tick(struct rq *rq);
+
+static inline void sched_core_tick(struct rq *rq)
+{
+ if (sched_core_enabled(rq) && schedstat_enabled())
+ __sched_core_tick(rq);
+}
+
+#else /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS): */
+
+static inline void sched_core_account_forceidle(struct rq *rq) { }
+
+static inline void sched_core_tick(struct rq *rq) { }
+
+#endif /* !(CONFIG_SCHED_CORE && CONFIG_SCHEDSTATS) */
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
struct irqtime {
@@ -3630,24 +3653,41 @@ static inline void mm_cid_put(struct mm_struct *mm)
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
-static inline int __mm_cid_try_get(struct mm_struct *mm)
+static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
{
- struct cpumask *cpumask;
- int cid;
+ struct cpumask *cidmask = mm_cidmask(mm);
+ struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
+ int cid = __this_cpu_read(pcpu_cid->recent_cid);
- cpumask = mm_cidmask(mm);
+ /* Try to re-use recent cid. This improves cache locality. */
+ if (!mm_cid_is_unset(cid) && !cpumask_test_and_set_cpu(cid, cidmask))
+ return cid;
/*
+ * Expand cid allocation if the maximum number of concurrency
+ * IDs allocated (max_nr_cid) is below the number cpus allowed
+ * and number of threads. Expanding cid allocation as much as
+ * possible improves cache locality.
+ */
+ cid = atomic_read(&mm->max_nr_cid);
+ while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
+ if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
+ continue;
+ if (!cpumask_test_and_set_cpu(cid, cidmask))
+ return cid;
+ }
+ /*
+ * Find the first available concurrency id.
* Retry finding first zero bit if the mask is temporarily
* filled. This only happens during concurrent remote-clear
* which owns a cid without holding a rq lock.
*/
for (;;) {
- cid = cpumask_first_zero(cpumask);
- if (cid < nr_cpu_ids)
+ cid = cpumask_first_zero(cidmask);
+ if (cid < READ_ONCE(mm->nr_cpus_allowed))
break;
cpu_relax();
}
- if (cpumask_test_and_set_cpu(cid, cpumask))
+ if (cpumask_test_and_set_cpu(cid, cidmask))
return -1;
return cid;
@@ -3665,7 +3705,8 @@ static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
WRITE_ONCE(pcpu_cid->time, rq->clock);
}
-static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
+static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
+ struct mm_struct *mm)
{
int cid;
@@ -3675,13 +3716,13 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
* guarantee forward progress.
*/
if (!READ_ONCE(use_cid_lock)) {
- cid = __mm_cid_try_get(mm);
+ cid = __mm_cid_try_get(t, mm);
if (cid >= 0)
goto end;
raw_spin_lock(&cid_lock);
} else {
raw_spin_lock(&cid_lock);
- cid = __mm_cid_try_get(mm);
+ cid = __mm_cid_try_get(t, mm);
if (cid >= 0)
goto unlock;
}
@@ -3701,7 +3742,7 @@ static inline int __mm_cid_get(struct rq *rq, struct mm_struct *mm)
* all newcoming allocations observe the use_cid_lock flag set.
*/
do {
- cid = __mm_cid_try_get(mm);
+ cid = __mm_cid_try_get(t, mm);
cpu_relax();
} while (cid < 0);
/*
@@ -3718,7 +3759,8 @@ end:
return cid;
}
-static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
+static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
+ struct mm_struct *mm)
{
struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
struct cpumask *cpumask;
@@ -3735,8 +3777,9 @@ static inline int mm_cid_get(struct rq *rq, struct mm_struct *mm)
if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
__mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
}
- cid = __mm_cid_get(rq, mm);
+ cid = __mm_cid_get(rq, t, mm);
__this_cpu_write(pcpu_cid->cid, cid);
+ __this_cpu_write(pcpu_cid->recent_cid, cid);
return cid;
}
@@ -3789,7 +3832,7 @@ static inline void switch_mm_cid(struct rq *rq,
prev->mm_cid = -1;
}
if (next->mm_cid_active)
- next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next->mm);
+ next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
}
#else /* !CONFIG_SCHED_MM_CID: */
@@ -3802,6 +3845,28 @@ static inline void init_sched_mm_cid(struct task_struct *t) { }
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
+#ifdef CONFIG_SMP
+static inline
+void move_queued_task_locked(struct rq *src_rq, struct rq *dst_rq, struct task_struct *task)
+{
+ lockdep_assert_rq_held(src_rq);
+ lockdep_assert_rq_held(dst_rq);
+
+ deactivate_task(src_rq, task, 0);
+ set_task_cpu(task, dst_rq->cpu);
+ activate_task(dst_rq, task, 0);
+}
+
+static inline
+bool task_is_pushable(struct rq *rq, struct task_struct *p, int cpu)
+{
+ if (!task_on_cpu(rq, p) &&
+ cpumask_test_cpu(cpu, &p->cpus_mask))
+ return true;
+
+ return false;
+}
+#endif
#ifdef CONFIG_RT_MUTEXES
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 767e098a3bd1..8ee0add5a48a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -127,21 +127,25 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
* go through migration requeues. In this case, *sleeping* states need
* to be transferred.
*/
-static inline void psi_enqueue(struct task_struct *p, bool migrate)
+static inline void psi_enqueue(struct task_struct *p, int flags)
{
int clear = 0, set = 0;
if (static_branch_likely(&psi_disabled))
return;
+ /* Same runqueue, nothing changed for psi */
+ if (flags & ENQUEUE_RESTORE)
+ return;
+
if (p->se.sched_delayed) {
/* CPU migration of "sleeping" task */
- SCHED_WARN_ON(!migrate);
+ SCHED_WARN_ON(!(flags & ENQUEUE_MIGRATED));
if (p->in_memstall)
set |= TSK_MEMSTALL;
if (p->in_iowait)
set |= TSK_IOWAIT;
- } else if (migrate) {
+ } else if (flags & ENQUEUE_MIGRATED) {
/* CPU migration of runnable task */
set = TSK_RUNNING;
if (p->in_memstall)
@@ -158,17 +162,14 @@ static inline void psi_enqueue(struct task_struct *p, bool migrate)
psi_task_change(p, clear, set);
}
-static inline void psi_dequeue(struct task_struct *p, bool migrate)
+static inline void psi_dequeue(struct task_struct *p, int flags)
{
if (static_branch_likely(&psi_disabled))
return;
- /*
- * When migrating a task to another CPU, clear all psi
- * state. The enqueue callback above will work it out.
- */
- if (migrate)
- psi_task_change(p, p->psi_flags, 0);
+ /* Same runqueue, nothing changed for psi */
+ if (flags & DEQUEUE_SAVE)
+ return;
/*
* A voluntary sleep is a dequeue followed by a task switch. To
@@ -176,6 +177,14 @@ static inline void psi_dequeue(struct task_struct *p, bool migrate)
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
* Do nothing here.
*/
+ if (flags & DEQUEUE_SLEEP)
+ return;
+
+ /*
+ * When migrating a task to another CPU, clear all psi
+ * state. The enqueue callback above will work it out.
+ */
+ psi_task_change(p, p->psi_flags, 0);
}
static inline void psi_ttwu_dequeue(struct task_struct *p)
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index f9bed5ec719e..0d71fcbaf1e3 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -91,7 +91,7 @@ void set_user_nice(struct task_struct *p, long nice)
}
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
if (running)
@@ -713,7 +713,7 @@ change:
dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
queued = task_on_rq_queued(p);
- running = task_current(rq, p);
+ running = task_current_donor(rq, p);
if (queued)
dequeue_task(rq, p, queue_flags);
if (running)
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 134d7112ef71..b410b61cec95 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -9,7 +9,7 @@
static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
-wait_queue_head_t *bit_waitqueue(void *word, int bit)
+wait_queue_head_t *bit_waitqueue(unsigned long *word, int bit)
{
const int shift = BITS_PER_LONG == 32 ? 5 : 6;
unsigned long val = (unsigned long)word << shift | bit;
@@ -55,7 +55,7 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
}
EXPORT_SYMBOL(__wait_on_bit);
-int __sched out_of_line_wait_on_bit(void *word, int bit,
+int __sched out_of_line_wait_on_bit(unsigned long *word, int bit,
wait_bit_action_f *action, unsigned mode)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -66,7 +66,7 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
EXPORT_SYMBOL(out_of_line_wait_on_bit);
int __sched out_of_line_wait_on_bit_timeout(
- void *word, int bit, wait_bit_action_f *action,
+ unsigned long *word, int bit, wait_bit_action_f *action,
unsigned mode, unsigned long timeout)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -108,7 +108,7 @@ __wait_on_bit_lock(struct wait_queue_head *wq_head, struct wait_bit_queue_entry
}
EXPORT_SYMBOL(__wait_on_bit_lock);
-int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
+int __sched out_of_line_wait_on_bit_lock(unsigned long *word, int bit,
wait_bit_action_f *action, unsigned mode)
{
struct wait_queue_head *wq_head = bit_waitqueue(word, bit);
@@ -118,7 +118,7 @@ int __sched out_of_line_wait_on_bit_lock(void *word, int bit,
}
EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
-void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
+void __wake_up_bit(struct wait_queue_head *wq_head, unsigned long *word, int bit)
{
struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
@@ -128,23 +128,31 @@ void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
EXPORT_SYMBOL(__wake_up_bit);
/**
- * wake_up_bit - wake up a waiter on a bit
- * @word: the word being waited on, a kernel virtual address
- * @bit: the bit of the word being waited on
+ * wake_up_bit - wake up waiters on a bit
+ * @word: the address containing the bit being waited on
+ * @bit: the bit at that address being waited on
*
- * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hash-table's accessor API that wakes up waiters
- * on a bit. For instance, if one were to have waiters on a bitflag,
- * one would call wake_up_bit() after clearing the bit.
+ * Wake up any process waiting in wait_on_bit() or similar for the
+ * given bit to be cleared.
*
- * In order for this to function properly, as it uses waitqueue_active()
- * internally, some kind of memory barrier must be done prior to calling
- * this. Typically, this will be smp_mb__after_atomic(), but in some
- * cases where bitflags are manipulated non-atomically under a lock, one
- * may need to use a less regular barrier, such fs/inode.c's smp_mb(),
- * because spin_unlock() does not guarantee a memory barrier.
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address and bit will be woken, and only if the
+ * bit is clear.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the bit is cleared and before this function is called.
+ * If the bit was cleared atomically, such as by clear_bit(), then
+ * smp_mb__after_atomic() can be used, otherwise smp_mb() is needed.
+ * If the bit was cleared with a fully-ordered operation, no further
+ * barrier is required.
+ *
+ * Normally the bit should be cleared by an operation with RELEASE
+ * semantics so that any changes to memory made before the bit is
+ * cleared are guaranteed to be visible after the matching wait_on_bit()
+ * completes.
*/
-void wake_up_bit(void *word, int bit)
+void wake_up_bit(unsigned long *word, int bit)
{
__wake_up_bit(bit_waitqueue(word, bit), word, bit);
}
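
Editor's illustration: a minimal usage sketch of the pattern the rewritten kernel-doc above describes, assuming a hypothetical my_obj structure with a flags word and a MY_FLAG_BUSY bit; only wait_on_bit(), clear_bit(), smp_mb__after_atomic() and wake_up_bit() are real kernel APIs.

/* Hypothetical example; my_obj and MY_FLAG_BUSY are invented for illustration. */
#include <linux/wait_bit.h>
#include <linux/bitops.h>
#include <linux/sched.h>

#define MY_FLAG_BUSY	0

struct my_obj {
	unsigned long flags;
};

/* Waiter: sleep until MY_FLAG_BUSY is cleared. */
static int my_obj_wait_idle(struct my_obj *obj)
{
	return wait_on_bit(&obj->flags, MY_FLAG_BUSY, TASK_UNINTERRUPTIBLE);
}

/* Owner: drop the busy bit, then wake any waiters. */
static void my_obj_put_busy(struct my_obj *obj)
{
	clear_bit(MY_FLAG_BUSY, &obj->flags);
	/* Full barrier between clearing the bit and the wakeup, as documented. */
	smp_mb__after_atomic();
	wake_up_bit(&obj->flags, MY_FLAG_BUSY);
}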
@@ -188,6 +196,36 @@ void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int
}
EXPORT_SYMBOL(init_wait_var_entry);
+/**
+ * wake_up_var - wake up waiters on a variable (kernel address)
+ * @var: the address of the variable being waited on
+ *
+ * Wake up any process waiting in wait_var_event() or similar for the
+ * given variable to change. wait_var_event() can be waiting for an
+ * arbitrary condition to be true and associates that condition with an
+ * address. Calling wake_up_var() suggests that the condition has been
+ * made true, but does not strictly require the condition to use the
+ * address given.
+ *
+ * The wake-up is sent to tasks in a waitqueue selected by hash from a
+ * shared pool. Only those tasks on that queue which have requested
+ * wake_up on this specific address will be woken.
+ *
+ * In order for this to function properly there must be a full memory
+ * barrier after the variable is updated (or more accurately, after the
+ * condition waited on has been made to be true) and before this function
+ * is called. If the variable was updated atomically, such as by
+ * atomic_dec(), then smp_mb__after_atomic() can be used. If the
+ * variable was updated by a fully ordered operation such as
+ * atomic_dec_and_test(), then no extra barrier is required. Otherwise
+ * smp_mb() is needed.
+ *
+ * Normally the variable should be updated (the condition should be made
+ * to be true) by an operation with RELEASE semantics such as
+ * smp_store_release() so that any changes to memory made before the
+ * variable was updated are guaranteed to be visible after the matching
+ * wait_var_event() completes.
+ */
void wake_up_var(void *var)
{
__wake_up_bit(__var_waitqueue(var), var, -1);
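
Editor's illustration: likewise for wake_up_var()/wait_var_event() as documented above, assuming a hypothetical my_ctx structure with an atomic pending counter; atomic_dec_and_test() is fully ordered, so no extra barrier is needed before the wakeup.

/* Hypothetical example; my_ctx and its pending counter are invented for illustration. */
#include <linux/atomic.h>
#include <linux/wait_bit.h>

struct my_ctx {
	atomic_t pending;
};

/* Waiter: block until all pending work has been retired. */
static void my_ctx_wait_done(struct my_ctx *ctx)
{
	wait_var_event(&ctx->pending, atomic_read(&ctx->pending) == 0);
}

/* Completion side: fully ordered decrement, then wake any waiters. */
static void my_ctx_complete_one(struct my_ctx *ctx)
{
	if (atomic_dec_and_test(&ctx->pending))
		wake_up_var(&ctx->pending);
}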
@@ -228,20 +266,6 @@ __sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
}
EXPORT_SYMBOL_GPL(bit_wait_timeout);
-__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
-{
- unsigned long now = READ_ONCE(jiffies);
-
- if (time_after_eq(now, word->timeout))
- return -EAGAIN;
- io_schedule_timeout(word->timeout - now);
- if (signal_pending_state(mode, current))
- return -EINTR;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
-
void __init wait_bit_init(void)
{
int i;