diff options
Diffstat (limited to 'kernel/sched/ext.c')
-rw-r--r-- | kernel/sched/ext.c | 424 |
1 files changed, 301 insertions, 123 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 19d2699cf638..8857c0709bdd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -206,7 +206,7 @@ struct scx_dump_ctx { */ struct sched_ext_ops { /** - * select_cpu - Pick the target CPU for a task which is being woken up + * @select_cpu: Pick the target CPU for a task which is being woken up * @p: task being woken up * @prev_cpu: the cpu @p was on before sleeping * @wake_flags: SCX_WAKE_* @@ -233,7 +233,7 @@ struct sched_ext_ops { s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); /** - * enqueue - Enqueue a task on the BPF scheduler + * @enqueue: Enqueue a task on the BPF scheduler * @p: task being enqueued * @enq_flags: %SCX_ENQ_* * @@ -248,7 +248,7 @@ struct sched_ext_ops { void (*enqueue)(struct task_struct *p, u64 enq_flags); /** - * dequeue - Remove a task from the BPF scheduler + * @dequeue: Remove a task from the BPF scheduler * @p: task being dequeued * @deq_flags: %SCX_DEQ_* * @@ -264,7 +264,7 @@ struct sched_ext_ops { void (*dequeue)(struct task_struct *p, u64 deq_flags); /** - * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs + * @dispatch: Dispatch tasks from the BPF scheduler and/or user DSQs * @cpu: CPU to dispatch tasks for * @prev: previous task being switched out * @@ -287,7 +287,7 @@ struct sched_ext_ops { void (*dispatch)(s32 cpu, struct task_struct *prev); /** - * tick - Periodic tick + * @tick: Periodic tick * @p: task running currently * * This operation is called every 1/HZ seconds on CPUs which are @@ -297,7 +297,7 @@ struct sched_ext_ops { void (*tick)(struct task_struct *p); /** - * runnable - A task is becoming runnable on its associated CPU + * @runnable: A task is becoming runnable on its associated CPU * @p: task becoming runnable * @enq_flags: %SCX_ENQ_* * @@ -324,7 +324,7 @@ struct sched_ext_ops { void (*runnable)(struct task_struct *p, u64 enq_flags); /** - * running - A task is starting to run on its associated CPU + * @running: A task is starting to run on its associated CPU * @p: task starting to run * * See ->runnable() for explanation on the task state notifiers. @@ -332,7 +332,7 @@ struct sched_ext_ops { void (*running)(struct task_struct *p); /** - * stopping - A task is stopping execution + * @stopping: A task is stopping execution * @p: task stopping to run * @runnable: is task @p still runnable? * @@ -343,7 +343,7 @@ struct sched_ext_ops { void (*stopping)(struct task_struct *p, bool runnable); /** - * quiescent - A task is becoming not runnable on its associated CPU + * @quiescent: A task is becoming not runnable on its associated CPU * @p: task becoming not runnable * @deq_flags: %SCX_DEQ_* * @@ -363,7 +363,7 @@ struct sched_ext_ops { void (*quiescent)(struct task_struct *p, u64 deq_flags); /** - * yield - Yield CPU + * @yield: Yield CPU * @from: yielding task * @to: optional yield target task * @@ -378,7 +378,7 @@ struct sched_ext_ops { bool (*yield)(struct task_struct *from, struct task_struct *to); /** - * core_sched_before - Task ordering for core-sched + * @core_sched_before: Task ordering for core-sched * @a: task A * @b: task B * @@ -396,7 +396,7 @@ struct sched_ext_ops { bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); /** - * set_weight - Set task weight + * @set_weight: Set task weight * @p: task to set weight for * @weight: new weight [1..10000] * @@ -405,7 +405,7 @@ struct sched_ext_ops { void (*set_weight)(struct task_struct *p, u32 weight); /** - * set_cpumask - Set CPU affinity + * @set_cpumask: Set CPU affinity * @p: task to set CPU affinity for * @cpumask: cpumask of cpus that @p can run on * @@ -415,7 +415,7 @@ struct sched_ext_ops { const struct cpumask *cpumask); /** - * update_idle - Update the idle state of a CPU + * @update_idle: Update the idle state of a CPU * @cpu: CPU to udpate the idle state for * @idle: whether entering or exiting the idle state * @@ -436,7 +436,7 @@ struct sched_ext_ops { void (*update_idle)(s32 cpu, bool idle); /** - * cpu_acquire - A CPU is becoming available to the BPF scheduler + * @cpu_acquire: A CPU is becoming available to the BPF scheduler * @cpu: The CPU being acquired by the BPF scheduler. * @args: Acquire arguments, see the struct definition. * @@ -446,7 +446,7 @@ struct sched_ext_ops { void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); /** - * cpu_release - A CPU is taken away from the BPF scheduler + * @cpu_release: A CPU is taken away from the BPF scheduler * @cpu: The CPU being released by the BPF scheduler. * @args: Release arguments, see the struct definition. * @@ -458,7 +458,7 @@ struct sched_ext_ops { void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); /** - * init_task - Initialize a task to run in a BPF scheduler + * @init_task: Initialize a task to run in a BPF scheduler * @p: task to initialize for BPF scheduling * @args: init arguments, see the struct definition * @@ -473,8 +473,9 @@ struct sched_ext_ops { s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); /** - * exit_task - Exit a previously-running task from the system + * @exit_task: Exit a previously-running task from the system * @p: task to exit + * @args: exit arguments, see the struct definition * * @p is exiting or the BPF scheduler is being unloaded. Perform any * necessary cleanup for @p. @@ -482,7 +483,7 @@ struct sched_ext_ops { void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); /** - * enable - Enable BPF scheduling for a task + * @enable: Enable BPF scheduling for a task * @p: task to enable BPF scheduling for * * Enable @p for BPF scheduling. enable() is called on @p any time it @@ -491,7 +492,7 @@ struct sched_ext_ops { void (*enable)(struct task_struct *p); /** - * disable - Disable BPF scheduling for a task + * @disable: Disable BPF scheduling for a task * @p: task to disable BPF scheduling for * * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. @@ -501,7 +502,7 @@ struct sched_ext_ops { void (*disable)(struct task_struct *p); /** - * dump - Dump BPF scheduler state on error + * @dump: Dump BPF scheduler state on error * @ctx: debug dump context * * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. @@ -509,7 +510,7 @@ struct sched_ext_ops { void (*dump)(struct scx_dump_ctx *ctx); /** - * dump_cpu - Dump BPF scheduler state for a CPU on error + * @dump_cpu: Dump BPF scheduler state for a CPU on error * @ctx: debug dump context * @cpu: CPU to generate debug dump for * @idle: @cpu is currently idle without any runnable tasks @@ -521,7 +522,7 @@ struct sched_ext_ops { void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); /** - * dump_task - Dump BPF scheduler state for a runnable task on error + * @dump_task: Dump BPF scheduler state for a runnable task on error * @ctx: debug dump context * @p: runnable task to generate debug dump for * @@ -532,7 +533,7 @@ struct sched_ext_ops { #ifdef CONFIG_EXT_GROUP_SCHED /** - * cgroup_init - Initialize a cgroup + * @cgroup_init: Initialize a cgroup * @cgrp: cgroup being initialized * @args: init arguments, see the struct definition * @@ -547,7 +548,7 @@ struct sched_ext_ops { struct scx_cgroup_init_args *args); /** - * cgroup_exit - Exit a cgroup + * @cgroup_exit: Exit a cgroup * @cgrp: cgroup being exited * * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit @@ -556,7 +557,7 @@ struct sched_ext_ops { void (*cgroup_exit)(struct cgroup *cgrp); /** - * cgroup_prep_move - Prepare a task to be moved to a different cgroup + * @cgroup_prep_move: Prepare a task to be moved to a different cgroup * @p: task being moved * @from: cgroup @p is being moved from * @to: cgroup @p is being moved to @@ -571,7 +572,7 @@ struct sched_ext_ops { struct cgroup *from, struct cgroup *to); /** - * cgroup_move - Commit cgroup move + * @cgroup_move: Commit cgroup move * @p: task being moved * @from: cgroup @p is being moved from * @to: cgroup @p is being moved to @@ -582,7 +583,7 @@ struct sched_ext_ops { struct cgroup *from, struct cgroup *to); /** - * cgroup_cancel_move - Cancel cgroup move + * @cgroup_cancel_move: Cancel cgroup move * @p: task whose cgroup move is being canceled * @from: cgroup @p was being moved from * @to: cgroup @p was being moved to @@ -594,7 +595,7 @@ struct sched_ext_ops { struct cgroup *from, struct cgroup *to); /** - * cgroup_set_weight - A cgroup's weight is being changed + * @cgroup_set_weight: A cgroup's weight is being changed * @cgrp: cgroup whose weight is being updated * @weight: new weight [1..10000] * @@ -608,7 +609,7 @@ struct sched_ext_ops { */ /** - * cpu_online - A CPU became online + * @cpu_online: A CPU became online * @cpu: CPU which just came up * * @cpu just came online. @cpu will not call ops.enqueue() or @@ -617,7 +618,7 @@ struct sched_ext_ops { void (*cpu_online)(s32 cpu); /** - * cpu_offline - A CPU is going offline + * @cpu_offline: A CPU is going offline * @cpu: CPU which is going offline * * @cpu is going offline. @cpu will not call ops.enqueue() or @@ -630,12 +631,12 @@ struct sched_ext_ops { */ /** - * init - Initialize the BPF scheduler + * @init: Initialize the BPF scheduler */ s32 (*init)(void); /** - * exit - Clean up after the BPF scheduler + * @exit: Clean up after the BPF scheduler * @info: Exit info * * ops.exit() is also called on ops.init() failure, which is a bit @@ -645,17 +646,17 @@ struct sched_ext_ops { void (*exit)(struct scx_exit_info *info); /** - * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch + * @dispatch_max_batch: Max nr of tasks that dispatch() can dispatch */ u32 dispatch_max_batch; /** - * flags - %SCX_OPS_* flags + * @flags: %SCX_OPS_* flags */ u64 flags; /** - * timeout_ms - The maximum amount of time, in milliseconds, that a + * @timeout_ms: The maximum amount of time, in milliseconds, that a * runnable task should be able to wait before being scheduled. The * maximum timeout may not exceed the default timeout of 30 seconds. * @@ -664,13 +665,13 @@ struct sched_ext_ops { u32 timeout_ms; /** - * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default + * @exit_dump_len: scx_exit_info.dump buffer length. If 0, the default * value of 32768 is used. */ u32 exit_dump_len; /** - * hotplug_seq - A sequence number that may be set by the scheduler to + * @hotplug_seq: A sequence number that may be set by the scheduler to * detect when a hotplug event has occurred during the loading process. * If 0, no detection occurs. Otherwise, the scheduler will fail to * load if the sequence number does not match @scx_hotplug_seq on the @@ -679,7 +680,7 @@ struct sched_ext_ops { u64 hotplug_seq; /** - * name - BPF scheduler's name + * @name: BPF scheduler's name * * Must be a non-zero valid BPF object name including only isalnum(), * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the @@ -960,7 +961,7 @@ static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); static struct scx_dispatch_q **global_dsqs; static const struct rhashtable_params dsq_hash_params = { - .key_len = 8, + .key_len = sizeof_field(struct scx_dispatch_q, id), .key_offset = offsetof(struct scx_dispatch_q, id), .head_offset = offsetof(struct scx_dispatch_q, hash_node), }; @@ -1408,7 +1409,6 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) /** * scx_task_iter_next_locked - Next non-idle task with its rq locked * @iter: iterator to walk - * @include_dead: Whether we should include dead tasks in the iteration * * Visit the non-idle task with its rq lock held. Allows callers to specify * whether they would like to filter out dead tasks. See scx_task_iter_start() @@ -2747,6 +2747,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); bool prev_on_scx = prev->sched_class == &ext_sched_class; + bool prev_on_rq = prev->scx.flags & SCX_TASK_QUEUED; int nr_loops = SCX_DSP_MAX_LOOPS; lockdep_assert_rq_held(rq); @@ -2779,8 +2780,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) * See scx_ops_disable_workfn() for the explanation on the * bypassing test. */ - if ((prev->scx.flags & SCX_TASK_QUEUED) && - prev->scx.slice && !scx_rq_bypassing(rq)) { + if (prev_on_rq && prev->scx.slice && !scx_rq_bypassing(rq)) { rq->scx.flags |= SCX_RQ_BAL_KEEP; goto has_tasks; } @@ -2813,6 +2813,10 @@ static int balance_one(struct rq *rq, struct task_struct *prev) flush_dispatch_buf(rq); + if (prev_on_rq && prev->scx.slice) { + rq->scx.flags |= SCX_RQ_BAL_KEEP; + goto has_tasks; + } if (rq->scx.local_dsq.nr) goto has_tasks; if (consume_global_dsq(rq)) @@ -2838,8 +2842,7 @@ no_tasks: * Didn't find another task to run. Keep running @prev unless * %SCX_OPS_ENQ_LAST is in effect. */ - if ((prev->scx.flags & SCX_TASK_QUEUED) && - (!static_branch_unlikely(&scx_ops_enq_last) || + if (prev_on_rq && (!static_branch_unlikely(&scx_ops_enq_last) || scx_rq_bypassing(rq))) { rq->scx.flags |= SCX_RQ_BAL_KEEP; goto has_tasks; @@ -3034,7 +3037,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, */ if (p->scx.slice && !scx_rq_bypassing(rq)) { dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); - return; + goto switch_class; } /* @@ -3051,6 +3054,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, } } +switch_class: if (next && next->sched_class != &ext_sched_class) switch_class(rq, next); } @@ -3132,6 +3136,7 @@ static struct task_struct *pick_task_scx(struct rq *rq) * scx_prio_less - Task ordering for core-sched * @a: task A * @b: task B + * @in_fi: in forced idle state * * Core-sched is implemented as an additional scheduling layer on top of the * usual sched_class'es and needs to find out the expected task ordering. For @@ -3180,6 +3185,10 @@ static bool test_and_clear_cpu_idle(int cpu) * scx_pick_idle_cpu() can get caught in an infinite loop as * @cpu is never cleared from idle_masks.smt. Ensure that @cpu * is eventually cleared. + * + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to + * reduce memory writes, which may help alleviate cache + * coherence pressure. */ if (cpumask_intersects(smt, idle_masks.smt)) cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); @@ -3216,6 +3225,74 @@ found: } /* + * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC + * domain is not defined). + */ +static unsigned int llc_weight(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sd->span_weight; +} + +/* + * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC + * domain is not defined). + */ +static struct cpumask *llc_span(s32 cpu) +{ + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return 0; + + return sched_domain_span(sd); +} + +/* + * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the + * NUMA domain is not defined). + */ +static unsigned int numa_weight(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return 0; + sg = sd->groups; + if (!sg) + return 0; + + return sg->group_weight; +} + +/* + * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA + * domain is not defined). + */ +static struct cpumask *numa_span(s32 cpu) +{ + struct sched_domain *sd; + struct sched_group *sg; + + sd = rcu_dereference(per_cpu(sd_numa, cpu)); + if (!sd) + return NULL; + sg = sd->groups; + if (!sg) + return NULL; + + return sched_group_span(sg); +} + +/* * Return true if the LLC domains do not perfectly overlap with the NUMA * domains, false otherwise. */ @@ -3246,19 +3323,10 @@ static bool llc_numa_mismatch(void) * overlapping, which is incorrect (as NUMA 1 has two distinct LLC * domains). */ - for_each_online_cpu(cpu) { - const struct cpumask *numa_cpus; - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (!sd) + for_each_online_cpu(cpu) + if (llc_weight(cpu) != numa_weight(cpu)) return true; - numa_cpus = cpumask_of_node(cpu_to_node(cpu)); - if (sd->span_weight != cpumask_weight(numa_cpus)) - return true; - } - return false; } @@ -3276,8 +3344,7 @@ static bool llc_numa_mismatch(void) static void update_selcpu_topology(void) { bool enable_llc = false, enable_numa = false; - struct sched_domain *sd; - const struct cpumask *cpus; + unsigned int nr_cpus; s32 cpu = cpumask_first(cpu_online_mask); /* @@ -3291,10 +3358,12 @@ static void update_selcpu_topology(void) * CPUs. */ rcu_read_lock(); - sd = rcu_dereference(per_cpu(sd_llc, cpu)); - if (sd) { - if (sd->span_weight < num_online_cpus()) + nr_cpus = llc_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus()) enable_llc = true; + pr_debug("sched_ext: LLC=%*pb weight=%u\n", + cpumask_pr_args(llc_span(cpu)), llc_weight(cpu)); } /* @@ -3306,15 +3375,19 @@ static void update_selcpu_topology(void) * enabling both NUMA and LLC optimizations is unnecessary, as checking * for an idle CPU in the same domain twice is redundant. */ - cpus = cpumask_of_node(cpu_to_node(cpu)); - if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch()) - enable_numa = true; + nr_cpus = numa_weight(cpu); + if (nr_cpus > 0) { + if (nr_cpus < num_online_cpus() && llc_numa_mismatch()) + enable_numa = true; + pr_debug("sched_ext: NUMA=%*pb weight=%u\n", + cpumask_pr_args(numa_span(cpu)), numa_weight(cpu)); + } rcu_read_unlock(); pr_debug("sched_ext: LLC idle selection %s\n", - enable_llc ? "enabled" : "disabled"); + str_enabled_disabled(enable_llc)); pr_debug("sched_ext: NUMA idle selection %s\n", - enable_numa ? "enabled" : "disabled"); + str_enabled_disabled(enable_numa)); if (enable_llc) static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); @@ -3344,6 +3417,8 @@ static void update_selcpu_topology(void) * 4. Pick a CPU within the same NUMA node, if enabled: * - choose a CPU from the same NUMA node to reduce memory access latency. * + * 5. Pick any idle CPU usable by the task. + * * Step 3 and 4 are performed only if the system has, respectively, multiple * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and * scx_selcpu_topo_numa). @@ -3360,7 +3435,6 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; - /* * This is necessary to protect llc_cpus. */ @@ -3379,15 +3453,10 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, */ if (p->nr_cpus_allowed >= num_possible_cpus()) { if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) - numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu)); + numa_cpus = numa_span(prev_cpu); - if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { - struct sched_domain *sd; - - sd = rcu_dereference(per_cpu(sd_llc, prev_cpu)); - if (sd) - llc_cpus = sched_domain_span(sd); - } + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) + llc_cpus = llc_span(prev_cpu); } /* @@ -3586,20 +3655,9 @@ static void reset_idle_masks(void) cpumask_copy(idle_masks.smt, cpu_online_mask); } -void __scx_update_idle(struct rq *rq, bool idle) +static void update_builtin_idle(int cpu, bool idle) { - int cpu = cpu_of(rq); - - if (SCX_HAS_OP(update_idle) && !scx_rq_bypassing(rq)) { - SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); - if (!static_branch_unlikely(&scx_builtin_idle_enabled)) - return; - } - - if (idle) - cpumask_set_cpu(cpu, idle_masks.cpu); - else - cpumask_clear_cpu(cpu, idle_masks.cpu); + assign_cpu(cpu, idle_masks.cpu, idle); #ifdef CONFIG_SCHED_SMT if (sched_smt_active()) { @@ -3610,10 +3668,8 @@ void __scx_update_idle(struct rq *rq, bool idle) * idle_masks.smt handling is racy but that's fine as * it's only for optimization and self-correcting. */ - for_each_cpu(cpu, smt) { - if (!cpumask_test_cpu(cpu, idle_masks.cpu)) - return; - } + if (!cpumask_subset(smt, idle_masks.cpu)) + return; cpumask_or(idle_masks.smt, idle_masks.smt, smt); } else { cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); @@ -3622,6 +3678,57 @@ void __scx_update_idle(struct rq *rq, bool idle) #endif } +/* + * Update the idle state of a CPU to @idle. + * + * If @do_notify is true, ops.update_idle() is invoked to notify the scx + * scheduler of an actual idle state transition (idle to busy or vice + * versa). If @do_notify is false, only the idle state in the idle masks is + * refreshed without invoking ops.update_idle(). + * + * This distinction is necessary, because an idle CPU can be "reserved" and + * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as + * busy even if no tasks are dispatched. In this case, the CPU may return + * to idle without a true state transition. Refreshing the idle masks + * without invoking ops.update_idle() ensures accurate idle state tracking + * while avoiding unnecessary updates and maintaining balanced state + * transitions. + */ +void __scx_update_idle(struct rq *rq, bool idle, bool do_notify) +{ + int cpu = cpu_of(rq); + + lockdep_assert_rq_held(rq); + + /* + * Trigger ops.update_idle() only when transitioning from a task to + * the idle thread and vice versa. + * + * Idle transitions are indicated by do_notify being set to true, + * managed by put_prev_task_idle()/set_next_task_idle(). + */ + if (SCX_HAS_OP(update_idle) && do_notify && !scx_rq_bypassing(rq)) + SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); + + /* + * Update the idle masks: + * - for real idle transitions (do_notify == true) + * - for idle-to-idle transitions (indicated by the previous task + * being the idle thread, managed by pick_task_idle()) + * + * Skip updating idle masks if the previous task is not the idle + * thread, since set_next_task_idle() has already handled it when + * transitioning from a task to the idle thread (calling this + * function with do_notify == true). + * + * In this way we can avoid updating the idle masks twice, + * unnecessarily. + */ + if (static_branch_likely(&scx_builtin_idle_enabled)) + if (do_notify || is_idle_task(rq->curr)) + update_builtin_idle(cpu, idle); +} + static void handle_hotplug(struct rq *rq, bool online) { int cpu = cpu_of(rq); @@ -4641,6 +4748,7 @@ bool task_should_scx(int policy) /** * scx_softlockup - sched_ext softlockup handler + * @dur_s: number of seconds of CPU stuck due to soft lockup * * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can * live-lock the system by making many CPUs target the same DSQ to the point @@ -4684,6 +4792,7 @@ static void scx_clear_softlockup(void) /** * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress + * @bypass: true for bypass, false for unbypass * * Bypassing guarantees that all runnable tasks make forward progress without * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might @@ -4744,10 +4853,9 @@ static void scx_ops_bypass(bool bypass) */ for_each_possible_cpu(cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; struct task_struct *p, *n; - rq_lock(rq, &rf); + raw_spin_rq_lock(rq); if (bypass) { WARN_ON_ONCE(rq->scx.flags & SCX_RQ_BYPASSING); @@ -4763,7 +4871,7 @@ static void scx_ops_bypass(bool bypass) * sees scx_rq_bypassing() before moving tasks to SCX. */ if (!scx_enabled()) { - rq_unlock(rq, &rf); + raw_spin_rq_unlock(rq); continue; } @@ -4783,10 +4891,11 @@ static void scx_ops_bypass(bool bypass) sched_enq_and_set_task(&ctx); } - rq_unlock(rq, &rf); - /* resched to restore ticks and idle state */ - resched_cpu(cpu); + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); + + raw_spin_rq_unlock(rq); } atomic_dec(&scx_ops_breather_depth); @@ -4852,7 +4961,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work) struct task_struct *p; struct rhashtable_iter rht_iter; struct scx_dispatch_q *dsq; - int i, kind; + int i, kind, cpu; kind = atomic_read(&scx_exit_kind); while (true) { @@ -4935,6 +5044,15 @@ static void scx_ops_disable_workfn(struct kthread_work *work) scx_task_iter_stop(&sti); percpu_up_write(&scx_fork_rwsem); + /* + * Invalidate all the rq clocks to prevent getting outdated + * rq clocks from a previous scx scheduler. + */ + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + scx_rq_clock_invalidate(rq); + } + /* no task is on scx, turn off all the switches and flush in-progress calls */ static_branch_disable(&__scx_ops_enabled); for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) @@ -5159,9 +5277,9 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, p->scx.dsq_flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu slice=%llu", p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, - p->scx.dsq_vtime); + p->scx.dsq_vtime, p->scx.slice); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { @@ -5352,7 +5470,7 @@ static struct kthread_worker *scx_create_rt_helper(const char *name) { struct kthread_worker *helper; - helper = kthread_create_worker(0, name); + helper = kthread_run_worker(0, name); if (helper) sched_set_fifo(helper->task); return helper; @@ -6236,6 +6354,15 @@ void __init init_sched_ext_class(void) __bpf_kfunc_start_defs(); +static bool check_builtin_idle_enabled(void) +{ + if (static_branch_likely(&scx_builtin_idle_enabled)) + return true; + + scx_ops_error("built-in idle tracking is disabled"); + return false; +} + /** * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() * @p: task_struct to select a CPU for @@ -6253,10 +6380,8 @@ __bpf_kfunc_start_defs(); __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) { - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); + if (!check_builtin_idle_enabled()) goto prev_cpu; - } if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) goto prev_cpu; @@ -6340,9 +6465,7 @@ __bpf_kfunc_start_defs(); * ops.select_cpu(), and ops.dispatch(). * * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch - * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be - * used to target the local DSQ of a CPU other than the enqueueing one. Use - * ops.select_cpu() to be on the target CPU in the first place. + * and @p must match the task being enqueued. * * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p * will be directly inserted into the corresponding dispatch queue after @@ -7181,7 +7304,7 @@ __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, } /** - * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler + * scx_bpf_dump_bstr - Generate extra debug dump specific to the BPF scheduler * @fmt: format string * @data: format string parameters packaged using ___bpf_fill() macro * @data__sz: @data len, must end in '__sz' for the verifier @@ -7273,7 +7396,6 @@ __bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) * scx_bpf_cpuperf_set - Set the relative performance target of a CPU * @cpu: CPU of interest * @perf: target performance level [0, %SCX_CPUPERF_ONE] - * @flags: %SCX_CPUPERF_* flags * * Set the target performance level of @cpu to @perf. @perf is in linear * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the @@ -7350,10 +7472,8 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) { - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); + if (!check_builtin_idle_enabled()) return cpu_none_mask; - } #ifdef CONFIG_SMP return idle_masks.cpu; @@ -7371,10 +7491,8 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) */ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) { - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); + if (!check_builtin_idle_enabled()) return cpu_none_mask; - } #ifdef CONFIG_SMP if (sched_smt_active()) @@ -7389,6 +7507,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) /** * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to * either the percpu, or SMT idle-tracking cpumask. + * @idle_mask: &cpumask to use */ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) { @@ -7412,10 +7531,8 @@ __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) */ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) { - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); + if (!check_builtin_idle_enabled()) return false; - } if (ops_cpu_valid(cpu, NULL)) return test_and_clear_cpu_idle(cpu); @@ -7445,10 +7562,8 @@ __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { - if (!static_branch_likely(&scx_builtin_idle_enabled)) { - scx_ops_error("built-in idle tracking is disabled"); + if (!check_builtin_idle_enabled()) return -EBUSY; - } return scx_pick_idle_cpu(cpus_allowed, flags); } @@ -7543,6 +7658,68 @@ out: } #endif +/** + * scx_bpf_now - Returns a high-performance monotonically non-decreasing + * clock for the current CPU. The clock returned is in nanoseconds. + * + * It provides the following properties: + * + * 1) High performance: Many BPF schedulers call bpf_ktime_get_ns() frequently + * to account for execution time and track tasks' runtime properties. + * Unfortunately, in some hardware platforms, bpf_ktime_get_ns() -- which + * eventually reads a hardware timestamp counter -- is neither performant nor + * scalable. scx_bpf_now() aims to provide a high-performance clock by + * using the rq clock in the scheduler core whenever possible. + * + * 2) High enough resolution for the BPF scheduler use cases: In most BPF + * scheduler use cases, the required clock resolution is lower than the most + * accurate hardware clock (e.g., rdtsc in x86). scx_bpf_now() basically + * uses the rq clock in the scheduler core whenever it is valid. It considers + * that the rq clock is valid from the time the rq clock is updated + * (update_rq_clock) until the rq is unlocked (rq_unpin_lock). + * + * 3) Monotonically non-decreasing clock for the same CPU: scx_bpf_now() + * guarantees the clock never goes backward when comparing them in the same + * CPU. On the other hand, when comparing clocks in different CPUs, there + * is no such guarantee -- the clock can go backward. It provides a + * monotonically *non-decreasing* clock so that it would provide the same + * clock values in two different scx_bpf_now() calls in the same CPU + * during the same period of when the rq clock is valid. + */ +__bpf_kfunc u64 scx_bpf_now(void) +{ + struct rq *rq; + u64 clock; + + preempt_disable(); + + rq = this_rq(); + if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) { + /* + * If the rq clock is valid, use the cached rq clock. + * + * Note that scx_bpf_now() is re-entrant between a process + * context and an interrupt context (e.g., timer interrupt). + * However, we don't need to consider the race between them + * because such race is not observable from a caller. + */ + clock = READ_ONCE(rq->scx.clock); + } else { + /* + * Otherwise, return a fresh rq clock. + * + * The rq clock is updated outside of the rq lock. + * In this case, keep the updated rq clock invalid so the next + * kfunc call outside the rq lock gets a fresh rq clock. + */ + clock = sched_clock_cpu(cpu_of(rq)); + } + + preempt_enable(); + + return clock; +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_any) @@ -7574,6 +7751,7 @@ BTF_ID_FLAGS(func, scx_bpf_cpu_rq) #ifdef CONFIG_CGROUP_SCHED BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) #endif +BTF_ID_FLAGS(func, scx_bpf_now) BTF_KFUNCS_END(scx_kfunc_ids_any) static const struct btf_kfunc_id_set scx_kfunc_set_any = { |