diff options
author | Tejun Heo <tj@kernel.org> | 2024-08-04 19:07:40 +0200 |
---|---|---|
committer | Tejun Heo <tj@kernel.org> | 2024-08-04 19:36:54 +0200 |
commit | 0df340ceae2e51ccc6a8a19f4182f389223fbfdf (patch) | |
tree | 635f0cdae0c4a1db431cd91158db34367c9c4218 /kernel/sched | |
parent | sched_ext: Allow p->scx.disallow only while loading (diff) | |
parent | sched/fair: Cleanup fair_server (diff) | |
download | linux-0df340ceae2e51ccc6a8a19f4182f389223fbfdf.tar.xz linux-0df340ceae2e51ccc6a8a19f4182f389223fbfdf.zip |
Merge branch 'sched/core' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip into for-6.12
Pull tip/sched/core to resolve the following four conflicts. While 2-4 are
simple context conflicts, 1 is a bit subtle and easy to resolve incorrectly.
1. 2c8d046d5d51 ("sched: Add normal_policy()")
vs.
faa42d29419d ("sched/fair: Make SCHED_IDLE entity be preempted in strict hierarchy")
The former converts direct test on p->policy to use the helper
normal_policy(). The latter moves the p->policy test to a different
location. Resolve by converting the test on p->plicy in the new location to
use normal_policy().
2. a7a9fc549293 ("sched_ext: Add boilerplate for extensible scheduler class")
vs.
a110a81c52a9 ("sched/deadline: Deferrable dl server")
Both add calls to put_prev_task_idle() and set_next_task_idle(). Simple
context conflict. Resolve by taking changes from both.
3. a7a9fc549293 ("sched_ext: Add boilerplate for extensible scheduler class")
vs.
c245910049d0 ("sched/core: Add clearing of ->dl_server in put_prev_task_balance()")
The former changes for_each_class() itertion to use for_each_active_class().
The latter moves away the adjacent dl_server handling code. Simple context
conflict. Resolve by taking changes from both.
4. 60c27fb59f6c ("sched_ext: Implement sched_ext_ops.cpu_online/offline()")
vs.
31b164e2e4af ("sched/smt: Introduce sched_smt_present_inc/dec() helper")
2f027354122f ("sched/core: Introduce sched_set_rq_on/offline() helper")
The former adds scx_rq_deactivate() call. The latter two change code around
it. Simple context conflict. Resolve by taking changes from both.
Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 146 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 6 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 449 | ||||
-rw-r--r-- | kernel/sched/debug.c | 166 | ||||
-rw-r--r-- | kernel/sched/fair.c | 116 | ||||
-rw-r--r-- | kernel/sched/features.h | 2 | ||||
-rw-r--r-- | kernel/sched/idle.c | 2 | ||||
-rw-r--r-- | kernel/sched/rt.c | 242 | ||||
-rw-r--r-- | kernel/sched/sched.h | 23 | ||||
-rw-r--r-- | kernel/sched/topology.c | 8 |
10 files changed, 879 insertions, 281 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 22f86d5e9231..ec67b30f87c4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -163,6 +163,9 @@ static inline int __task_prio(const struct task_struct *p) if (p->sched_class == &stop_sched_class) /* trumps deadline */ return -2; + if (p->dl_server) + return -1; /* deadline */ + if (rt_prio(p->prio)) /* includes deadline */ return p->prio; /* [-1, 99] */ @@ -195,8 +198,24 @@ static inline bool prio_less(const struct task_struct *a, if (-pb < -pa) return false; - if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ - return !dl_time_before(a->dl.deadline, b->dl.deadline); + if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ + const struct sched_dl_entity *a_dl, *b_dl; + + a_dl = &a->dl; + /* + * Since,'a' and 'b' can be CFS tasks served by DL server, + * __task_prio() can return -1 (for DL) even for those. In that + * case, get to the dl_server's DL entity. + */ + if (a->dl_server) + a_dl = a->dl_server; + + b_dl = &b->dl; + if (b->dl_server) + b_dl = b->dl_server; + + return !dl_time_before(a_dl->deadline, b_dl->deadline); + } if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ return cfs_prio_less(a, b, in_fi); @@ -1280,7 +1299,7 @@ bool sched_can_stop_tick(struct rq *rq) * dequeued by migrating while the constrained task continues to run. * E.g. going from 2->1 without going through pick_next_task(). */ - if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { + if (__need_bw_check(rq, rq->curr)) { if (cfs_task_bw_constrained(rq->curr)) return false; } @@ -2255,6 +2274,12 @@ void migrate_disable(void) struct task_struct *p = current; if (p->migration_disabled) { +#ifdef CONFIG_DEBUG_PREEMPT + /* + *Warn about overflow half-way through the range. + */ + WARN_ON_ONCE((s16)p->migration_disabled < 0); +#endif p->migration_disabled++; return; } @@ -2273,14 +2298,20 @@ void migrate_enable(void) .flags = SCA_MIGRATE_ENABLE, }; +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Check both overflow from migrate_disable() and superfluous + * migrate_enable(). + */ + if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) + return; +#endif + if (p->migration_disabled > 1) { p->migration_disabled--; return; } - if (WARN_ON_ONCE(!p->migration_disabled)) - return; - /* * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). @@ -4737,7 +4768,7 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); - activate_task(rq, p, ENQUEUE_NOCLOCK); + activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); trace_sched_wakeup_new(p); wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP @@ -5855,6 +5886,14 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, #endif put_prev_task(rq, prev); + + /* + * We've updated @prev and no longer need the server link, clear it. + * Must be done before ->pick_next_task() because that can (re)set + * ->dl_server. + */ + if (prev->dl_server) + prev->dl_server = NULL; } /* @@ -5889,6 +5928,13 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) } /* + * This is a normal CFS pick, but the previous could be a DL pick. + * Clear it as previous is no longer picked. + */ + if (prev->dl_server) + prev->dl_server = NULL; + + /* * This is the fast path; it cannot be a DL server pick; * therefore even if @p == @prev, ->dl_server must be NULL. */ @@ -5901,14 +5947,6 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) restart: put_prev_task_balance(rq, prev, rf); - /* - * We've updated @prev and no longer need the server link, clear it. - * Must be done before ->pick_next_task() because that can (re)set - * ->dl_server. - */ - if (prev->dl_server) - prev->dl_server = NULL; - for_each_active_class(class) { p = class->pick_next_task(rq); if (p) { @@ -7925,6 +7963,30 @@ void set_rq_offline(struct rq *rq) } } +static inline void sched_set_rq_online(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_online(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + +static inline void sched_set_rq_offline(struct rq *rq, int cpu) +{ + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); +} + /* * used to mark begin/end of suspend/resume: */ @@ -7975,10 +8037,25 @@ static int cpuset_cpu_inactive(unsigned int cpu) return 0; } +static inline void sched_smt_present_inc(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(&sched_smt_present); +#endif +} + +static inline void sched_smt_present_dec(int cpu) +{ +#ifdef CONFIG_SCHED_SMT + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(&sched_smt_present); +#endif +} + int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; /* * Clear the balance_push callback and prepare to schedule @@ -7986,13 +8063,10 @@ int sched_cpu_activate(unsigned int cpu) */ balance_push_set(cpu, false); -#ifdef CONFIG_SCHED_SMT /* * When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_inc_cpuslocked(&sched_smt_present); -#endif + sched_smt_present_inc(cpu); set_cpu_active(cpu, true); if (sched_smp_initialized) { @@ -8012,12 +8086,7 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_online(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_online(rq, cpu); return 0; } @@ -8025,7 +8094,6 @@ int sched_cpu_activate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - struct rq_flags rf; int ret; /* @@ -8056,22 +8124,16 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - rq_unlock_irqrestore(rq, &rf); + sched_set_rq_offline(rq, cpu); scx_rq_deactivate(rq); -#ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) == 2) - static_branch_dec_cpuslocked(&sched_smt_present); + sched_smt_present_dec(cpu); +#ifdef CONFIG_SCHED_SMT sched_core_cpu_deactivate(cpu); #endif @@ -8081,6 +8143,8 @@ int sched_cpu_deactivate(unsigned int cpu) sched_update_numa(cpu, false); ret = cpuset_cpu_inactive(cpu); if (ret) { + sched_smt_present_inc(cpu); + sched_set_rq_online(rq, cpu); balance_push_set(cpu, false); set_cpu_active(cpu, true); sched_update_numa(cpu, true); @@ -8290,8 +8354,6 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ } - init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); - #ifdef CONFIG_SMP init_defrootdomain(); #endif @@ -8346,8 +8408,13 @@ void __init sched_init(void) init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED + /* + * This is required for init cpu because rt.c:__enable_runtime() + * starts working after scheduler_running, which is not the case + * yet. + */ + rq->rt.rt_runtime = global_rt_runtime(); init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); #endif #ifdef CONFIG_SMP @@ -8379,6 +8446,7 @@ void __init sched_init(void) #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); + fair_server_init(rq); #ifdef CONFIG_SCHED_CORE rq->core = rq; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a5e00293ae43..0bed0fa1acd9 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -582,6 +582,12 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, } stime = mul_u64_u64_div_u64(stime, rtime, stime + utime); + /* + * Because mul_u64_u64_div_u64() can approximate on some + * achitectures; enforce the constraint that: a*b/(b+c) <= a. + */ + if (unlikely(stime > rtime)) + stime = rtime; update: /* diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f59e5c19d944..c5f1cc753a31 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -320,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) __sub_running_bw(dl_se->dl_bw, dl_rq); } -static void dl_change_utilization(struct task_struct *p, u64 new_bw) +static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) { - struct rq *rq; - - WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); - - if (task_on_rq_queued(p)) - return; + if (dl_se->dl_non_contending) { + sub_running_bw(dl_se, &rq->dl); + dl_se->dl_non_contending = 0; - rq = task_rq(p); - if (p->dl.dl_non_contending) { - sub_running_bw(&p->dl, &rq->dl); - p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the * timer cannot be canceled, inactive_task_timer() @@ -340,13 +333,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) * will not touch the rq's active utilization, * so we are still safe. */ - if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) - put_task_struct(p); + if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { + if (!dl_server(dl_se)) + put_task_struct(dl_task_of(dl_se)); + } } - __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __sub_rq_bw(dl_se->dl_bw, &rq->dl); __add_rq_bw(new_bw, &rq->dl); } +static void dl_change_utilization(struct task_struct *p, u64 new_bw) +{ + WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); + + if (task_on_rq_queued(p)) + return; + + dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); +} + static void __dl_clear_params(struct sched_dl_entity *dl_se); /* @@ -771,6 +776,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, /* for non-boosted task, pi_of(dl_se) == dl_se */ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; dl_se->runtime = pi_of(dl_se)->dl_runtime; + + /* + * If it is a deferred reservation, and the server + * is not handling an starvation case, defer it. + */ + if (dl_se->dl_defer & !dl_se->dl_defer_running) { + dl_se->dl_throttled = 1; + dl_se->dl_defer_armed = 1; + } } /* @@ -809,6 +823,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) replenish_dl_new_period(dl_se, rq); } +static int start_dl_timer(struct sched_dl_entity *dl_se); +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); + /* * Pure Earliest Deadline First (EDF) scheduling does not deal with the * possibility of a entity lasting more than what it declared, and thus @@ -837,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. + * + * Or, it could be the case of a deferred reservation that + * was not able to consume its runtime in background and + * reached this point with current u > U. + * + * In both cases, set a new period. */ - if (dl_se->dl_deadline == 0) - replenish_dl_new_period(dl_se, rq); + if (dl_se->dl_deadline == 0 || + (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; + } if (dl_se->dl_yielded && dl_se->runtime > 0) dl_se->runtime = 0; @@ -873,6 +899,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) dl_se->dl_yielded = 0; if (dl_se->dl_throttled) dl_se->dl_throttled = 0; + + /* + * If this is the replenishment of a deferred reservation, + * clear the flag and return. + */ + if (dl_se->dl_defer_armed) { + dl_se->dl_defer_armed = 0; + return; + } + + /* + * A this point, if the deferred server is not armed, and the deadline + * is in the future, if it is not running already, throttle the server + * and arm the defer timer. + */ + if (dl_se->dl_defer && !dl_se->dl_defer_running && + dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { + if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { + + /* + * Set dl_se->dl_defer_armed and dl_throttled variables to + * inform the start_dl_timer() that this is a deferred + * activation. + */ + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + if (!start_dl_timer(dl_se)) { + /* + * If for whatever reason (delays), a previous timer was + * queued but not serviced, cancel it and clean the + * deferrable server variables intended for start_dl_timer(). + */ + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + } + } } /* @@ -1023,6 +1087,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) } replenish_dl_new_period(dl_se, rq); + } else if (dl_server(dl_se) && dl_se->dl_defer) { + /* + * The server can still use its previous deadline, so check if + * it left the dl_defer_running state. + */ + if (!dl_se->dl_defer_running) { + dl_se->dl_defer_armed = 1; + dl_se->dl_throttled = 1; + } } } @@ -1055,8 +1128,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) * We want the timer to fire at the deadline, but considering * that it is actually coming from rq->clock and not from * hrtimer's time base reading. + * + * The deferred reservation will have its timer set to + * (deadline - runtime). At that point, the CBS rule will decide + * if the current deadline can be used, or if a replenishment is + * required to avoid add too much pressure on the system + * (current u > U). */ - act = ns_to_ktime(dl_next_period(dl_se)); + if (dl_se->dl_defer_armed) { + WARN_ON_ONCE(!dl_se->dl_throttled); + act = ns_to_ktime(dl_se->deadline - dl_se->runtime); + } else { + /* act = deadline - rel-deadline + period */ + act = ns_to_ktime(dl_next_period(dl_se)); + } + now = hrtimer_cb_get_time(timer); delta = ktime_to_ns(now) - rq_clock(rq); act = ktime_add_ns(act, delta); @@ -1106,6 +1192,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) #endif } +/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ +static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; + +static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) +{ + struct rq *rq = rq_of_dl_se(dl_se); + u64 fw; + + scoped_guard (rq_lock, rq) { + struct rq_flags *rf = &scope.rf; + + if (!dl_se->dl_throttled || !dl_se->dl_runtime) + return HRTIMER_NORESTART; + + sched_clock_tick(); + update_rq_clock(rq); + + if (!dl_se->dl_runtime) + return HRTIMER_NORESTART; + + if (!dl_se->server_has_tasks(dl_se)) { + replenish_dl_entity(dl_se); + return HRTIMER_NORESTART; + } + + if (dl_se->dl_defer_armed) { + /* + * First check if the server could consume runtime in background. + * If so, it is possible to push the defer timer for this amount + * of time. The dl_server_min_res serves as a limit to avoid + * forwarding the timer for a too small amount of time. + */ + if (dl_time_before(rq_clock(dl_se->rq), + (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { + + /* reset the defer timer */ + fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; + + hrtimer_forward_now(timer, ns_to_ktime(fw)); + return HRTIMER_RESTART; + } + + dl_se->dl_defer_running = 1; + } + + enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); + + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) + resched_curr(rq); + + __push_dl_task(rq, rf); + } + + return HRTIMER_NORESTART; +} + /* * This is the bandwidth enforcement timer callback. If here, we know * a task is not on its dl_rq, since the fact that the timer was running @@ -1128,28 +1270,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) struct rq_flags rf; struct rq *rq; - if (dl_server(dl_se)) { - struct rq *rq = rq_of_dl_se(dl_se); - struct rq_flags rf; - - rq_lock(rq, &rf); - if (dl_se->dl_throttled) { - sched_clock_tick(); - update_rq_clock(rq); - - if (dl_se->server_has_tasks(dl_se)) { - enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); - resched_curr(rq); - __push_dl_task(rq, &rf); - } else { - replenish_dl_entity(dl_se); - } - - } - rq_unlock(rq, &rf); - - return HRTIMER_NORESTART; - } + if (dl_server(dl_se)) + return dl_server_timer(timer, dl_se); p = dl_task_of(dl_se); rq = task_rq_lock(p, &rf); @@ -1319,22 +1441,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) return (delta * u_act) >> BW_SHIFT; } -static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); -static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { s64 scaled_delta_exec; - if (unlikely(delta_exec <= 0)) { - if (unlikely(dl_se->dl_yielded)) - goto throttle; - return; - } - - if (dl_entity_is_special(dl_se)) - return; - /* * For tasks that participate in GRUB, we implement GRUB-PA: the * spare reclaimed bandwidth is used to clock down frequency. @@ -1353,8 +1463,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); } + return scaled_delta_exec; +} + +static inline void +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, + int flags); +static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) +{ + s64 scaled_delta_exec; + + if (unlikely(delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; + return; + } + + if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) + return; + + if (dl_entity_is_special(dl_se)) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); + dl_se->runtime -= scaled_delta_exec; + /* + * The fair server can consume its runtime while throttled (not queued/ + * running as regular CFS). + * + * If the server consumes its entire runtime in this state. The server + * is not required for the current period. Thus, reset the server by + * starting a new period, pushing the activation. + */ + if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { + /* + * If the server was previously activated - the starving condition + * took place, it this point it went away because the fair scheduler + * was able to get runtime in background. So return to the initial + * state. + */ + dl_se->dl_defer_running = 0; + + hrtimer_try_to_cancel(&dl_se->dl_timer); + + replenish_dl_new_period(dl_se, dl_se->rq); + + /* + * Not being able to start the timer seems problematic. If it could not + * be started for whatever reason, we need to "unthrottle" the DL server + * and queue right away. Otherwise nothing might queue it. That's similar + * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. + */ + WARN_ON_ONCE(!start_dl_timer(dl_se)); + + return; + } + throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; @@ -1382,6 +1548,14 @@ throttle: } /* + * The fair server (sole dl_server) does not account for real-time + * workload because it is running fair work. + */ + if (dl_se == &rq->fair_server) + return; + +#ifdef CONFIG_RT_GROUP_SCHED + /* * Because -- for now -- we share the rt bandwidth, we need to * account our runtime there too, otherwise actual rt tasks * would be able to exceed the shared quota. @@ -1405,34 +1579,157 @@ throttle: rt_rq->rt_time += delta_exec; raw_spin_unlock(&rt_rq->rt_runtime_lock); } +#endif +} + +/* + * In the non-defer mode, the idle time is not accounted, as the + * server provides a guarantee. + * + * If the dl_server is in defer mode, the idle time is also considered + * as time available for the fair server, avoiding a penalty for the + * rt scheduler that did not consumed that time. + */ +void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +{ + s64 delta_exec, scaled_delta_exec; + + if (!rq->fair_server.dl_defer) + return; + + /* no need to discount more */ + if (rq->fair_server.runtime < 0) + return; + + delta_exec = rq_clock_task(rq) - p->se.exec_start; + if (delta_exec < 0) + return; + + scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); + + rq->fair_server.runtime -= scaled_delta_exec; + + if (rq->fair_server.runtime < 0) { + rq->fair_server.dl_defer_running = 0; + rq->fair_server.runtime = 0; + } + + p->se.exec_start = rq_clock_task(rq); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { - update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + /* 0 runtime = fair server disabled */ + if (dl_se->dl_runtime) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_start(struct sched_dl_entity *dl_se) { + struct rq *rq = dl_se->rq; + + /* + * XXX: the apply do not work fine at the init phase for the + * fair server because things are not yet set. We need to improve + * this before getting generic. + */ if (!dl_server(dl_se)) { + u64 runtime = 50 * NSEC_PER_MSEC; + u64 period = 1000 * NSEC_PER_MSEC; + + dl_server_apply_params(dl_se, runtime, period, 1); + dl_se->dl_server = 1; + dl_se->dl_defer = 1; setup_new_dl_entity(dl_se); } + + if (!dl_se->dl_runtime) + return; + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); + if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) + resched_curr(dl_se->rq); } void dl_server_stop(struct sched_dl_entity *dl_se) { + if (!dl_se->dl_runtime) + return; + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; } void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick) + dl_server_pick_f pick_next, + dl_server_pick_f pick_task) { dl_se->rq = rq; dl_se->server_has_tasks = has_tasks; - dl_se->server_pick = pick; + dl_se->server_pick_next = pick_next; + dl_se->server_pick_task = pick_task; +} + +void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) +{ + u64 new_bw = dl_se->dl_bw; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + + dl_b = dl_bw_of(cpu_of(rq)); + guard(raw_spinlock)(&dl_b->lock); + + if (!dl_bw_cpus(cpu)) + return; + + __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); +} + +int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) +{ + u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); + u64 new_bw = to_ratio(period, runtime); + struct rq *rq = dl_se->rq; + int cpu = cpu_of(rq); + struct dl_bw *dl_b; + unsigned long cap; + int retval = 0; + int cpus; + + dl_b = dl_bw_of(cpu); + guard(raw_spinlock)(&dl_b->lock); + + cpus = dl_bw_cpus(cpu); + cap = dl_bw_capacity(cpu); + + if (__dl_overflow(dl_b, cap, old_bw, new_bw)) + return -EBUSY; + + if (init) { + __add_rq_bw(new_bw, &rq->dl); + __dl_add(dl_b, new_bw, cpus); + } else { + __dl_sub(dl_b, dl_se->dl_bw, cpus); + __dl_add(dl_b, new_bw, cpus); + + dl_rq_change_utilization(rq, dl_se, new_bw); + } + + dl_se->dl_runtime = runtime; + dl_se->dl_deadline = period; + dl_se->dl_period = period; + + dl_se->runtime = 0; + dl_se->deadline = 0; + + dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); + dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); + + return retval; } /* @@ -1735,7 +2032,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) * be counted in the active utilization; hence, we need to call * add_running_bw(). */ - if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { if (flags & ENQUEUE_WAKEUP) task_contending(dl_se, flags); @@ -1757,6 +2054,25 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) setup_new_dl_entity(dl_se); } + /* + * If the reservation is still throttled, e.g., it got replenished but is a + * deferred task and still got to wait, don't enqueue. + */ + if (dl_se->dl_throttled && start_dl_timer(dl_se)) + return; + + /* + * We're about to enqueue, make sure we're not ->dl_throttled! + * In case the timer was not started, say because the defer time + * has passed, mark as not throttled and mark unarmed. + * Also cancel earlier timers, since letting those run is pointless. + */ + if (dl_se->dl_throttled) { + hrtimer_try_to_cancel(&dl_se->dl_timer); + dl_se->dl_defer_armed = 0; + dl_se->dl_throttled = 0; + } + __enqueue_dl_entity(dl_se); } @@ -2086,7 +2402,12 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) return __node_2_dle(left); } -static struct task_struct *pick_task_dl(struct rq *rq) +/* + * __pick_next_task_dl - Helper to pick the next -deadline task to run. + * @rq: The runqueue to pick the next task from. + * @peek: If true, just peek at the next task. Only relevant for dlserver. + */ +static struct task_struct *__pick_next_task_dl(struct rq *rq, bool peek) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2100,7 +2421,10 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick(dl_se); + if (IS_ENABLED(CONFIG_SMP) && peek) + p = dl_se->server_pick_task(dl_se); + else + p = dl_se->server_pick_next(dl_se); if (!p) { WARN_ON_ONCE(1); dl_se->dl_yielded = 1; @@ -2115,11 +2439,18 @@ again: return p; } +#ifdef CONFIG_SMP +static struct task_struct *pick_task_dl(struct rq *rq) +{ + return __pick_next_task_dl(rq, true); +} +#endif + static struct task_struct *pick_next_task_dl(struct rq *rq) { struct task_struct *p; - p = pick_task_dl(rq); + p = __pick_next_task_dl(rq, false); if (!p) return p; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index c057ef46c5f8..0148bc65d39c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = { .release = seq_release, }; +enum dl_param { + DL_RUNTIME = 0, + DL_PERIOD, +}; + +static unsigned long fair_server_period_max = (1 << 22) * NSEC_PER_USEC; /* ~4 seconds */ +static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ + +static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos, enum dl_param param) +{ + long cpu = (long) ((struct seq_file *) filp->private_data)->private; + struct rq *rq = cpu_rq(cpu); + u64 runtime, period; + size_t err; + int retval; + u64 value; + + err = kstrtoull_from_user(ubuf, cnt, 10, &value); + if (err) + return err; + + scoped_guard (rq_lock_irqsave, rq) { + runtime = rq->fair_server.dl_runtime; + period = rq->fair_server.dl_period; + + switch (param) { + case DL_RUNTIME: + if (runtime == value) + break; + runtime = value; + break; + case DL_PERIOD: + if (value == period) + break; + period = value; + break; + } + + if (runtime > period || + period > fair_server_period_max || + period < fair_server_period_min) { + return -EINVAL; + } + + if (rq->cfs.h_nr_running) { + update_rq_clock(rq); + dl_server_stop(&rq->fair_server); + } + + retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); + if (retval) + cnt = retval; + + if (!runtime) + printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", + cpu_of(rq)); + + if (rq->cfs.h_nr_running) + dl_server_start(&rq->fair_server); + } + + *ppos += cnt; + return cnt; +} + +static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) +{ + unsigned long cpu = (unsigned long) m->private; + struct rq *rq = cpu_rq(cpu); + u64 value; + + switch (param) { + case DL_RUNTIME: + value = rq->fair_server.dl_runtime; + break; + case DL_PERIOD: + value = rq->fair_server.dl_period; + break; + } + + seq_printf(m, "%llu\n", value); + return 0; + +} + +static ssize_t +sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); +} + +static int sched_fair_server_runtime_show(struct seq_file *m, void *v) +{ + return sched_fair_server_show(m, v, DL_RUNTIME); +} + +static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_fair_server_runtime_show, inode->i_private); +} + +static const struct file_operations fair_server_runtime_fops = { + .open = sched_fair_server_runtime_open, + .write = sched_fair_server_runtime_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static ssize_t +sched_fair_server_period_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); +} + +static int sched_fair_server_period_show(struct seq_file *m, void *v) +{ + return sched_fair_server_show(m, v, DL_PERIOD); +} + +static int sched_fair_server_period_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_fair_server_period_show, inode->i_private); +} + +static const struct file_operations fair_server_period_fops = { + .open = sched_fair_server_period_open, + .write = sched_fair_server_period_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + static struct dentry *debugfs_sched; +static void debugfs_fair_server_init(void) +{ + struct dentry *d_fair; + unsigned long cpu; + + d_fair = debugfs_create_dir("fair_server", debugfs_sched); + if (!d_fair) + return; + + for_each_possible_cpu(cpu) { + struct dentry *d_cpu; + char buf[32]; + + snprintf(buf, sizeof(buf), "cpu%lu", cpu); + d_cpu = debugfs_create_dir(buf, d_fair); + + debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); + debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); + } +} + static __init int sched_init_debug(void) { struct dentry __maybe_unused *numa; @@ -374,6 +531,8 @@ static __init int sched_init_debug(void) debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + debugfs_fair_server_init(); + return 0; } late_initcall(sched_init_debug); @@ -641,8 +800,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, "\n"); SEQ_printf(m, "cfs_rq[%d]:\n", cpu); #endif - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", - SPLIT_NS(cfs_rq->exec_clock)); raw_spin_rq_lock_irqsave(rq, flags); root = __pick_root_entity(cfs_rq); @@ -669,8 +826,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SPLIT_NS(right_vruntime)); spread = right_vruntime - left_vruntime; SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); - SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", - cfs_rq->nr_spread_over); SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", @@ -730,9 +885,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) PU(rt_nr_running); + +#ifdef CONFIG_RT_GROUP_SCHED P(rt_throttled); PN(rt_time); PN(rt_runtime); +#endif #undef PN #undef PU diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5904405ffc59..8a37409c23f5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -511,7 +511,7 @@ static int cfs_rq_is_idle(struct cfs_rq *cfs_rq) static int se_is_idle(struct sched_entity *se) { - return 0; + return task_has_idle_policy(task_of(se)); } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -1156,12 +1156,13 @@ s64 update_curr_common(struct rq *rq) static void update_curr(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); s64 delta_exec; if (unlikely(!curr)) return; - delta_exec = update_curr_se(rq_of(cfs_rq), curr); + delta_exec = update_curr_se(rq, curr); if (unlikely(delta_exec <= 0)) return; @@ -1169,8 +1170,19 @@ static void update_curr(struct cfs_rq *cfs_rq) update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); - if (entity_is_task(curr)) - update_curr_task(task_of(curr), delta_exec); + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + + update_curr_task(p, delta_exec); + + /* + * Any fair task that runs outside of fair_server should + * account against fair_server such that it can account for + * this time and possibly avoid running this period. + */ + if (p->dl_server != &rq->fair_server) + dl_server_update(&rq->fair_server, delta_exec); + } account_cfs_rq_runtime(cfs_rq, delta_exec); } @@ -5766,6 +5778,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta, dequeue = 1; + long rq_h_nr_running = rq->cfs.h_nr_running; raw_spin_lock(&cfs_b->lock); /* This will start the period timer if necessary */ @@ -5837,6 +5850,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, task_delta); + /* Stop the fair server if throttling resulted in no runnable tasks */ + if (rq_h_nr_running && !rq->cfs.h_nr_running) + dl_server_stop(&rq->fair_server); done: /* * Note: distribution will already see us throttled via the @@ -5855,6 +5871,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, idle_task_delta; + long rq_h_nr_running = rq->cfs.h_nr_running; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -5924,6 +5941,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) goto unthrottle_throttle; } + /* Start the fair server if un-throttling resulted in new runnable tasks */ + if (!rq_h_nr_running && rq->cfs.h_nr_running) + dl_server_start(&rq->fair_server); + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); @@ -6556,7 +6577,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) { int cpu = cpu_of(rq); - if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) + if (!cfs_bandwidth_used()) return; if (!tick_nohz_full_cpu(cpu)) @@ -6751,6 +6772,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); int task_new = !(flags & ENQUEUE_WAKEUP); + int rq_h_nr_running = rq->cfs.h_nr_running; /* * The code below (indirectly) updates schedutil which looks at @@ -6805,6 +6827,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) goto enqueue_throttle; } + if (!rq_h_nr_running && rq->cfs.h_nr_running) { + /* Account for idle runtime */ + if (!rq->nr_running) + dl_server_update_idle_time(rq, rq->curr); + dl_server_start(&rq->fair_server); + } + /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -6845,6 +6874,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) int task_sleep = flags & DEQUEUE_SLEEP; int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + int rq_h_nr_running = rq->cfs.h_nr_running; util_est_dequeue(&rq->cfs, p); @@ -6899,6 +6929,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); + if (rq_h_nr_running && !rq->cfs.h_nr_running) + dl_server_stop(&rq->fair_server); + /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; @@ -8382,16 +8415,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (test_tsk_need_resched(curr)) return; - /* Idle tasks are by definition preempted by non-idle tasks. */ - if (unlikely(task_has_idle_policy(curr)) && - likely(!task_has_idle_policy(p))) - goto preempt; - - /* - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ - if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) + if (!sched_feat(WAKEUP_PREEMPTION)) return; find_matching_se(&se, &pse); @@ -8401,7 +8425,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int pse_is_idle = se_is_idle(pse); /* - * Preempt an idle group in favor of a non-idle group (and don't preempt + * Preempt an idle entity in favor of a non-idle entity (and don't preempt * in the inverse case). */ if (cse_is_idle && !pse_is_idle) @@ -8409,9 +8433,14 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (cse_is_idle != pse_is_idle) return; + /* + * BATCH and IDLE tasks do not preempt others. + */ + if (unlikely(!normal_policy(p->policy))) + return; + cfs_rq = cfs_rq_of(se); update_curr(cfs_rq); - /* * XXX pick_eevdf(cfs_rq) != se ? */ @@ -8453,6 +8482,14 @@ again: cfs_rq = group_cfs_rq(se); } while (cfs_rq); + /* + * This can be called from directly from CFS's ->pick_task() or indirectly + * from DL's ->pick_task when fair server is enabled. In the indirect case, + * DL will set ->dl_server just after this function is called, so its Ok to + * clear. In the direct case, we are picking directly so we must clear it. + */ + task_of(se)->dl_server = NULL; + return task_of(se); } #endif @@ -8607,6 +8644,36 @@ static struct task_struct *__pick_next_task_fair(struct rq *rq) return pick_next_task_fair(rq, NULL, NULL); } +static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) +{ + return !!dl_se->rq->cfs.nr_running; +} + +static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_SMP + return pick_task_fair(dl_se->rq); +#else + return NULL; +#endif +} + +static struct task_struct *fair_server_pick_next(struct sched_dl_entity *dl_se) +{ + return pick_next_task_fair(dl_se->rq, NULL, NULL); +} + +void fair_server_init(struct rq *rq) +{ + struct sched_dl_entity *dl_se = &rq->fair_server; + + init_dl_entity(dl_se); + + dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_next, + fair_server_pick_task); + +} + /* * Account for a descheduled task: */ @@ -12693,22 +12760,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) */ static void task_fork_fair(struct task_struct *p) { - struct sched_entity *se = &p->se, *curr; - struct cfs_rq *cfs_rq; - struct rq *rq = this_rq(); - struct rq_flags rf; - - rq_lock(rq, &rf); - update_rq_clock(rq); - set_task_max_allowed_capacity(p); - - cfs_rq = task_cfs_rq(current); - curr = cfs_rq->curr; - if (curr) - update_curr(cfs_rq); - place_entity(cfs_rq, se, ENQUEUE_INITIAL); - rq_unlock(rq, &rf); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 143f55df890b..929021fd6bc4 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -85,5 +85,3 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) - -SCHED_FEAT(HZ_BW, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c7a218123b7a..e53e2da04ba4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -452,6 +452,7 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { + dl_server_update_idle_time(rq, prev); scx_update_idle(rq, false); } @@ -460,6 +461,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir update_idle_core(rq); scx_update_idle(rq, true); schedstat_inc(rq->sched_goidle); + next->se.exec_start = rq_clock_task(rq); } #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 310523c1b9e3..a8731da04cd5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE; /* More than 4 hours if BW_SHIFT equals 20. */ static const u64 max_rt_runtime = MAX_BW; -static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); - -struct rt_bandwidth def_rt_bandwidth; - /* * period over which we measure -rt task CPU usage in us. * default: 1s @@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void) late_initcall(sched_rt_sysctl_init); #endif +void init_rt_rq(struct rt_rq *rt_rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif /* CONFIG_SMP */ + /* We start is dequeued state, because no RT tasks are queued */ + rt_rq->rt_queued = 0; + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); +#endif +} + +#ifdef CONFIG_RT_GROUP_SCHED + +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = @@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) do_start_rt_bandwidth(rt_b); } -void init_rt_rq(struct rt_rq *rt_rq) -{ - struct rt_prio_array *array; - int i; - - array = &rt_rq->active; - for (i = 0; i < MAX_RT_PRIO; i++) { - INIT_LIST_HEAD(array->queue + i); - __clear_bit(i, array->bitmap); - } - /* delimiter for bit-search: */ - __set_bit(MAX_RT_PRIO, array->bitmap); - -#if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO-1; - rt_rq->highest_prio.next = MAX_RT_PRIO-1; - rt_rq->overloaded = 0; - plist_head_init(&rt_rq->pushable_tasks); -#endif /* CONFIG_SMP */ - /* We start is dequeued state, because no RT tasks are queued */ - rt_rq->rt_queued = 0; - - rt_rq->rt_time = 0; - rt_rq->rt_throttled = 0; - rt_rq->rt_runtime = 0; - raw_spin_lock_init(&rt_rq->rt_runtime_lock); -} - -#ifdef CONFIG_RT_GROUP_SCHED static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) { hrtimer_cancel(&rt_b->rt_period_timer); @@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg) { if (tg->rt_se) destroy_rt_bandwidth(&tg->rt_bandwidth); - } void free_rt_sched_group(struct task_group *tg) @@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!tg->rt_se) goto err; - init_rt_bandwidth(&tg->rt_bandwidth, - ktime_to_ns(def_rt_bandwidth.rt_period), 0); + init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); for_each_possible_cpu(i) { rt_rq = kzalloc_node(sizeof(struct rt_rq), @@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) return &rt_rq->tg->rt_bandwidth; } -#else /* !CONFIG_RT_GROUP_SCHED */ - -static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) -{ - return rt_rq->rt_runtime; -} - -static inline u64 sched_rt_period(struct rt_rq *rt_rq) -{ - return ktime_to_ns(def_rt_bandwidth.rt_period); -} - -typedef struct rt_rq *rt_rq_iter_t; - -#define for_each_rt_rq(rt_rq, iter, rq) \ - for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) - -#define for_each_sched_rt_entity(rt_se) \ - for (; rt_se; rt_se = NULL) - -static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) -{ - return NULL; -} - -static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) -{ - struct rq *rq = rq_of_rt_rq(rt_rq); - - if (!rt_rq->rt_nr_running) - return; - - enqueue_top_rt_rq(rt_rq); - resched_curr(rq); -} - -static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) -{ - dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); -} - -static inline int rt_rq_throttled(struct rt_rq *rt_rq) -{ - return rt_rq->rt_throttled; -} - -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - -static inline -struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) -{ - return &cpu_rq(cpu)->rt; -} - -static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) -{ - return &def_rt_bandwidth; -} - -#endif /* CONFIG_RT_GROUP_SCHED */ - bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) { struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); @@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) const struct cpumask *span; span = sched_rt_period_mask(); -#ifdef CONFIG_RT_GROUP_SCHED + /* * FIXME: isolated CPUs should really leave the root task group, * whether they are isolcpus or were isolated via cpusets, lest @@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) */ if (rt_b == &root_task_group.rt_bandwidth) span = cpu_online_mask; -#endif + for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); @@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return idle; } -static inline int rt_se_prio(struct sched_rt_entity *rt_se) -{ -#ifdef CONFIG_RT_GROUP_SCHED - struct rt_rq *rt_rq = group_rt_rq(rt_se); - - if (rt_rq) - return rt_rq->highest_prio.curr; -#endif - - return rt_task_of(rt_se)->prio; -} - static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); @@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) return 0; } +#else /* !CONFIG_RT_GROUP_SCHED */ + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return NULL; +} + +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (!rt_rq->rt_nr_running) + return; + + enqueue_top_rt_rq(rt_rq); + resched_curr(rq); +} + +static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ + dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return false; +} + +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +#ifdef CONFIG_SMP +static void __enable_runtime(struct rq *rq) { } +static void __disable_runtime(struct rq *rq) { } +#endif + +#endif /* CONFIG_RT_GROUP_SCHED */ + +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) + return rt_rq->highest_prio.curr; +#endif + + return rt_task_of(rt_se)->prio; +} + /* * Update the current task's runtime statistics. Skip current tasks that * are not in our scheduling class. @@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; - struct sched_rt_entity *rt_se = &curr->rt; s64 delta_exec; if (curr->sched_class != &rt_sched_class) @@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq) if (unlikely(delta_exec <= 0)) return; +#ifdef CONFIG_RT_GROUP_SCHED + struct sched_rt_entity *rt_se = &curr->rt; + if (!rt_bandwidth_enabled()) return; @@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq) do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); } } +#endif } static void @@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { - start_rt_bandwidth(&def_rt_bandwidth); } static inline @@ -2912,19 +2903,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) #ifdef CONFIG_SYSCTL static int sched_rt_global_constraints(void) { - unsigned long flags; - int i; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - for_each_possible_cpu(i) { - struct rt_rq *rt_rq = &cpu_rq(i)->rt; - - raw_spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_runtime = global_rt_runtime(); - raw_spin_unlock(&rt_rq->rt_runtime_lock); - } - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); - return 0; } #endif /* CONFIG_SYSCTL */ @@ -2944,12 +2922,6 @@ static int sched_rt_global_validate(void) static void sched_rt_do_global(void) { - unsigned long flags; - - raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); - def_rt_bandwidth.rt_runtime = global_rt_runtime(); - def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); - raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); } static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 42b4d1428c2c..c74ef008f3ae 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -362,7 +362,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr); extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); extern int dl_bw_check_overflow(int cpu); - +extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); /* * SCHED_DEADLINE supports servers (nested scheduling) with the following * interface: @@ -388,7 +388,15 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_has_tasks_f has_tasks, - dl_server_pick_f pick); + dl_server_pick_f pick_next, + dl_server_pick_f pick_task); + +extern void dl_server_update_idle_time(struct rq *rq, + struct task_struct *p); +extern void fair_server_init(struct rq *rq); +extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); +extern int dl_server_apply_params(struct sched_dl_entity *dl_se, + u64 runtime, u64 period, bool init); #ifdef CONFIG_CGROUP_SCHED @@ -631,7 +639,6 @@ struct cfs_rq { s64 avg_vruntime; u64 avg_load; - u64 exec_clock; u64 min_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; @@ -651,10 +658,6 @@ struct cfs_rq { struct sched_entity *curr; struct sched_entity *next; -#ifdef CONFIG_SCHED_DEBUG - unsigned int nr_spread_over; -#endif - #ifdef CONFIG_SMP /* * CFS load tracking @@ -794,13 +797,13 @@ struct rt_rq { #endif /* CONFIG_SMP */ int rt_queued; +#ifdef CONFIG_RT_GROUP_SCHED int rt_throttled; u64 rt_time; u64 rt_runtime; /* Nests inside the rq lock: */ raw_spinlock_t rt_runtime_lock; -#ifdef CONFIG_RT_GROUP_SCHED unsigned int rt_nr_boosted; struct rq *rq; @@ -1110,6 +1113,8 @@ struct rq { struct scx_rq scx; #endif + struct sched_dl_entity fair_server; + #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this CPU: */ struct list_head leaf_cfs_rq_list; @@ -1224,7 +1229,6 @@ struct rq { /* latency stats */ struct sched_info rq_sched_info; unsigned long long rq_cpu_time; - /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ /* sys_sched_yield() stats */ unsigned int yld_count; @@ -2619,7 +2623,6 @@ extern void init_sched_fair_class(void); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); -extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 76504b776d03..9748a4c8d668 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); + /* + * Because the rq is not a task, dl_add_task_root_domain() did not + * move the fair server bw to the rd if it already started. + * Add it now. + */ + if (rq->fair_server.dl_server) + __dl_server_attach_root(&rq->fair_server, rq); + rq_unlock_irqrestore(rq, &rf); if (old_rd) |