Diffstat (limited to 'tools/sched_ext/scx_simple.bpf.c')
 tools/sched_ext/scx_simple.bpf.c | 97 +++++++++++++++++++++++++++++++++++---
 1 file changed, 95 insertions(+), 2 deletions(-)
diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c
index 6bb13a3c801b..ed7e8d535fc5 100644
--- a/tools/sched_ext/scx_simple.bpf.c
+++ b/tools/sched_ext/scx_simple.bpf.c
@@ -2,11 +2,20 @@
/*
* A simple scheduler.
*
- * A simple global FIFO scheduler. It also demonstrates the following niceties.
+ * By default, it operates as a simple global weighted vtime scheduler and can
+ * be switched to FIFO scheduling. It also demonstrates the following niceties.
*
* - Statistics tracking how many tasks are queued to local and global dsq's.
* - Termination notification for userspace.
*
+ * While very simple, this scheduler should work reasonably well on CPUs with a
+ * uniform L3 cache topology. While preemption is not implemented, the fact that
+ * the scheduling queue is shared across all CPUs means that whatever is at the
+ * front of the queue is likely to be executed fairly quickly given enough
+ * CPUs. The FIFO scheduling mode may be beneficial to some workloads but comes
+ * with the usual problem of FIFO scheduling, where saturating threads can
+ * easily drown out interactive ones.
+ *
* Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
* Copyright (c) 2022 Tejun Heo <tj@kernel.org>
* Copyright (c) 2022 David Vernet <dvernet@meta.com>
@@ -15,8 +24,20 @@
char _license[] SEC("license") = "GPL";
+const volatile bool fifo_sched;
+
+static u64 vtime_now;
UEI_DEFINE(uei);
+/*
+ * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues
+ * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We
+ * therefore create a separate DSQ with ID 0 that we dispatch to and consume
+ * from. If scx_simple only supported global FIFO scheduling, then we could
+ * just use SCX_DSQ_GLOBAL.
+ */
+#define SHARED_DSQ 0
+
struct {
__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
__uint(key_size, sizeof(u32));
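The stats map is a BPF_MAP_TYPE_PERCPU_ARRAY, so each CPU increments its own slot without atomics and userspace has to sum across CPUs when reporting. A minimal sketch of that aggregation, assuming a map fd obtained from the skeleton and the counter indices used by stat_inc() in the next hunk; error reporting is trimmed for brevity:

#include <bpf/bpf.h>
#include <bpf/libbpf.h>

/* Sum one counter across CPUs: a lookup on a percpu array fills in one
 * value per possible CPU. */
static __u64 read_stat(int stats_fd, __u32 idx)
{
	int nr_cpus = libbpf_num_possible_cpus();
	__u64 sum = 0;

	if (nr_cpus <= 0)
		return 0;

	__u64 vals[nr_cpus];

	if (bpf_map_lookup_elem(stats_fd, &idx, vals))
		return 0;

	for (int cpu = 0; cpu < nr_cpus; cpu++)
		sum += vals[cpu];
	return sum;
}

/* e.g. read_stat(stats_fd, 0) for local and read_stat(stats_fd, 1) for
 * global queueing counts, matching the stat_inc() call sites in this
 * patch. */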
@@ -31,6 +52,11 @@ static void stat_inc(u32 idx)
(*cnt_p)++;
}
+static inline bool vtime_before(u64 a, u64 b)
+{
+ return (s64)(a - b) < 0;
+}
+
s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
{
bool is_idle = false;
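vtime_before() is the standard wraparound-safe ordering test (the same idiom as the kernel's time_before()): subtract in u64 and reinterpret the difference as s64, which keeps the ordering correct across a counter wrap as long as the two values are within 2^63 of each other. A standalone illustration, not part of the patch:

#include <assert.h>
#include <stdint.h>

static inline int vtime_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t a = UINT64_MAX - 5;	/* vtime shortly before the wrap */
	uint64_t b = 10;		/* vtime shortly after the wrap */

	/* a - b wraps to 2^64 - 16, which is -16 as s64, so a sorts first */
	assert(vtime_before(a, b));
	assert(!vtime_before(b, a));

	/* a naive unsigned comparison would get the order backwards */
	assert(a > b);
	return 0;
}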
@@ -48,7 +74,69 @@ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 w
void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags)
{
stat_inc(1); /* count global queueing */
- scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
+
+ if (fifo_sched) {
+ scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
+ } else {
+ u64 vtime = p->scx.dsq_vtime;
+
+ /*
+ * Limit the amount of budget that an idling task can accumulate
+ * to one slice.
+ */
+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
+ vtime = vtime_now - SCX_SLICE_DFL;
+
+ scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime,
+ enq_flags);
+ }
+}
+
+void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev)
+{
+ scx_bpf_consume(SHARED_DSQ);
+}
+
+void BPF_STRUCT_OPS(simple_running, struct task_struct *p)
+{
+ if (fifo_sched)
+ return;
+
+ /*
+ * Global vtime always progresses forward as tasks start executing. The
+ * test and update can be performed concurrently from multiple CPUs and
+ * are thus racy. Any error should be contained and temporary. Let's
+ * just live with it.
+ */
+ if (vtime_before(vtime_now, p->scx.dsq_vtime))
+ vtime_now = p->scx.dsq_vtime;
+}
+
+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable)
+{
+ if (fifo_sched)
+ return;
+
+ /*
+ * Scale the execution time by the inverse of the weight and charge.
+ *
+ * Note that the default yield implementation yields by setting
+ * @p->scx.slice to zero and the following would treat the yielding task
+ * as if it had consumed all its slice. If this penalizes yielding tasks
+ * too much, determine the execution time by taking explicit timestamps
+ * instead of depending on @p->scx.slice.
+ */
+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight;
+}
+
+void BPF_STRUCT_OPS(simple_enable, struct task_struct *p)
+{
+ p->scx.dsq_vtime = vtime_now;
+}
+
+s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init)
+{
+ return scx_bpf_create_dsq(SHARED_DSQ, -1);
}
void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
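Plugging numbers into the charge in simple_stopping() shows what the weight scaling does. Assuming SCX_SLICE_DFL is 20ms, its value in current kernels, and the default weight of 100 for a nice-0 task: running a full slice costs 20ms of vtime, a weight-10000 task is charged 1/100th as much for the same CPU time and therefore sorts earlier on its next enqueue, and a task that yields halfway through is charged half a slice. A standalone check of the arithmetic:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define SCX_SLICE_DFL	(20ULL * 1000 * 1000)	/* 20ms in ns (assumed) */

/* mirrors p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / weight */
static uint64_t charge(uint64_t slice_left, uint64_t weight)
{
	return (SCX_SLICE_DFL - slice_left) * 100 / weight;
}

int main(void)
{
	printf("weight 100, full slice:   %" PRIu64 " ns\n", charge(0, 100));
	printf("weight 10000, full slice: %" PRIu64 " ns\n", charge(0, 10000));
	printf("weight 100, half slice:   %" PRIu64 " ns\n",
	       charge(SCX_SLICE_DFL / 2, 100));
	return 0;
}

This prints 20000000, 200000, and 10000000 respectively.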
@@ -59,5 +147,10 @@ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei)
SCX_OPS_DEFINE(simple_ops,
.select_cpu = (void *)simple_select_cpu,
.enqueue = (void *)simple_enqueue,
+ .dispatch = (void *)simple_dispatch,
+ .running = (void *)simple_running,
+ .stopping = (void *)simple_stopping,
+ .enable = (void *)simple_enable,
+ .init = (void *)simple_init,
.exit = (void *)simple_exit,
.name = "simple");
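Since fifo_sched is declared const volatile, it lives in the object's read-only data and is frozen once the program is loaded, so a loader must set it between opening and loading the skeleton. A hedged sketch of a minimal loader: the skeleton header and type names follow standard bpftool skeleton conventions and are assumptions here, and the real scx_simple.c loader in tools/sched_ext additionally handles stats printing and signal-driven shutdown:

#include <unistd.h>
#include <bpf/libbpf.h>
#include "scx_simple.bpf.skel.h"	/* assumed skeleton header name */

int main(void)
{
	struct scx_simple *skel;	/* assumed skeleton type name */
	struct bpf_link *link;

	skel = scx_simple__open();
	if (!skel)
		return 1;

	/* must happen before load; afterwards .rodata is sealed */
	skel->rodata->fifo_sched = true;	/* false selects weighted vtime mode */

	if (scx_simple__load(skel))
		goto out;

	/* attaching the struct_ops map is what switches the system over */
	link = bpf_map__attach_struct_ops(skel->maps.simple_ops);
	if (!link)
		goto out;

	pause();	/* the scheduler stays active until the link goes away */
	bpf_link__destroy(link);
out:
	scx_simple__destroy(skel);
	return 0;
}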