-rw-r--r--  MAINTAINERS                          13
-rw-r--r--  include/asm-generic/vmlinux.lds.h     1
-rw-r--r--  include/linux/sched.h                 5
-rw-r--r--  include/linux/sched/ext.h           141
-rw-r--r--  include/uapi/linux/sched.h            1
-rw-r--r--  init/init_task.c                     11
-rw-r--r--  kernel/Kconfig.preempt               25
-rw-r--r--  kernel/sched/build_policy.c           9
-rw-r--r--  kernel/sched/core.c                  66
-rw-r--r--  kernel/sched/debug.c                  3
-rw-r--r--  kernel/sched/ext.c                 4256
-rw-r--r--  kernel/sched/ext.h                   73
-rw-r--r--  kernel/sched/sched.h                 17
-rw-r--r--  kernel/sched/syscalls.c               2
14 files changed, 4616 insertions(+), 7 deletions(-)
diff --git a/MAINTAINERS b/MAINTAINERS
index cd3277a98cfe..9e3ee22f015e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -19936,6 +19936,19 @@ F: include/linux/wait.h
F: include/uapi/linux/sched.h
F: kernel/sched/
+SCHEDULER - SCHED_EXT
+R: Tejun Heo <tj@kernel.org>
+R: David Vernet <void@manifault.com>
+L: linux-kernel@vger.kernel.org
+S: Maintained
+W: https://github.com/sched-ext/scx
+T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git
+F: include/linux/sched/ext.h
+F: kernel/sched/ext.h
+F: kernel/sched/ext.c
+F: tools/sched_ext/
+F: tools/testing/selftests/sched_ext
+
SCSI LIBSAS SUBSYSTEM
R: John Garry <john.g.garry@oracle.com>
R: Jason Yan <yanaijie@huawei.com>
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 5703526d6ebf..2e712183ba09 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -133,6 +133,7 @@
*(__dl_sched_class) \
*(__rt_sched_class) \
*(__fair_sched_class) \
+ *(__ext_sched_class) \
*(__idle_sched_class) \
__sched_class_lowest = .;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 90691d99027e..06beb8a6e0ca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -80,6 +80,8 @@ struct task_group;
struct task_struct;
struct user_event_mm;
+#include <linux/sched/ext.h>
+
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
@@ -802,6 +804,9 @@ struct task_struct {
struct sched_rt_entity rt;
struct sched_dl_entity dl;
struct sched_dl_entity *dl_server;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ struct sched_ext_entity scx;
+#endif
const struct sched_class *sched_class;
#ifdef CONFIG_SCHED_CORE
diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h
index a05dfcf533b0..c1530a7992cc 100644
--- a/include/linux/sched/ext.h
+++ b/include/linux/sched/ext.h
@@ -1,9 +1,148 @@
/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
#ifndef _LINUX_SCHED_EXT_H
#define _LINUX_SCHED_EXT_H
#ifdef CONFIG_SCHED_CLASS_EXT
-#error "NOT IMPLEMENTED YET"
+
+#include <linux/llist.h>
+#include <linux/rhashtable-types.h>
+
+enum scx_public_consts {
+ SCX_OPS_NAME_LEN = 128,
+
+ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */
+};
+
+/*
+ * DSQ (dispatch queue) IDs are 64bit of the format:
+ *
+ * Bits: [63] [62 .. 0]
+ * [ B] [ ID ]
+ *
+ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs
+ * ID: 63 bit ID
+ *
+ * Built-in IDs:
+ *
+ * Bits: [63] [62] [61..32] [31 .. 0]
+ * [ 1] [ L] [ R ] [ V ]
+ *
+ * 1: 1 for built-in DSQs.
+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others
+ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value.
+ */
+enum scx_dsq_id_flags {
+ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63,
+ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62,
+
+ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0,
+ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1,
+ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2,
+ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON,
+ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU,
+};
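As an illustrative sketch (the helper names below are hypothetical, not taken from the patch), a LOCAL_ON DSQ ID targeting a specific CPU would be composed and decoded from the constants above roughly as follows:

	/* hypothetical helpers illustrating the DSQ ID layout described above */
	static inline u64 example_local_on_dsq_id(s32 cpu)
	{
		/* bit 63: built-in, bit 62: LOCAL_ON, low 32 bits: CPU number */
		return SCX_DSQ_LOCAL_ON | (u32)cpu;
	}

	static inline s32 example_dsq_cpu(u64 dsq_id)
	{
		return dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
	}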
+
+/*
+ * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the
+ * scheduler core and the BPF scheduler. See the documentation for more details.
+ */
+struct scx_dispatch_q {
+ raw_spinlock_t lock;
+ struct list_head list; /* tasks in dispatch order */
+ u32 nr;
+ u64 id;
+ struct rhash_head hash_node;
+ struct llist_node free_node;
+ struct rcu_head rcu;
+};
+
+/* scx_entity.flags */
+enum scx_ent_flags {
+ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */
+ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */
+ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */
+ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */
+
+ SCX_TASK_STATE_SHIFT = 8, /* bits 8 and 9 carry scx_task_state */
+ SCX_TASK_STATE_BITS = 2,
+ SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT,
+
+ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */
+};
+
+/* scx_entity.flags & SCX_TASK_STATE_MASK */
+enum scx_task_state {
+ SCX_TASK_NONE, /* ops.init_task() not called yet */
+ SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */
+ SCX_TASK_READY, /* fully initialized, but not in sched_ext */
+ SCX_TASK_ENABLED, /* fully initialized and in sched_ext */
+
+ SCX_TASK_NR_STATES,
+};
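A minimal sketch of reading the packed state back out of scx.flags (the helper name is illustrative and not defined by the patch):

	static enum scx_task_state example_scx_task_state(const struct task_struct *p)
	{
		return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
	}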
+
+/*
+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from
+ * everywhere and the following bits track which kfunc sets are currently
+ * allowed for %current. This simple per-task tracking works because SCX ops
+ * nest in a limited way. BPF will likely implement a way to allow and disallow
+ * kfuncs depending on the calling context which will replace this manual
+ * mechanism. See scx_kf_allow().
+ */
+enum scx_kf_mask {
+ SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */
+ /* all non-sleepables may be nested inside SLEEPABLE */
+ SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */
+ /* ops.dequeue (in REST) may be nested inside DISPATCH */
+ SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */
+ SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */
+ SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */
+ SCX_KF_REST = 1 << 5, /* other rq-locked operations */
+
+ __SCX_KF_RQ_LOCKED = SCX_KF_DISPATCH |
+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST,
+};
+
+/*
+ * The following is embedded in task_struct and contains all fields necessary
+ * for a task to be scheduled by SCX.
+ */
+struct sched_ext_entity {
+ struct scx_dispatch_q *dsq;
+ struct list_head dsq_node;
+ u32 flags; /* protected by rq lock */
+ u32 weight;
+ s32 sticky_cpu;
+ s32 holding_cpu;
+ u32 kf_mask; /* see scx_kf_mask above */
+ atomic_long_t ops_state;
+
+ struct list_head runnable_node; /* rq->scx.runnable_list */
+
+ u64 ddsp_dsq_id;
+ u64 ddsp_enq_flags;
+
+ /* BPF scheduler modifiable fields */
+
+ /*
+ * Runtime budget in nsecs. This is usually set through
+ * scx_bpf_dispatch() but can also be modified directly by the BPF
+ * scheduler. Automatically decreased by SCX as the task executes. On
+ * depletion, a scheduling event is triggered.
+ */
+ u64 slice;
+
+ /* cold fields */
+ /* must be the last field, see init_scx_entity() */
+ struct list_head tasks_node;
+};
+
+void sched_ext_free(struct task_struct *p);
+
#else /* !CONFIG_SCHED_CLASS_EXT */
static inline void sched_ext_free(struct task_struct *p) {}
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 3bac0a8ceab2..359a14cc76a4 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -118,6 +118,7 @@ struct clone_args {
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
+#define SCHED_EXT 7
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
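For context, a minimal userspace sketch of switching the calling process to the new policy via sched_setscheduler(); SCHED_EXT is defined locally in case the libc headers predate this patch, and whether the kernel accepts the call before a BPF scheduler is loaded is left to the kernel's own checks.

	#include <sched.h>
	#include <stdio.h>

	#ifndef SCHED_EXT
	#define SCHED_EXT 7	/* mirrors the uapi addition above */
	#endif

	int main(void)
	{
		struct sched_param param = { .sched_priority = 0 };

		/* pid 0 selects the calling process */
		if (sched_setscheduler(0, SCHED_EXT, &param) == -1) {
			perror("sched_setscheduler(SCHED_EXT)");
			return 1;
		}
		printf("policy is now %d\n", sched_getscheduler(0));
		return 0;
	}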
diff --git a/init/init_task.c b/init/init_task.c
index eeb110c65fe2..c6804396fe12 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -6,6 +6,7 @@
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/task.h>
+#include <linux/sched/ext.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/mm.h>
@@ -99,6 +100,16 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
#ifdef CONFIG_CGROUP_SCHED
.sched_task_group = &root_task_group,
#endif
+#ifdef CONFIG_SCHED_CLASS_EXT
+ .scx = {
+ .dsq_node = LIST_HEAD_INIT(init_task.scx.dsq_node),
+ .sticky_cpu = -1,
+ .holding_cpu = -1,
+ .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node),
+ .ddsp_dsq_id = SCX_DSQ_INVALID,
+ .slice = SCX_SLICE_DFL,
+ },
+#endif
.ptraced = LIST_HEAD_INIT(init_task.ptraced),
.ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry),
.real_parent = &init_task,
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a821..39ecfc2b5a1c 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -133,4 +133,27 @@ config SCHED_CORE
which is the likely usage by Linux distributions, there should
be no measurable impact on performance.
-
+config SCHED_CLASS_EXT
+ bool "Extensible Scheduling Class"
+ depends on BPF_SYSCALL && BPF_JIT && !SCHED_CORE
+ help
+ This option enables a new scheduler class sched_ext (SCX), which
+ allows scheduling policies to be implemented as BPF programs to
+ achieve the following:
+
+ - Ease of experimentation and exploration: Enabling rapid
+ iteration of new scheduling policies.
+ - Customization: Building application-specific schedulers which
+ implement policies that are not applicable to general-purpose
+ schedulers.
+ - Rapid scheduler deployments: Non-disruptive swap outs of
+ scheduling policies in production environments.
+
+ sched_ext leverages the BPF struct_ops feature to define a structure
+ which exports function callbacks and flags to BPF programs that
+ wish to implement scheduling policies. The struct_ops structure
+ exported by sched_ext is struct sched_ext_ops, and is conceptually
+ similar to struct sched_class.
+
+ For more information:
+ https://github.com/sched-ext/scx
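To make the struct_ops wiring concrete, here is a hedged, minimal BPF-side sketch that routes every task through the global DSQ. The include and the BPF_STRUCT_OPS macro follow the conventions of the tools/sched_ext/ directory referenced in MAINTAINERS and are assumptions rather than interfaces defined by this patch.

	/* minimal_sched.bpf.c -- illustrative only */
	#include <scx/common.bpf.h>	/* assumed helper header from tools/sched_ext/ */

	char _license[] SEC("license") = "GPL";

	void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
	{
		/* every runnable task goes to the shared global DSQ with the default slice */
		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
	}

	SEC(".struct_ops.link")
	struct sched_ext_ops minimal_ops = {
		.enqueue	= (void *)minimal_enqueue,
		.name		= "minimal",
	};

A userspace loader would typically load this with libbpf and attach the ops map with bpf_map__attach_struct_ops(), at which point the kernel starts invoking the callbacks.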
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 39c315182b35..f0c148fcd2df 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -21,13 +21,18 @@
#include <linux/cpuidle.h>
#include <linux/jiffies.h>
+#include <linux/kobject.h>
#include <linux/livepatch.h>
+#include <linux/pm.h>
#include <linux/psi.h>
+#include <linux/rhashtable.h>
+#include <linux/seq_buf.h>
#include <linux/seqlock_api.h>
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/tsacct_kern.h>
#include <linux/vtime.h>
+#include <linux/percpu-rwsem.h>
#include <uapi/linux/sched/types.h>
@@ -52,4 +57,8 @@
#include "cputime.c"
#include "deadline.c"
+#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext.c"
+#endif
+
#include "syscalls.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8c963fea9eb..6042ce3bfee0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3791,6 +3791,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
+ * The BPF scheduler may depend on select_task_rq() being invoked during
+ * wakeups. In addition, @p may end up executing on a different CPU
+ * regardless of what happens in the wakeup path, making the ttwu_queue
+ * optimization less meaningful. Skip if on SCX.
+ */
+ if (task_on_scx(p))
+ return false;
+
+ /*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
*/
@@ -4357,6 +4366,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->rt.on_rq = 0;
p->rt.on_list = 0;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ init_scx_entity(&p->scx);
+#endif
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
@@ -4604,6 +4617,10 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
goto out_cancel;
} else if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ } else if (task_should_scx(p)) {
+ p->sched_class = &ext_sched_class;
+#endif
} else {
p->sched_class = &fair_sched_class;
}
@@ -5511,8 +5528,10 @@ void sched_tick(void)
wq_worker_tick(curr);
#ifdef CONFIG_SMP
- rq->idle_balance = idle_cpu(cpu);
- sched_balance_trigger(rq);
+ if (!scx_switched_all()) {
+ rq->idle_balance = idle_cpu(cpu);
+ sched_balance_trigger(rq);
+ }
#endif
}
@@ -6902,6 +6921,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ else if (task_should_scx(p))
+ p->sched_class = &ext_sched_class;
+#endif
else
p->sched_class = &fair_sched_class;
@@ -8203,6 +8226,10 @@ void __init sched_init(void)
BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
+#ifdef CONFIG_SCHED_CLASS_EXT
+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
+#endif
wait_bit_init();
@@ -10337,3 +10364,38 @@ void sched_mm_cid_fork(struct task_struct *t)
t->mm_cid_active = 1;
}
#endif
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
+ struct sched_enq_and_set_ctx *ctx)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
+
+ *ctx = (struct sched_enq_and_set_ctx){
+ .p = p,
+ .queue_flags = queue_flags,
+ .queued = task_on_rq_queued(p),
+ .running = task_current(rq, p),
+ };
+
+ update_rq_clock(rq);
+ if (ctx->queued)
+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ if (ctx->running)
+ put_prev_task(rq, p);
+}
+
+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+{
+ struct rq *rq = task_rq(ctx->p);
+
+ lockdep_assert_rq_held(rq);
+
+ if (ctx->queued)
+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ if (ctx->running)
+ set_next_task(rq, ctx->p);
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
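As a usage sketch for the pair above (the queue flags and the property change in the middle are placeholders, not taken from the patch), a caller holding the rq lock would bracket a modification like this:

	struct sched_enq_and_set_ctx ctx;

	sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
	/* ... update @p's scheduling properties while it is off the runqueue ... */
	sched_enq_and_set_task(&ctx);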
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index c1eb9a1afd13..c057ef46c5f8 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1090,6 +1090,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(dl.runtime);
P(dl.deadline);
}
+#ifdef CONFIG_SCHED_CLASS_EXT
+ __PS("ext.enabled", task_on_scx(p));
+#endif
#undef PN_SCHEDSTAT
#undef P_SCHEDSTAT
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
new file mode 100644
index 000000000000..49b115f5b052
--- /dev/null
+++ b/kernel/sched/ext.c
@@ -0,0 +1,4256 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
+ * Copyright (c) 2022 David Vernet <dvernet@meta.com>
+ */
+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
+
+enum scx_consts {
+ SCX_DSP_DFL_MAX_BATCH = 32,
+
+ SCX_EXIT_BT_LEN = 64,
+ SCX_EXIT_MSG_LEN = 1024,
+};
+
+enum scx_exit_kind {
+ SCX_EXIT_NONE,
+ SCX_EXIT_DONE,
+
+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */
+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */
+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */
+
+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */
+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */
+};
+
+/*
+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
+ * being disabled.
+ */
+struct scx_exit_info {
+ /* %SCX_EXIT_* - broad category of the exit reason */
+ enum scx_exit_kind kind;
+
+ /* exit code if gracefully exiting */
+ s64 exit_code;
+
+ /* textual representation of the above */
+ const char *reason;
+
+ /* backtrace if exiting due to an error */
+ unsigned long *bt;
+ u32 bt_len;
+
+ /* informational message */
+ char *msg;
+};
+
+/* sched_ext_ops.flags */
+enum scx_ops_flags {
+ /*
+ * Keep built-in idle tracking even if ops.update_idle() is implemented.
+ */
+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
+
+ /*
+ * By default, if there are no other tasks to run on the CPU, the ext core
+ * keeps running the current task even after its slice expires. If this
+ * flag is specified, such tasks are passed to ops.enqueue() with
+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
+ */
+ SCX_OPS_ENQ_LAST = 1LLU << 1,
+
+ /*
+ * An exiting task may schedule after PF_EXITING is set. In such cases,
+ * bpf_task_from_pid() may not be able to find the task and if the BPF
+ * scheduler depends on pid lookup for dispatching, the task will be
+ * lost leading to various issues including RCU grace period stalls.
+ *
+ * To mask this problem, by default, unhashed tasks are automatically
+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
+ * depend on pid lookups and wants to handle these tasks directly, the
+ * following flag can be used.
+ */
+ SCX_OPS_ENQ_EXITING = 1LLU << 2,
+
+ /*
+ * If set, only tasks with policy set to SCHED_EXT are attached to
+ * sched_ext. If clear, SCHED_NORMAL tasks are also included.
+ */
+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
+
+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
+ SCX_OPS_ENQ_LAST |
+ SCX_OPS_ENQ_EXITING |
+ SCX_OPS_SWITCH_PARTIAL,
+};
+
+/* argument container for ops.init_task() */
+struct scx_init_task_args {
+ /*
+ * Set if ops.init_task() is being invoked on the fork path, as opposed
+ * to the scheduler transition path.
+ */
+ bool fork;
+};
+
+/* argument container for ops.exit_task() */
+struct scx_exit_task_args {
+ /* Whether the task exited before running on sched_ext. */
+ bool cancelled;
+};
+
+/**
+ * struct sched_ext_ops - Operation table for BPF scheduler implementation
+ *
+ * Userland can implement an arbitrary scheduling policy by implementing and
+ * loading operations in this table.
+ */
+struct sched_ext_ops {
+ /**
+ * select_cpu - Pick the target CPU for a task which is being woken up
+ * @p: task being woken up
+ * @prev_cpu: the cpu @p was on before sleeping
+ * @wake_flags: SCX_WAKE_*
+ *
+ * The decision made here isn't final. @p may be moved to any CPU while it
+ * is getting dispatched for execution later. However, as @p is not on
+ * the rq at this point, getting the eventual execution CPU right here
+ * saves a small bit of overhead down the line.
+ *
+ * If an idle CPU is returned, the CPU is kicked and will try to
+ * dispatch. While an explicit custom mechanism can be added,
+ * select_cpu() serves as the default way to wake up idle CPUs.
+ *
+ * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p
+ * is dispatched, the ops.enqueue() callback will be skipped. Finally,
+ * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the
+ * local DSQ of whatever CPU is returned by this callback.
+ */
+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
+
+ /**
+ * enqueue - Enqueue a task on the BPF scheduler
+ * @p: task being enqueued
+ * @enq_flags: %SCX_ENQ_*
+ *
+ * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch()
+ * or enqueue on the BPF scheduler. If not directly dispatched, the BPF
+ * scheduler owns @p and if it fails to dispatch @p, the task will
+ * stall.
+ *
+ * If @p was dispatched from ops.select_cpu(), this callback is
+ * skipped.
+ */
+ void (*enqueue)(struct task_struct *p, u64 enq_flags);
+
+ /**
+ * dequeue - Remove a task from the BPF scheduler
+ * @p: task being dequeued
+ * @deq_flags: %SCX_DEQ_*
+ *
+ * Remove @p from the BPF scheduler. This is usually called to isolate
+ * the task while updating its scheduling properties (e.g. priority).
+ *
+ * The ext core keeps track of whether the BPF side owns a given task or
+ * not and can gracefully ignore spurious dispatches from BPF side,
+ * which makes it safe to not implement this method. However, depending
+ * on the scheduling logic, this can lead to confusing behaviors - e.g.
+ * scheduling position not being updated across a priority change.
+ */
+ void (*dequeue)(struct task_struct *p, u64 deq_flags);
+
+ /**
+ * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs
+ * @cpu: CPU to dispatch tasks for
+ * @prev: previous task being switched out
+ *
+ * Called when a CPU's local dsq is empty. The operation should dispatch
+ * one or more tasks from the BPF scheduler into the DSQs using
+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
+ * scx_bpf_consume().
+ *
+ * The maximum number of times scx_bpf_dispatch() can be called without
+ * an intervening scx_bpf_consume() is specified by
+ * ops.dispatch_max_batch. See the comments on top of the two functions
+ * for more details.
+ *
+ * When not %NULL, @prev is an SCX task with its slice depleted. If
+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in
+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after
+ * ops.dispatch() returns. To keep executing @prev, return without
+ * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST.
+ */
+ void (*dispatch)(s32 cpu, struct task_struct *prev);
+
+ /**
+ * tick - Periodic tick
+ * @p: task running currently
+ *
+ * This operation is called every 1/HZ seconds on CPUs which are
+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
+ * immediate dispatch cycle on the CPU.
+ */
+ void (*tick)(struct task_struct *p);
+
+ /**
+ * yield - Yield CPU
+ * @from: yielding task
+ * @to: optional yield target task
+ *
+ * If @to is NULL, @from is yielding the CPU to other runnable tasks.
+ * The BPF scheduler should ensure that other available tasks are
+ * dispatched before the yielding task. Return value is ignored in this
+ * case.
+ *
+ * If @to is non-NULL, @from wants to yield the CPU to @to. If the BPF
+ * scheduler can implement the request, return %true; otherwise, %false.
+ */
+ bool (*yield)(struct task_struct *from, struct task_struct *to);
+
+ /**
+ * set_weight - Set task weight
+ * @p: task to set weight for
+ * @weight: new weight [1..10000]
+ *
+ * Update @p's weight to @weight.
+ */
+ void (*set_weight)(struct task_struct *p, u32 weight);
+
+ /**
+ * set_cpumask - Set CPU affinity
+ * @p: task to set CPU affinity for
+ * @cpumask: cpumask of cpus that @p can run on
+ *
+ * Update @p's CPU affinity to @cpumask.
+ */
+ void (*set_cpumask)(struct task_struct *p,
+ const struct cpumask *cpumask);
+
+ /**
+ * update_idle - Update the idle state of a CPU
+ * @cpu: CPU to update the idle state for
+ * @idle: whether entering or exiting the idle state
+ *
+ * This operation is called when @cpu enters or leaves the idle
+ * state. By default, implementing this operation disables the built-in
+ * idle CPU tracking and the following helpers become unavailable:
+ *
+ * - scx_bpf_select_cpu_dfl()
+ * - scx_bpf_test_and_clear_cpu_idle()
+ * - scx_bpf_pick_idle_cpu()
+ *
+ * The user must also implement ops.select_cpu() as the default
+ * implementation relies on scx_bpf_select_cpu_dfl().
+ *
+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
+ * tracking.
+ */
+ void (*update_idle)(s32 cpu, bool idle);
+
+ /**
+ * init_task - Initialize a task to run in a BPF scheduler
+ * @p: task to initialize for BPF scheduling
+ * @args: init arguments, see the struct definition
+ *
+ * Either we're loading a BPF scheduler or a new task is being forked.
+ * Initialize @p for BPF scheduling. This operation may block and can
+ * be used for allocations, and is called exactly once for a task.
+ *
+ * Return 0 for success, -errno for failure. An error return while
+ * loading will abort loading of the BPF scheduler. During a fork, it
+ * will abort that specific fork.
+ */
+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
+
+ /**
+ * exit_task - Exit a previously-running task from the system
+ * @p: task to exit
+ * @args: exit arguments, see the struct definition
+ *
+ * @p is exiting or the BPF scheduler is being unloaded. Perform any
+ * necessary cleanup for @p.
+ */
+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
+
+ /**
+ * enable - Enable BPF scheduling for a task
+ * @p: task to enable BPF scheduling for
+ *
+ * Enable @p for BPF scheduling. enable() is called on @p any time it
+ * enters SCX, and is always paired with a matching disable().
+ */
+ void (*enable)(struct task_struct *p);
+
+ /**
+ * disable - Disable BPF scheduling for a task
+ * @p: task to disable BPF scheduling for
+ *
+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
+ * Disable BPF scheduling for @p. A disable() call is always matched
+ * with a prior enable() call.
+ */
+ void (*disable)(struct task_struct *p);
+
+ /*
+ * All online ops must come before ops.init().
+ */
+
+ /**
+ * init - Initialize the BPF scheduler
+ */
+ s32 (*init)(void);
+
+ /**
+ * exit - Clean up after the BPF scheduler
+ * @info: Exit info
+ */
+ void (*exit)(struct scx_exit_info *info);
+
+ /**
+ * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
+ */
+ u32 dispatch_max_batch;
+
+ /**
+ * flags - %SCX_OPS_* flags
+ */
+ u64 flags;
+
+ /**
+ * name - BPF scheduler's name
+ *
+ * Must be a non-zero valid BPF object name including only isalnum(),
+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
+ * BPF scheduler is enabled.
+ */
+ char name[SCX_OPS_NAME_LEN];
+};
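Tying the callback documentation above together, a hedged sketch of a select_cpu/dispatch pair (same assumed tooling as the earlier minimal sketch; the names are illustrative):

	s32 BPF_STRUCT_OPS(sketch_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags)
	{
		bool is_idle = false;
		s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);

		/* direct dispatch to the local DSQ skips ops.enqueue() entirely */
		if (is_idle)
			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
		return cpu;
	}

	void BPF_STRUCT_OPS(sketch_dispatch, s32 cpu, struct task_struct *prev)
	{
		/* refill this CPU's local DSQ from the global DSQ */
		scx_bpf_consume(SCX_DSQ_GLOBAL);
	}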
+
+enum scx_opi {
+ SCX_OPI_BEGIN = 0,
+ SCX_OPI_NORMAL_BEGIN = 0,
+ SCX_OPI_NORMAL_END = SCX_OP_IDX(init),
+ SCX_OPI_END = SCX_OP_IDX(init),
+};
+
+enum scx_wake_flags {
+ /* expose select WF_* flags as enums */
+ SCX_WAKE_FORK = WF_FORK,
+ SCX_WAKE_TTWU = WF_TTWU,
+ SCX_WAKE_SYNC = WF_SYNC,
+};
+
+enum scx_enq_flags {
+ /* expose select ENQUEUE_* flags as enums */
+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP,
+ SCX_ENQ_HEAD = ENQUEUE_HEAD,
+
+ /* high 32bits are SCX specific */
+
+ /*
+ * The task being enqueued is the only task available for the CPU. By
+ * default, the ext core keeps executing such tasks but when
+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
+ * %SCX_ENQ_LAST flag set.
+ *
+ * If the BPF scheduler wants to continue executing the task,
+ * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately.
+ * If the task gets queued on a different dsq or the BPF side, the BPF
+ * scheduler is responsible for triggering a follow-up scheduling event.
+ * Otherwise, execution may stall.
+ */
+ SCX_ENQ_LAST = 1LLU << 41,
+
+ /* high 8 bits are internal */
+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56,
+
+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56,
+};
+
+enum scx_deq_flags {
+ /* expose select DEQUEUE_* flags as enums */
+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP,
+};
+
+enum scx_pick_idle_cpu_flags {
+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */
+};
+
+enum scx_ops_enable_state {
+ SCX_OPS_PREPPING,
+ SCX_OPS_ENABLING,
+ SCX_OPS_ENABLED,
+ SCX_OPS_DISABLING,
+ SCX_OPS_DISABLED,
+};
+
+static const char *scx_ops_enable_state_str[] = {
+ [SCX_OPS_PREPPING] = "prepping",
+ [SCX_OPS_ENABLING] = "enabling",
+ [SCX_OPS_ENABLED] = "enabled",
+ [SCX_OPS_DISABLING] = "disabling",
+ [SCX_OPS_DISABLED] = "disabled",
+};
+
+/*
+ * sched_ext_entity->ops_state
+ *
+ * Used to track the task ownership between the SCX core and the BPF scheduler.
+ * State transitions look as follows:
+ *
+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING
+ * ^ | |
+ * | v v
+ * \-------------------------------/
+ *
+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
+ * sites for explanations on the conditions being waited upon and why they are
+ * safe. Transitions out of them into NONE or QUEUED must store_release and the
+ * waiters should load_acquire.
+ *
+ * Tracking scx_ops_state enables the sched_ext core to reliably determine whether
+ * any given task can be dispatched by the BPF scheduler at all times and thus
+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
+ * to try to dispatch any task anytime regardless of its state as the SCX core
+ * can safely reject invalid dispatches.
+ */
+enum scx_ops_state {
+ SCX_OPSS_NONE, /* owned by the SCX core */
+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */
+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */
+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */
+
+ /*
+ * QSEQ brands each QUEUED instance so that, when dispatch races
+ * dequeue/requeue, the dispatcher can tell whether it still has a claim
+ * on the task being dispatched.
+ *
+ * As some 32bit archs can't do 64bit store_release/load_acquire,
+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
+ * 32bit machines. The dispatch race window QSEQ protects is very narrow
+ * and runs with IRQ disabled. 30 bits should be sufficient.
+ */
+ SCX_OPSS_QSEQ_SHIFT = 2,
+};
+
+/* Use macros to ensure that the type is unsigned long for the masks */
+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK)
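With the masks above, a sketch of how a reader would split p->scx.ops_state into its state and QSEQ components (the helper name is illustrative, not lifted from later in the file):

	static bool example_task_queued_on_bpf(struct task_struct *p)
	{
		unsigned long opss = atomic_long_read(&p->scx.ops_state);

		/* low SCX_OPSS_QSEQ_SHIFT bits: state; remaining bits: QSEQ brand */
		return (opss & SCX_OPSS_STATE_MASK) == SCX_OPSS_QUEUED;
	}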
+
+/*
+ * During exit, a task may schedule after losing its PIDs. When disabling the
+ * BPF scheduler, we need to be able to iterate tasks in every state to
+ * guarantee system safety. Maintain a dedicated task list which contains every
+ * task between its fork and eventual free.
+ */
+static DEFINE_SPINLOCK(scx_tasks_lock);
+static LIST_HEAD(scx_tasks);
+
+/* ops enable/disable */
+static struct kthread_worker *scx_ops_helper;
+static DEFINE_MUTEX(scx_ops_enable_mutex);
+DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
+static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
+static bool scx_switching_all;
+DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
+
+static struct sched_ext_ops scx_ops;
+static bool scx_warned_zero_slice;
+
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
+
+struct static_key_false scx_has_op[SCX_OPI_END] =
+ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
+
+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
+static struct scx_exit_info *scx_exit_info;
+
+/* idle tracking */
+#ifdef CONFIG_SMP
+#ifdef CONFIG_CPUMASK_OFFSTACK
+#define CL_ALIGNED_IF_ONSTACK
+#else
+#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
+#endif
+
+static struct {
+ cpumask_var_t cpu;
+ cpumask_var_t smt;
+} idle_masks CL_ALIGNED_IF_ONSTACK;
+
+#endif /* CONFIG_SMP */
+
+/*
+ * Direct dispatch marker.
+ *
+ * Non-NULL values are used for direct dispatch from the enqueue path. A valid
+ * pointer points to the task currently being enqueued. An ERR_PTR value is used
+ * to indicate that direct dispatch has already happened.
+ */
+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
+
+/* dispatch queues */
+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
+
+static const struct rh