author     Linus Torvalds <torvalds@linux-foundation.org>  2025-12-03 13:25:39 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2025-12-03 13:25:39 -0800
commit     02baaa67d9afc2e56c6e1ac6a1fb1f1dd2be366f (patch)
tree       13ae2fec8be92b2f774cfb3fd725c027740be3ac /kernel
parent     8449d3252c2603a51ffc7c36cb5bd94874378b7d (diff)
parent     1dd6c84f1c544e552848a8968599220bd464e338 (diff)
Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:

 - Improve recovery from misbehaving BPF schedulers. When a scheduler
   puts many tasks with varying affinity restrictions on a shared DSQ,
   CPUs scanning through tasks they cannot run can overwhelm the
   system, causing lockups. Bypass mode now uses per-CPU DSQs with a
   load balancer to avoid this, and hooks into the hardlockup detector
   to attempt recovery. Add scx_cpu0 example scheduler to demonstrate
   this scenario.

 - Add lockless peek operation for DSQs to reduce lock contention for
   schedulers that need to query queue state during load balancing.

 - Allow scx_bpf_reenqueue_local() to be called from anywhere in
   preparation for deprecating cpu_acquire/release() callbacks in favor
   of generic BPF hooks.

 - Prepare for hierarchical scheduler support: add
   scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
   make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
   structs for future aux__prog parameter.

 - Implement cgroup_set_idle() callback to notify BPF schedulers when a
   cgroup's idle state changes.

 - Fix migration tasks being incorrectly downgraded from
   stop_sched_class to rt_sched_class across sched_ext enable/disable.
   Applied late as the fix is low risk and the bug subtle but needs
   stable backporting.

 - Various fixes and cleanups including cgroup exit ordering,
   SCX_KICK_WAIT reliability, and backward compatibility improvements.

* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
  sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
  sched_ext: tools: Removing duplicate targets during non-cross compilation
  sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
  sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
  sched_ext: Update comments replacing breather with aborting mechanism
  sched_ext: Implement load balancer for bypass mode
  sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
  sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
  sched_ext: Add scx_cpu0 example scheduler
  sched_ext: Hook up hardlockup detector
  sched_ext: Make handle_lockup() propagate scx_verror() result
  sched_ext: Refactor lockup handlers into handle_lockup()
  sched_ext: Make scx_exit() and scx_vexit() return bool
  sched_ext: Exit dispatch and move operations immediately when aborting
  sched_ext: Simplify breather mechanism with scx_aborting flag
  sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
  sched_ext: Refactor do_enqueue_task() local and global DSQ paths
  sched_ext: Use shorter slice in bypass mode
  sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
  sched_ext: Minor cleanups to scx_task_iter
  ...
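The bypass-mode load balancer mentioned in the first item works off simple per-node targets, visible in bypass_lb_node() further down in the diff: donee CPUs are filled up to the average queue depth, and a CPU only acts as a donor once it exceeds that average by a fixed percentage. A standalone sketch of just that arithmetic (the CPU count, task count and donor percentage below are made-up example values, and div_round_up() stands in for the kernel's DIV_ROUND_UP()):

#include <stdio.h>

static unsigned int div_round_up(unsigned int a, unsigned int b)
{
	return (a + b - 1) / b;
}

int main(void)
{
	/* assumed example: 8 online CPUs in a NUMA node, 100 tasks queued
	 * across their per-CPU bypass DSQs */
	unsigned int nr_tasks = 100, nr_cpus = 8;
	/* hypothetical donor threshold; the real value is
	 * SCX_BYPASS_LB_DONOR_PCT, which is not visible in this diff */
	unsigned int donor_pct = 125;

	/* donee CPUs get filled up to the average queue depth... */
	unsigned int nr_target = div_round_up(nr_tasks, nr_cpus);
	/* ...and only CPUs queueing more than this shed tasks */
	unsigned int nr_donor_target = div_round_up(nr_target * donor_pct, 100);

	printf("nr_target=%u nr_donor_target=%u\n", nr_target, nr_donor_target);
	return 0;
}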
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/fork.c                  1
-rw-r--r--  kernel/sched/core.c            6
-rw-r--r--  kernel/sched/ext.c          1067
-rw-r--r--  kernel/sched/ext_idle.c       43
-rw-r--r--  kernel/sched/ext_internal.h   29
-rw-r--r--  kernel/sched/sched.h           4
-rw-r--r--  kernel/watchdog.c              9
7 files changed, 879 insertions, 280 deletions
diff --git a/kernel/fork.c b/kernel/fork.c
index e5dc33f8b4b6..25d243718048 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(tsk == current);
unwind_task_free(tsk);
- sched_ext_free(tsk);
io_uring_free(tsk);
cgroup_task_free(tsk);
task_numa_free(tsk, true);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index dba4e58d5d1a..b7801cd05d5a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5143,6 +5143,12 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
+ /*
+ * sched_ext_dead() must come before cgroup_task_dead() to
+ * prevent cgroups from being removed while their member tasks are
+ * visible to SCX schedulers.
+ */
+ sched_ext_dead(prev);
cgroup_task_dead(prev);
/* Task is done with its stack. */
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 6827689a0966..05f5a49e9649 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -33,9 +33,10 @@ static DEFINE_MUTEX(scx_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED);
-static unsigned long scx_in_softlockup;
-static atomic_t scx_breather_depth = ATOMIC_INIT(0);
static int scx_bypass_depth;
+static cpumask_var_t scx_bypass_lb_donee_cpumask;
+static cpumask_var_t scx_bypass_lb_resched_cpumask;
+static bool scx_aborting;
static bool scx_init_task_enabled;
static bool scx_switching_all;
DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
@@ -68,18 +69,18 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
static struct delayed_work scx_watchdog_work;
/*
- * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence
+ * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence
* numbers. The arrays are allocated with kvzalloc() as size can exceed percpu
* allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated
* lazily when enabling and freed when disabling to avoid waste when sched_ext
* isn't active.
*/
-struct scx_kick_pseqs {
+struct scx_kick_syncs {
struct rcu_head rcu;
- unsigned long seqs[];
+ unsigned long syncs[];
};
-static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs);
+static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs);
/*
* Direct dispatch marker.
@@ -143,26 +144,70 @@ static struct scx_dump_data scx_dump_data = {
/* /sys/kernel/sched_ext interface */
static struct kset *scx_kset;
+/*
+ * Parameters that can be adjusted through /sys/module/sched_ext/parameters.
+ * There usually is no reason to modify these as normal scheduler operation
+ * shouldn't be affected by them. The knobs are primarily for debugging.
+ */
+static u64 scx_slice_dfl = SCX_SLICE_DFL;
+static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC;
+static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US;
+
+static int set_slice_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC);
+}
+
+static const struct kernel_param_ops slice_us_param_ops = {
+ .set = set_slice_us,
+ .get = param_get_uint,
+};
+
+static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp)
+{
+ return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC);
+}
+
+static const struct kernel_param_ops bypass_lb_intv_us_param_ops = {
+ .set = set_bypass_lb_intv_us,
+ .get = param_get_uint,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "sched_ext."
+
+module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600);
+MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)");
+module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600);
+MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)");
+
+#undef MODULE_PARAM_PREFIX
+
#define CREATE_TRACE_POINTS
#include <trace/events/sched_ext.h>
static void process_ddsp_deferred_locals(struct rq *rq);
+static u32 reenq_local(struct rq *rq);
static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags);
-static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
+static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind,
s64 exit_code, const char *fmt, va_list args);
-static __printf(4, 5) void scx_exit(struct scx_sched *sch,
+static __printf(4, 5) bool scx_exit(struct scx_sched *sch,
enum scx_exit_kind kind, s64 exit_code,
const char *fmt, ...)
{
va_list args;
+ bool ret;
va_start(args, fmt);
- scx_vexit(sch, kind, exit_code, fmt, args);
+ ret = scx_vexit(sch, kind, exit_code, fmt, args);
va_end(args);
+
+ return ret;
}
#define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args)
+#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args)
#define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op)
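The two knobs above are plain module parameters with range clamping; writes land under /sys/module/sched_ext/parameters/. The same param_set_uint_minmax() pattern in a minimal, self-contained module (module and parameter names here are hypothetical, not from this patch):

#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned int demo_interval_us = 1000;

/* reject writes outside the 100us..1s range */
static int demo_set_interval(const char *val, const struct kernel_param *kp)
{
	return param_set_uint_minmax(val, kp, 100, 1000000);
}

static const struct kernel_param_ops demo_interval_ops = {
	.set = demo_set_interval,
	.get = param_get_uint,
};

module_param_cb(interval_us, &demo_interval_ops, &demo_interval_us, 0600);
MODULE_PARM_DESC(interval_us, "demo interval in microseconds (100us to 1s)");

MODULE_LICENSE("GPL");

Out-of-range writes are rejected with -EINVAL by param_set_uint_minmax() rather than silently clamped, which matches how the sched_ext knobs behave.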
@@ -200,7 +245,15 @@ static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch,
static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id)
{
- return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params);
+ return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params);
+}
+
+static const struct sched_class *scx_setscheduler_class(struct task_struct *p)
+{
+ if (p->sched_class == &stop_sched_class)
+ return &stop_sched_class;
+
+ return __setscheduler_class(p->policy, p->prio);
}
/*
@@ -469,19 +522,16 @@ struct scx_task_iter {
* RCU read lock or obtaining a reference count.
*
* All tasks which existed when the iteration started are guaranteed to be
- * visited as long as they still exist.
+ * visited as long as they are not dead.
*/
static void scx_task_iter_start(struct scx_task_iter *iter)
{
- BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS &
- ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1));
+ memset(iter, 0, sizeof(*iter));
raw_spin_lock_irq(&scx_tasks_lock);
iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
list_add(&iter->cursor.tasks_node, &scx_tasks);
- iter->locked_task = NULL;
- iter->cnt = 0;
iter->list_locked = true;
}
@@ -547,14 +597,13 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
struct list_head *cursor = &iter->cursor.tasks_node;
struct sched_ext_entity *pos;
- __scx_task_iter_maybe_relock(iter);
-
if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) {
scx_task_iter_unlock(iter);
cond_resched();
- __scx_task_iter_maybe_relock(iter);
}
+ __scx_task_iter_maybe_relock(iter);
+
list_for_each_entry(pos, cursor, tasks_node) {
if (&pos->tasks_node == &scx_tasks)
return NULL;
@@ -755,6 +804,11 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err
static void run_deferred(struct rq *rq)
{
process_ddsp_deferred_locals(rq);
+
+ if (local_read(&rq->scx.reenq_local_deferred)) {
+ local_set(&rq->scx.reenq_local_deferred, 0);
+ reenq_local(rq);
+ }
}
static void deferred_bal_cb_workfn(struct rq *rq)
@@ -775,12 +829,28 @@ static void deferred_irq_workfn(struct irq_work *irq_work)
* schedule_deferred - Schedule execution of deferred actions on an rq
* @rq: target rq
*
- * Schedule execution of deferred actions on @rq. Must be called with @rq
- * locked. Deferred actions are executed with @rq locked but unpinned, and thus
- * can unlock @rq to e.g. migrate tasks to other rqs.
+ * Schedule execution of deferred actions on @rq. Deferred actions are executed
+ * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks
+ * to other rqs.
*/
static void schedule_deferred(struct rq *rq)
{
+ /*
+ * Queue an irq work. They are executed on IRQ re-enable which may take
+ * a bit longer than the scheduler hook in schedule_deferred_locked().
+ */
+ irq_work_queue(&rq->scx.deferred_irq_work);
+}
+
+/**
+ * schedule_deferred_locked - Schedule execution of deferred actions on an rq
+ * @rq: target rq
+ *
+ * Schedule execution of deferred actions on @rq. Equivalent to
+ * schedule_deferred() but requires @rq to be locked and can be more efficient.
+ */
+static void schedule_deferred_locked(struct rq *rq)
+{
lockdep_assert_rq_held(rq);
/*
@@ -812,12 +882,11 @@ static void schedule_deferred(struct rq *rq)
}
/*
- * No scheduler hooks available. Queue an irq work. They are executed on
- * IRQ re-enable which may take a bit longer than the scheduler hooks.
- * The above WAKEUP and BALANCE paths should cover most of the cases and
- * the time to IRQ re-enable shouldn't be long.
+ * No scheduler hooks available. Use the generic irq_work path. The
+ * above WAKEUP and BALANCE paths should cover most of the cases and the
+ * time to IRQ re-enable shouldn't be long.
*/
- irq_work_queue(&rq->scx.deferred_irq_work);
+ schedule_deferred(rq);
}
/**
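schedule_deferred() above leans on the generic irq_work machinery: work queued while IRQs are disabled (e.g. under the rq lock) runs from interrupt context once the CPU can take the self-IPI, typically right after IRQs are re-enabled. A bare-bones usage sketch outside sched_ext (all names here are illustrative):

#include <linux/init.h>
#include <linux/irq_work.h>
#include <linux/module.h>
#include <linux/printk.h>

static void demo_deferred_fn(struct irq_work *work)
{
	/* runs from IRQ context shortly after being queued */
	pr_info("deferred work ran\n");
}

static DEFINE_IRQ_WORK(demo_work, demo_deferred_fn);

static int __init demo_init(void)
{
	/* queue from a context that cannot do the work synchronously */
	irq_work_queue(&demo_work);
	return 0;
}
module_init(demo_init);

static void __exit demo_exit(void)
{
	irq_work_sync(&demo_work);
}
module_exit(demo_exit);

MODULE_LICENSE("GPL");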
@@ -902,7 +971,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p)
{
- p->scx.slice = SCX_SLICE_DFL;
+ p->scx.slice = READ_ONCE(scx_slice_dfl);
__scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1);
}
@@ -916,7 +985,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
!RB_EMPTY_NODE(&p->scx.dsq_priq));
if (!is_local) {
- raw_spin_lock(&dsq->lock);
+ raw_spin_lock_nested(&dsq->lock,
+ (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0);
+
if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
scx_error(sch, "attempting to dispatch to a destroyed dsq");
/* fall back to the global dsq */
@@ -965,8 +1036,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
container_of(rbp, struct task_struct,
scx.dsq_priq);
list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node);
+ /* first task unchanged - no update needed */
} else {
list_add(&p->scx.dsq_list.node, &dsq->list);
+ /* not builtin and new task is at head - use fastpath */
+ rcu_assign_pointer(dsq->first_task, p);
}
} else {
/* a FIFO DSQ shouldn't be using PRIQ enqueuing */
@@ -974,10 +1048,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq,
scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks",
dsq->id);
- if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) {
list_add(&p->scx.dsq_list.node, &dsq->list);
- else
+ /* new task inserted at head - use fastpath */
+ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ rcu_assign_pointer(dsq->first_task, p);
+ } else {
+ bool was_empty;
+
+ was_empty = list_empty(&dsq->list);
list_add_tail(&p->scx.dsq_list.node, &dsq->list);
+ if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN))
+ rcu_assign_pointer(dsq->first_task, p);
+ }
}
/* seq records the order tasks are queued, used by BPF DSQ iterator */
@@ -1034,6 +1117,13 @@ static void task_unlink_from_dsq(struct task_struct *p,
list_del_init(&p->scx.dsq_list.node);
dsq_mod_nr(dsq, -1);
+
+ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) {
+ struct task_struct *first_task;
+
+ first_task = nldsq_next_task(dsq, NULL, false);
+ rcu_assign_pointer(dsq->first_task, first_task);
+ }
}
static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
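The rcu_assign_pointer() updates above publish dsq->first_task for the lockless peek mentioned in the merge description (the field itself is added in kernel/sched/ext_internal.h, not shown in this section). The consuming side would follow the standard RCU read pattern; roughly, as a sketch rather than the actual kfunc:

/* Peek at the head of a non-local DSQ without taking dsq->lock.
 * Purely illustrative: assumes the first_task pointer introduced by
 * this series and only dereferences it inside the RCU read section. */
static pid_t dsq_peek_first_pid(struct scx_dispatch_q *dsq)
{
	struct task_struct *p;
	pid_t pid = -1;

	rcu_read_lock();
	p = rcu_dereference(dsq->first_task);
	if (p)
		pid = p->pid;
	rcu_read_unlock();

	return pid;
}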
@@ -1041,6 +1131,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
struct scx_dispatch_q *dsq = p->scx.dsq;
bool is_local = dsq == &rq->scx.local_dsq;
+ lockdep_assert_rq_held(rq);
+
if (!dsq) {
/*
* If !dsq && on-list, @p is on @rq's ddsp_deferred_locals.
@@ -1087,6 +1179,20 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
raw_spin_unlock(&dsq->lock);
}
+/*
+ * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq
+ * and dsq are locked.
+ */
+static void dispatch_dequeue_locked(struct task_struct *p,
+ struct scx_dispatch_q *dsq)
+{
+ lockdep_assert_rq_held(task_rq(p));
+ lockdep_assert_held(&dsq->lock);
+
+ task_unlink_from_dsq(p, dsq);
+ p->scx.dsq = NULL;
+}
+
static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch,
struct rq *rq, u64 dsq_id,
struct task_struct *p)
@@ -1192,7 +1298,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p,
WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node));
list_add_tail(&p->scx.dsq_list.node,
&rq->scx.ddsp_deferred_locals);
- schedule_deferred(rq);
+ schedule_deferred_locked(rq);
return;
}
@@ -1217,6 +1323,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
{
struct scx_sched *sch = scx_root;
struct task_struct **ddsp_taskp;
+ struct scx_dispatch_q *dsq;
unsigned long qseq;
WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
@@ -1235,7 +1342,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
if (scx_rq_bypassing(rq)) {
__scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1);
- goto global;
+ goto bypass;
}
if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
@@ -1284,8 +1391,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
direct:
direct_dispatch(sch, p, enq_flags);
return;
-
+local_norefill:
+ dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
+ return;
local:
+ dsq = &rq->scx.local_dsq;
+ goto enqueue;
+global:
+ dsq = find_global_dsq(sch, p);
+ goto enqueue;
+bypass:
+ dsq = &task_rq(p)->scx.bypass_dsq;
+ goto enqueue;
+
+enqueue:
/*
* For task-ordering, slice refill must be treated as implying the end
* of the current slice. Otherwise, the longer @p stays on the CPU, the
@@ -1293,14 +1412,7 @@ local:
*/
touch_core_sched(rq, p);
refill_task_slice_dfl(sch, p);
-local_norefill:
- dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags);
- return;
-
-global:
- touch_core_sched(rq, p); /* see the comment in local: */
- refill_task_slice_dfl(sch, p);
- dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags);
+ dispatch_enqueue(sch, dsq, p, enq_flags);
}
static bool task_runnable(const struct task_struct *p)
@@ -1741,8 +1853,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
* @p is going from a non-local DSQ to a non-local DSQ. As
* $src_dsq is already locked, do an abbreviated dequeue.
*/
- task_unlink_from_dsq(p, src_dsq);
- p->scx.dsq = NULL;
+ dispatch_dequeue_locked(p, src_dsq);
raw_spin_unlock(&src_dsq->lock);
dispatch_enqueue(sch, dst_dsq, p, enq_flags);
@@ -1751,49 +1862,12 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch,
return dst_rq;
}
-/*
- * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly
- * banging on the same DSQ on a large NUMA system to the point where switching
- * to the bypass mode can take a long time. Inject artificial delays while the
- * bypass mode is switching to guarantee timely completion.
- */
-static void scx_breather(struct rq *rq)
-{
- u64 until;
-
- lockdep_assert_rq_held(rq);
-
- if (likely(!atomic_read(&scx_breather_depth)))
- return;
-
- raw_spin_rq_unlock(rq);
-
- until = ktime_get_ns() + NSEC_PER_MSEC;
-
- do {
- int cnt = 1024;
- while (atomic_read(&scx_breather_depth) && --cnt)
- cpu_relax();
- } while (atomic_read(&scx_breather_depth) &&
- time_before64(ktime_get_ns(), until));
-
- raw_spin_rq_lock(rq);
-}
-
static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq,
struct scx_dispatch_q *dsq)
{
struct task_struct *p;
retry:
/*
- * This retry loop can repeatedly race against scx_bypass() dequeueing
- * tasks from @dsq trying to put the system into the bypass mode. On
- * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock
- * the machine into soft lockups. Give a breather.
- */
- scx_breather(rq);
-
- /*
* The caller can't expect to successfully consume a task if the task's
* addition to @dsq isn't guaranteed to be visible somehow. Test
* @dsq->list without locking and skip if it seems empty.
@@ -1806,6 +1880,17 @@ retry:
nldsq_for_each_task(p, dsq) {
struct rq *task_rq = task_rq(p);
+ /*
+ * This loop can lead to multiple lockup scenarios, e.g. the BPF
+ * scheduler can put an enormous number of affinitized tasks into
+ * a contended DSQ, or the outer retry loop can repeatedly race
+ * against scx_bypass() dequeueing tasks from @dsq trying to put
+ * the system into the bypass mode. This can easily live-lock the
+ * machine. If aborting, exit from all non-bypass DSQs.
+ */
+ if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS)
+ break;
+
if (rq == task_rq) {
task_unlink_from_dsq(p, dsq);
move_local_task_to_local_dsq(p, 0, dsq, rq);
@@ -2089,8 +2174,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
if (consume_global_dsq(sch, rq))
goto has_tasks;
- if (unlikely(!SCX_HAS_OP(sch, dispatch)) ||
- scx_rq_bypassing(rq) || !scx_rq_online(rq))
+ if (scx_rq_bypassing(rq)) {
+ if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq))
+ goto has_tasks;
+ else
+ goto no_tasks;
+ }
+
+ if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq))
goto no_tasks;
dspc->rq = rq;
@@ -2241,12 +2332,6 @@ static void switch_class(struct rq *rq, struct task_struct *next)
struct scx_sched *sch = scx_root;
const struct sched_class *next_class = next->sched_class;
- /*
- * Pairs with the smp_load_acquire() issued by a CPU in
- * kick_cpus_irq_workfn() who is waiting for this CPU to perform a
- * resched.
- */
- smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1);
if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT))
return;
@@ -2286,6 +2371,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p,
struct task_struct *next)
{
struct scx_sched *sch = scx_root;
+
+ /* see kick_cpus_irq_workfn() */
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+
update_curr_scx(rq);
/* see dequeue_task_scx() on why we skip when !QUEUED */
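The kick_sync increment added above is the release half of the SCX_KICK_WAIT handshake: kick_cpus_irq_workfn() (not in this section) snapshots the counter with a load-acquire, kicks the CPU, and waits for the value to move past the snapshot, which proves the target has passed a scheduling point. The same release/acquire pairing as a runnable userspace toy (C11 atomics and pthreads stand in for the kernel primitives; all names are made up):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong kick_sync;	/* stand-in for rq->scx.kick_sync */

/* the "kicked" CPU: bump the counter at every scheduling point */
static void *target_cpu(void *arg)
{
	for (int i = 0; i < 5; i++)
		atomic_fetch_add_explicit(&kick_sync, 1, memory_order_release);
	return NULL;
}

int main(void)
{
	unsigned long before = atomic_load_explicit(&kick_sync, memory_order_acquire);
	pthread_t t;

	pthread_create(&t, NULL, target_cpu, NULL);

	/* SCX_KICK_WAIT-style wait: spin until the target has gone
	 * through at least one scheduling point after the snapshot */
	while (atomic_load_explicit(&kick_sync, memory_order_acquire) == before)
		;

	printf("target passed a scheduling point\n");
	pthread_join(t, NULL);
	return 0;
}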
@@ -2332,18 +2421,32 @@ static struct task_struct *first_local_task(struct rq *rq)
struct task_struct, scx.dsq_list.node);
}
-static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
+static struct task_struct *
+do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx)
{
struct task_struct *prev = rq->curr;
bool keep_prev, kick_idle = false;
struct task_struct *p;
+ /* see kick_cpus_irq_workfn() */
+ smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1);
+
rq_modified_clear(rq);
+
rq_unpin_lock(rq, rf);
balance_one(rq, prev);
rq_repin_lock(rq, rf);
maybe_queue_balance_callback(rq);
- if (rq_modified_above(rq, &ext_sched_class))
+
+ /*
+ * If any higher-priority sched class enqueued a runnable task on
+ * this rq during balance_one(), abort and return RETRY_TASK, so
+ * that the scheduler loop can restart.
+ *
+ * If @force_scx is true, always try to pick a SCHED_EXT task,
+ * regardless of any higher-priority sched classes activity.
+ */
+ if (!force_scx && rq_modified_above(rq, &ext_sched_class))
return RETRY_TASK;
keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
@@ -2386,6 +2489,11 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
return p;
}
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
+{
+ return do_pick_task_scx(rq, rf, false);
+}
+
#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
@@ -2842,7 +2950,7 @@ void init_scx_entity(struct sched_ext_entity *scx)
INIT_LIST_HEAD(&scx->runnable_node);
scx->runnable_at = jiffies;
scx->ddsp_dsq_id = SCX_DSQ_INVALID;
- scx->slice = SCX_SLICE_DFL;
+ scx->slice = READ_ONCE(scx_slice_dfl);
}
void scx_pre_fork(struct task_struct *p)
@@ -2908,7 +3016,7 @@ void scx_cancel_fork(struct task_struct *p)
percpu_up_read(&scx_fork_rwsem);
}
-void sched_ext_free(struct task_struct *p)
+void sched_ext_dead(struct task_struct *p)
{
unsigned long flags;
@@ -3012,6 +3120,7 @@ void scx_tg_init(struct task_group *tg)
tg->scx.weight = CGROUP_WEIGHT_DFL;
tg->scx.bw_period_us = default_bw_period_us();
tg->scx.bw_quota_us = RUNTIME_INF;
+ tg->scx.idle = false;
}
int scx_tg_online(struct task_group *tg)
@@ -3160,7 +3269,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight)
void scx_group_set_idle(struct task_group *tg, bool idle)
{
- /* TODO: Implement ops->cgroup_set_idle() */
+ struct scx_sched *sch = scx_root;
+
+ percpu_down_read(&scx_cgroup_ops_rwsem);
+
+ if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle))
+ SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL,
+ tg_cgrp(tg), idle);
+
+ /* Update the task group's idle state */
+ tg->scx.idle = idle;
+
+ percpu_up_read(&scx_cgroup_ops_rwsem);
}
void scx_group_set_bandwidth(struct task_group *tg,
@@ -3575,38 +3695,55 @@ bool scx_allow_ttwu_queue(const struct task_struct *p)
}
/**
- * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ * handle_lockup - sched_ext common lockup handler
+ * @fmt: format string
*
- * While there are various reasons why RCU CPU stalls can occur on a system
- * that may not be caused by the current BPF scheduler, try kicking out the
- * current scheduler in an attempt to recover the system to a good state before
- * issuing panics.
+ * Called on system stall or lockup condition and initiates abort of sched_ext
+ * if enabled, which may resolve the reported lockup.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the lockup. %false if sched_ext is not enabled or abort was already
+ * initiated by someone else.
*/
-bool scx_rcu_cpu_stall(void)
+static __printf(1, 2) bool handle_lockup(const char *fmt, ...)
{
struct scx_sched *sch;
+ va_list args;
+ bool ret;
- rcu_read_lock();
+ guard(rcu)();
sch = rcu_dereference(scx_root);
- if (unlikely(!sch)) {
- rcu_read_unlock();
+ if (unlikely(!sch))
return false;
- }
switch (scx_enable_state()) {
case SCX_ENABLING:
case SCX_ENABLED:
- break;
+ va_start(args, fmt);
+ ret = scx_verror(sch, fmt, args);
+ va_end(args);
+ return ret;
default:
- rcu_read_unlock();
return false;
}
+}
- scx_error(sch, "RCU CPU stall detected!");
- rcu_read_unlock();
-
- return true;
+/**
+ * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler
+ *
+ * While there are various reasons why RCU CPU stalls can occur on a system
+ * that may not be caused by the current BPF scheduler, try kicking out the
+ * current scheduler in an attempt to recover the system to a good state before
+ * issuing panics.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported RCU stall. %false if sched_ext is not enabled or someone
+ * else already initiated abort.
+ */
+bool scx_rcu_cpu_stall(void)
+{
+ return handle_lockup("RCU CPU stall detected!");
}
/**
@@ -3617,50 +3754,240 @@ bool scx_rcu_cpu_stall(void)
* live-lock the system by making many CPUs target the same DSQ to the point
* where soft-lockup detection triggers. This function is called from
* soft-lockup watchdog when the triggering point is close and tries to unjam
- * the system by enabling the breather and aborting the BPF scheduler.
+ * the system by aborting the BPF scheduler.
*/
void scx_softlockup(u32 dur_s)
{
- struct scx_sched *sch;
+ if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s))
+ return;
- rcu_read_lock();
+ printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n",
+ smp_processor_id(), dur_s);
+}
- sch = rcu_dereference(scx_root);
- if (unlikely(!sch))
- goto out_unlock;
+/**
+ * scx_hardlockup - sched_ext hardlockup handler
+ *
+ * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting
+ * numerous affinitized tasks in a single queue and directing all CPUs at it.
+ * Try kicking out the current scheduler in an attempt to recover the system to
+ * a good state before taking more drastic actions.
+ *
+ * Returns %true if sched_ext is enabled and abort was initiated, which may
+ * resolve the reported hardlockup. %false if sched_ext is not enabled or
+ * someone else already initiated abort.
+ */
+bool scx_hardlockup(int cpu)
+{
+ if (!handle_lockup("hard lockup - CPU %d", cpu))
+ return false;
- switch (scx_enable_state()) {
- case SCX_ENABLING:
- case SCX_ENABLED:
- break;
- default:
- goto out_unlock;
+ printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n",
+ cpu);
+ return true;
+}
+
+static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq,
+ struct cpumask *donee_mask, struct cpumask *resched_mask,
+ u32 nr_donor_target, u32 nr_donee_target)
+{
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+ struct task_struct *p, *n;
+ struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0);
+ s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target;
+ u32 nr_balanced = 0, min_delta_us;
+
+ /*
+ * All we want to guarantee is reasonable forward progress. No reason to
+ * fine tune. Assuming every task on @donor_dsq runs their full slice,
+ * consider offloading iff the total queued duration is over the
+ * threshold.
+ */
+ min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV;
+ if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us))
+ return 0;
+
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ list_add(&cursor.node, &donor_dsq->list);
+resume:
+ n = container_of(&cursor, struct task_struct, scx.dsq_list);
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ while ((p = n)) {
+ struct rq *donee_rq;
+ struct scx_dispatch_q *donee_dsq;
+ int donee;
+
+ n = nldsq_next_task(donor_dsq, n, false);
+
+ if (donor_dsq->nr <= nr_donor_target)
+ break;
+
+ if (cpumask_empty(donee_mask))
+ break;
+
+ donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr);
+ if (donee >= nr_cpu_ids)
+ continue;
+
+ donee_rq = cpu_rq(donee);
+ donee_dsq = &donee_rq->scx.bypass_dsq;
+
+ /*
+ * $p's rq is not locked but $p's DSQ lock protects its
+ * scheduling properties making this test safe.
+ */
+ if (!task_can_run_on_remote_rq(sch, p, donee_rq, false))
+ continue;
+
+ /*
+ * Moving $p from one non-local DSQ to another. The source rq
+ * and DSQ are already locked. Do an abbreviated dequeue and
+ * then perform enqueue without unlocking $donor_dsq.
+ *
+ * We don't want to drop and reacquire the lock on each
+ * iteration as @donor_dsq can be very long and potentially
+ * highly contended. Donee DSQs are less likely to be contended.
+ * The nested locking is safe as only this LB moves tasks
+ * between bypass DSQs.
+ */
+ dispatch_dequeue_locked(p, donor_dsq);
+ dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED);
+
+ /*
+ * $donee might have been idle and need to be woken up. No need
+ * to be clever. Kick every CPU that receives tasks.
+ */
+ cpumask_set_cpu(donee, resched_mask);
+
+ if (READ_ONCE(donee_dsq->nr) >= nr_donee_target)
+ cpumask_clear_cpu(donee, donee_mask);
+
+ nr_balanced++;
+ if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) {
+ list_move_tail(&cursor.node, &n->scx.dsq_list.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
+ cpu_relax();
+ raw_spin_rq_lock_irq(rq);
+ raw_spin_lock(&donor_dsq->lock);
+ goto resume;
+ }
}
- /* allow only one instance, cleared at the end of scx_bypass() */
- if (test_and_set_bit(0, &scx_in_softlockup))
- goto out_unlock;
+ list_del_init(&cursor.node);
+ raw_spin_unlock(&donor_dsq->lock);
+ raw_spin_rq_unlock_irq(rq);
- printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
- smp_processor_id(), dur_s, scx_root->ops.name);
+ return nr_balanced;
+}
+
+static void bypass_lb_node(struct scx_sched *sch, int node)
+{
+ const struct cpumask *node_mask = cpumask_of_node(node);
+ struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask;
+ struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask;
+ u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0;
+ u32 nr_target, nr_donor_target;
+ u32 before_min = U32_MAX, before_max = 0;
+ u32 after_min = U32_MAX, after_max = 0;
+ int cpu;
+
+ /* count the target tasks and CPUs */
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ nr_tasks += nr;
+ nr_cpus++;
+
+ before_min = min(nr, before_min);
+ before_max = max(nr, before_max);
+ }
+
+ if (!nr_cpus)
+ return;
/*
- * Some CPUs may be trapped in the dispatch paths. Enable breather
- * immediately; otherwise, we might even be able to get to scx_bypass().
+ * We don't want CPUs to have more than $nr_donor_target tasks and
+ * balancing to fill donee CPUs up to $nr_target. Once targets are
+ * calculated, find the donee CPUs.
*/
- atomic_inc(&scx_breather_depth);
+ nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus);
+ nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100);
- scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s);
-out_unlock:
- rcu_read_unlock();
+ cpumask_clear(donee_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target)
+ cpumask_set_cpu(cpu, donee_mask);
+ }
+
+ /* iterate !donee CPUs and see if they should be offloaded */
+ cpumask_clear(resched_mask);
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ struct rq *rq = cpu_rq(cpu);
+ struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq;
+
+ if (cpumask_empty(donee_mask))
+ break;
+ if (cpumask_test_cpu(cpu, donee_mask))
+ continue;
+ if (READ_ONCE(donor_dsq->nr) <= nr_donor_target)
+ continue;
+
+ nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask,
+ nr_donor_target, nr_target);
+ }
+
+ for_each_cpu(cpu, resched_mask) {
+ struct rq *rq = cpu_rq(cpu);
+
+ raw_spin_rq_lock_irq(rq);
+ resched_curr(rq);
+ raw_spin_rq_unlock_irq(rq);
+ }
+
+ for_each_cpu_and(cpu, cpu_online_mask, node_mask) {
+ u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr);
+
+ after_min = min(nr, after_min);
+ after_max = max(nr, after_max);
+
+ }
+
+ trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced,
+ before_min, before