diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:25:39 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:25:39 -0800 |
| commit | 02baaa67d9afc2e56c6e1ac6a1fb1f1dd2be366f (patch) | |
| tree | 13ae2fec8be92b2f774cfb3fd725c027740be3ac /kernel | |
| parent | 8449d3252c2603a51ffc7c36cb5bd94874378b7d (diff) | |
| parent | 1dd6c84f1c544e552848a8968599220bd464e338 (diff) | |
Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:
- Improve recovery from misbehaving BPF schedulers.
When a scheduler puts many tasks with varying affinity restrictions
on a shared DSQ, CPUs scanning through tasks they cannot run can
overwhelm the system, causing lockups.
Bypass mode now uses per-CPU DSQs with a load balancer to avoid this,
and hooks into the hardlockup detector to attempt recovery.
Add scx_cpu0 example scheduler to demonstrate this scenario.
- Add lockless peek operation for DSQs to reduce lock contention for
schedulers that need to query queue state during load balancing.
- Allow scx_bpf_reenqueue_local() to be called from anywhere in
preparation for deprecating cpu_acquire/release() callbacks in favor
of generic BPF hooks.
- Prepare for hierarchical scheduler support: add
scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
structs for future aux__prog parameter.
- Implement cgroup_set_idle() callback to notify BPF schedulers when a
cgroup's idle state changes.
- Fix migration tasks being incorrectly downgraded from
stop_sched_class to rt_sched_class across sched_ext enable/disable.
Applied late as the fix is low risk and the bug subtle but needs
stable backporting.
- Various fixes and cleanups including cgroup exit ordering,
SCX_KICK_WAIT reliability, and backward compatibility improvements.
* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
sched_ext: tools: Removing duplicate targets during non-cross compilation
sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
sched_ext: Update comments replacing breather with aborting mechanism
sched_ext: Implement load balancer for bypass mode
sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
sched_ext: Add scx_cpu0 example scheduler
sched_ext: Hook up hardlockup detector
sched_ext: Make handle_lockup() propagate scx_verror() result
sched_ext: Refactor lockup handlers into handle_lockup()
sched_ext: Make scx_exit() and scx_vexit() return bool
sched_ext: Exit dispatch and move operations immediately when aborting
sched_ext: Simplify breather mechanism with scx_aborting flag
sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
sched_ext: Refactor do_enqueue_task() local and global DSQ paths
sched_ext: Use shorter slice in bypass mode
sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
sched_ext: Minor cleanups to scx_task_iter
...
Diffstat (limited to 'kernel')
| -rw-r--r-- | kernel/fork.c | 1 | ||||
| -rw-r--r-- | kernel/sched/core.c | 6 | ||||
| -rw-r--r-- | kernel/sched/ext.c | 1067 | ||||
| -rw-r--r-- | kernel/sched/ext_idle.c | 43 | ||||
| -rw-r--r-- | kernel/sched/ext_internal.h | 29 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 4 | ||||
| -rw-r--r-- | kernel/watchdog.c | 9 |
7 files changed, 879 insertions, 280 deletions
diff --git a/kernel/fork.c b/kernel/fork.c index e5dc33f8b4b6..25d243718048 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -736,7 +736,6 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(tsk == current); unwind_task_free(tsk); - sched_ext_free(tsk); io_uring_free(tsk); cgroup_task_free(tsk); task_numa_free(tsk, true); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dba4e58d5d1a..b7801cd05d5a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5143,6 +5143,12 @@ static struct rq *finish_task_switch(struct task_struct *prev) if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); + /* + * sched_ext_dead() must come before cgroup_task_dead() to + * prevent cgroups from being removed while its member tasks are + * visible to SCX schedulers. + */ + sched_ext_dead(prev); cgroup_task_dead(prev); /* Task is done with its stack. */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6827689a0966..05f5a49e9649 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -33,9 +33,10 @@ static DEFINE_MUTEX(scx_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_enable_state_var = ATOMIC_INIT(SCX_DISABLED); -static unsigned long scx_in_softlockup; -static atomic_t scx_breather_depth = ATOMIC_INIT(0); static int scx_bypass_depth; +static cpumask_var_t scx_bypass_lb_donee_cpumask; +static cpumask_var_t scx_bypass_lb_resched_cpumask; +static bool scx_aborting; static bool scx_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -68,18 +69,18 @@ static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; static struct delayed_work scx_watchdog_work; /* - * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of pick_task sequence + * For %SCX_KICK_WAIT: Each CPU has a pointer to an array of kick_sync sequence * numbers. The arrays are allocated with kvzalloc() as size can exceed percpu * allocator limits on large machines. O(nr_cpu_ids^2) allocation, allocated * lazily when enabling and freed when disabling to avoid waste when sched_ext * isn't active. */ -struct scx_kick_pseqs { +struct scx_kick_syncs { struct rcu_head rcu; - unsigned long seqs[]; + unsigned long syncs[]; }; -static DEFINE_PER_CPU(struct scx_kick_pseqs __rcu *, scx_kick_pseqs); +static DEFINE_PER_CPU(struct scx_kick_syncs __rcu *, scx_kick_syncs); /* * Direct dispatch marker. @@ -143,26 +144,70 @@ static struct scx_dump_data scx_dump_data = { /* /sys/kernel/sched_ext interface */ static struct kset *scx_kset; +/* + * Parameters that can be adjusted through /sys/module/sched_ext/parameters. + * There usually is no reason to modify these as normal scheduler operation + * shouldn't be affected by them. The knobs are primarily for debugging. + */ +static u64 scx_slice_dfl = SCX_SLICE_DFL; +static unsigned int scx_slice_bypass_us = SCX_SLICE_BYPASS / NSEC_PER_USEC; +static unsigned int scx_bypass_lb_intv_us = SCX_BYPASS_LB_DFL_INTV_US; + +static int set_slice_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 100, 100 * USEC_PER_MSEC); +} + +static const struct kernel_param_ops slice_us_param_ops = { + .set = set_slice_us, + .get = param_get_uint, +}; + +static int set_bypass_lb_intv_us(const char *val, const struct kernel_param *kp) +{ + return param_set_uint_minmax(val, kp, 0, 10 * USEC_PER_SEC); +} + +static const struct kernel_param_ops bypass_lb_intv_us_param_ops = { + .set = set_bypass_lb_intv_us, + .get = param_get_uint, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "sched_ext." + +module_param_cb(slice_bypass_us, &slice_us_param_ops, &scx_slice_bypass_us, 0600); +MODULE_PARM_DESC(slice_bypass_us, "bypass slice in microseconds, applied on [un]load (100us to 100ms)"); +module_param_cb(bypass_lb_intv_us, &bypass_lb_intv_us_param_ops, &scx_bypass_lb_intv_us, 0600); +MODULE_PARM_DESC(bypass_lb_intv_us, "bypass load balance interval in microseconds (0 (disable) to 10s)"); + +#undef MODULE_PARAM_PREFIX + #define CREATE_TRACE_POINTS #include <trace/events/sched_ext.h> static void process_ddsp_deferred_locals(struct rq *rq); +static u32 reenq_local(struct rq *rq); static void scx_kick_cpu(struct scx_sched *sch, s32 cpu, u64 flags); -static void scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, +static bool scx_vexit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, va_list args); -static __printf(4, 5) void scx_exit(struct scx_sched *sch, +static __printf(4, 5) bool scx_exit(struct scx_sched *sch, enum scx_exit_kind kind, s64 exit_code, const char *fmt, ...) { va_list args; + bool ret; va_start(args, fmt); - scx_vexit(sch, kind, exit_code, fmt, args); + ret = scx_vexit(sch, kind, exit_code, fmt, args); va_end(args); + + return ret; } #define scx_error(sch, fmt, args...) scx_exit((sch), SCX_EXIT_ERROR, 0, fmt, ##args) +#define scx_verror(sch, fmt, args) scx_vexit((sch), SCX_EXIT_ERROR, 0, fmt, args) #define SCX_HAS_OP(sch, op) test_bit(SCX_OP_IDX(op), (sch)->has_op) @@ -200,7 +245,15 @@ static struct scx_dispatch_q *find_global_dsq(struct scx_sched *sch, static struct scx_dispatch_q *find_user_dsq(struct scx_sched *sch, u64 dsq_id) { - return rhashtable_lookup_fast(&sch->dsq_hash, &dsq_id, dsq_hash_params); + return rhashtable_lookup(&sch->dsq_hash, &dsq_id, dsq_hash_params); +} + +static const struct sched_class *scx_setscheduler_class(struct task_struct *p) +{ + if (p->sched_class == &stop_sched_class) + return &stop_sched_class; + + return __setscheduler_class(p->policy, p->prio); } /* @@ -469,19 +522,16 @@ struct scx_task_iter { * RCU read lock or obtaining a reference count. * * All tasks which existed when the iteration started are guaranteed to be - * visited as long as they still exist. + * visited as long as they are not dead. */ static void scx_task_iter_start(struct scx_task_iter *iter) { - BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & - ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); + memset(iter, 0, sizeof(*iter)); raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); - iter->locked_task = NULL; - iter->cnt = 0; iter->list_locked = true; } @@ -547,14 +597,13 @@ static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) struct list_head *cursor = &iter->cursor.tasks_node; struct sched_ext_entity *pos; - __scx_task_iter_maybe_relock(iter); - if (!(++iter->cnt % SCX_TASK_ITER_BATCH)) { scx_task_iter_unlock(iter); cond_resched(); - __scx_task_iter_maybe_relock(iter); } + __scx_task_iter_maybe_relock(iter); + list_for_each_entry(pos, cursor, tasks_node) { if (&pos->tasks_node == &scx_tasks) return NULL; @@ -755,6 +804,11 @@ static int ops_sanitize_err(struct scx_sched *sch, const char *ops_name, s32 err static void run_deferred(struct rq *rq) { process_ddsp_deferred_locals(rq); + + if (local_read(&rq->scx.reenq_local_deferred)) { + local_set(&rq->scx.reenq_local_deferred, 0); + reenq_local(rq); + } } static void deferred_bal_cb_workfn(struct rq *rq) @@ -775,12 +829,28 @@ static void deferred_irq_workfn(struct irq_work *irq_work) * schedule_deferred - Schedule execution of deferred actions on an rq * @rq: target rq * - * Schedule execution of deferred actions on @rq. Must be called with @rq - * locked. Deferred actions are executed with @rq locked but unpinned, and thus - * can unlock @rq to e.g. migrate tasks to other rqs. + * Schedule execution of deferred actions on @rq. Deferred actions are executed + * with @rq locked but unpinned, and thus can unlock @rq to e.g. migrate tasks + * to other rqs. */ static void schedule_deferred(struct rq *rq) { + /* + * Queue an irq work. They are executed on IRQ re-enable which may take + * a bit longer than the scheduler hook in schedule_deferred_locked(). + */ + irq_work_queue(&rq->scx.deferred_irq_work); +} + +/** + * schedule_deferred_locked - Schedule execution of deferred actions on an rq + * @rq: target rq + * + * Schedule execution of deferred actions on @rq. Equivalent to + * schedule_deferred() but requires @rq to be locked and can be more efficient. + */ +static void schedule_deferred_locked(struct rq *rq) +{ lockdep_assert_rq_held(rq); /* @@ -812,12 +882,11 @@ static void schedule_deferred(struct rq *rq) } /* - * No scheduler hooks available. Queue an irq work. They are executed on - * IRQ re-enable which may take a bit longer than the scheduler hooks. - * The above WAKEUP and BALANCE paths should cover most of the cases and - * the time to IRQ re-enable shouldn't be long. + * No scheduler hooks available. Use the generic irq_work path. The + * above WAKEUP and BALANCE paths should cover most of the cases and the + * time to IRQ re-enable shouldn't be long. */ - irq_work_queue(&rq->scx.deferred_irq_work); + schedule_deferred(rq); } /** @@ -902,7 +971,7 @@ static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) static void refill_task_slice_dfl(struct scx_sched *sch, struct task_struct *p) { - p->scx.slice = SCX_SLICE_DFL; + p->scx.slice = READ_ONCE(scx_slice_dfl); __scx_add_event(sch, SCX_EV_REFILL_SLICE_DFL, 1); } @@ -916,7 +985,9 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, !RB_EMPTY_NODE(&p->scx.dsq_priq)); if (!is_local) { - raw_spin_lock(&dsq->lock); + raw_spin_lock_nested(&dsq->lock, + (enq_flags & SCX_ENQ_NESTED) ? SINGLE_DEPTH_NESTING : 0); + if (unlikely(dsq->id == SCX_DSQ_INVALID)) { scx_error(sch, "attempting to dispatch to a destroyed dsq"); /* fall back to the global dsq */ @@ -965,8 +1036,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, container_of(rbp, struct task_struct, scx.dsq_priq); list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); + /* first task unchanged - no update needed */ } else { list_add(&p->scx.dsq_list.node, &dsq->list); + /* not builtin and new task is at head - use fastpath */ + rcu_assign_pointer(dsq->first_task, p); } } else { /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ @@ -974,10 +1048,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", dsq->id); - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { list_add(&p->scx.dsq_list.node, &dsq->list); - else + /* new task inserted at head - use fastpath */ + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } else { + bool was_empty; + + was_empty = list_empty(&dsq->list); list_add_tail(&p->scx.dsq_list.node, &dsq->list); + if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) + rcu_assign_pointer(dsq->first_task, p); + } } /* seq records the order tasks are queued, used by BPF DSQ iterator */ @@ -1034,6 +1117,13 @@ static void task_unlink_from_dsq(struct task_struct *p, list_del_init(&p->scx.dsq_list.node); dsq_mod_nr(dsq, -1); + + if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { + struct task_struct *first_task; + + first_task = nldsq_next_task(dsq, NULL, false); + rcu_assign_pointer(dsq->first_task, first_task); + } } static void dispatch_dequeue(struct rq *rq, struct task_struct *p) @@ -1041,6 +1131,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &rq->scx.local_dsq; + lockdep_assert_rq_held(rq); + if (!dsq) { /* * If !dsq && on-list, @p is on @rq's ddsp_deferred_locals. @@ -1087,6 +1179,20 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) raw_spin_unlock(&dsq->lock); } +/* + * Abbreviated version of dispatch_dequeue() that can be used when both @p's rq + * and dsq are locked. + */ +static void dispatch_dequeue_locked(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + lockdep_assert_rq_held(task_rq(p)); + lockdep_assert_held(&dsq->lock); + + task_unlink_from_dsq(p, dsq); + p->scx.dsq = NULL; +} + static struct scx_dispatch_q *find_dsq_for_dispatch(struct scx_sched *sch, struct rq *rq, u64 dsq_id, struct task_struct *p) @@ -1192,7 +1298,7 @@ static void direct_dispatch(struct scx_sched *sch, struct task_struct *p, WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_list.node)); list_add_tail(&p->scx.dsq_list.node, &rq->scx.ddsp_deferred_locals); - schedule_deferred(rq); + schedule_deferred_locked(rq); return; } @@ -1217,6 +1323,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, { struct scx_sched *sch = scx_root; struct task_struct **ddsp_taskp; + struct scx_dispatch_q *dsq; unsigned long qseq; WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); @@ -1235,7 +1342,7 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, if (scx_rq_bypassing(rq)) { __scx_add_event(sch, SCX_EV_BYPASS_DISPATCH, 1); - goto global; + goto bypass; } if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) @@ -1284,8 +1391,20 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, direct: direct_dispatch(sch, p, enq_flags); return; - +local_norefill: + dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); + return; local: + dsq = &rq->scx.local_dsq; + goto enqueue; +global: + dsq = find_global_dsq(sch, p); + goto enqueue; +bypass: + dsq = &task_rq(p)->scx.bypass_dsq; + goto enqueue; + +enqueue: /* * For task-ordering, slice refill must be treated as implying the end * of the current slice. Otherwise, the longer @p stays on the CPU, the @@ -1293,14 +1412,7 @@ local: */ touch_core_sched(rq, p); refill_task_slice_dfl(sch, p); -local_norefill: - dispatch_enqueue(sch, &rq->scx.local_dsq, p, enq_flags); - return; - -global: - touch_core_sched(rq, p); /* see the comment in local: */ - refill_task_slice_dfl(sch, p); - dispatch_enqueue(sch, find_global_dsq(sch, p), p, enq_flags); + dispatch_enqueue(sch, dsq, p, enq_flags); } static bool task_runnable(const struct task_struct *p) @@ -1741,8 +1853,7 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, * @p is going from a non-local DSQ to a non-local DSQ. As * $src_dsq is already locked, do an abbreviated dequeue. */ - task_unlink_from_dsq(p, src_dsq); - p->scx.dsq = NULL; + dispatch_dequeue_locked(p, src_dsq); raw_spin_unlock(&src_dsq->lock); dispatch_enqueue(sch, dst_dsq, p, enq_flags); @@ -1751,49 +1862,12 @@ static struct rq *move_task_between_dsqs(struct scx_sched *sch, return dst_rq; } -/* - * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly - * banging on the same DSQ on a large NUMA system to the point where switching - * to the bypass mode can take a long time. Inject artificial delays while the - * bypass mode is switching to guarantee timely completion. - */ -static void scx_breather(struct rq *rq) -{ - u64 until; - - lockdep_assert_rq_held(rq); - - if (likely(!atomic_read(&scx_breather_depth))) - return; - - raw_spin_rq_unlock(rq); - - until = ktime_get_ns() + NSEC_PER_MSEC; - - do { - int cnt = 1024; - while (atomic_read(&scx_breather_depth) && --cnt) - cpu_relax(); - } while (atomic_read(&scx_breather_depth) && - time_before64(ktime_get_ns(), until)); - - raw_spin_rq_lock(rq); -} - static bool consume_dispatch_q(struct scx_sched *sch, struct rq *rq, struct scx_dispatch_q *dsq) { struct task_struct *p; retry: /* - * This retry loop can repeatedly race against scx_bypass() dequeueing - * tasks from @dsq trying to put the system into the bypass mode. On - * some multi-socket machines (e.g. 2x Intel 8480c), this can live-lock - * the machine into soft lockups. Give a breather. - */ - scx_breather(rq); - - /* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test * @dsq->list without locking and skip if it seems empty. @@ -1806,6 +1880,17 @@ retry: nldsq_for_each_task(p, dsq) { struct rq *task_rq = task_rq(p); + /* + * This loop can lead to multiple lockup scenarios, e.g. the BPF + * scheduler can put an enormous number of affinitized tasks into + * a contended DSQ, or the outer retry loop can repeatedly race + * against scx_bypass() dequeueing tasks from @dsq trying to put + * the system into the bypass mode. This can easily live-lock the + * machine. If aborting, exit from all non-bypass DSQs. + */ + if (unlikely(READ_ONCE(scx_aborting)) && dsq->id != SCX_DSQ_BYPASS) + break; + if (rq == task_rq) { task_unlink_from_dsq(p, dsq); move_local_task_to_local_dsq(p, 0, dsq, rq); @@ -2089,8 +2174,14 @@ static int balance_one(struct rq *rq, struct task_struct *prev) if (consume_global_dsq(sch, rq)) goto has_tasks; - if (unlikely(!SCX_HAS_OP(sch, dispatch)) || - scx_rq_bypassing(rq) || !scx_rq_online(rq)) + if (scx_rq_bypassing(rq)) { + if (consume_dispatch_q(sch, rq, &rq->scx.bypass_dsq)) + goto has_tasks; + else + goto no_tasks; + } + + if (unlikely(!SCX_HAS_OP(sch, dispatch)) || !scx_rq_online(rq)) goto no_tasks; dspc->rq = rq; @@ -2241,12 +2332,6 @@ static void switch_class(struct rq *rq, struct task_struct *next) struct scx_sched *sch = scx_root; const struct sched_class *next_class = next->sched_class; - /* - * Pairs with the smp_load_acquire() issued by a CPU in - * kick_cpus_irq_workfn() who is waiting for this CPU to perform a - * resched. - */ - smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); if (!(sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT)) return; @@ -2286,6 +2371,10 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p, struct task_struct *next) { struct scx_sched *sch = scx_root; + + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + update_curr_scx(rq); /* see dequeue_task_scx() on why we skip when !QUEUED */ @@ -2332,18 +2421,32 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) +static struct task_struct * +do_pick_task_scx(struct rq *rq, struct rq_flags *rf, bool force_scx) { struct task_struct *prev = rq->curr; bool keep_prev, kick_idle = false; struct task_struct *p; + /* see kick_cpus_irq_workfn() */ + smp_store_release(&rq->scx.kick_sync, rq->scx.kick_sync + 1); + rq_modified_clear(rq); + rq_unpin_lock(rq, rf); balance_one(rq, prev); rq_repin_lock(rq, rf); maybe_queue_balance_callback(rq); - if (rq_modified_above(rq, &ext_sched_class)) + + /* + * If any higher-priority sched class enqueued a runnable task on + * this rq during balance_one(), abort and return RETRY_TASK, so + * that the scheduler loop can restart. + * + * If @force_scx is true, always try to pick a SCHED_EXT task, + * regardless of any higher-priority sched classes activity. + */ + if (!force_scx && rq_modified_above(rq, &ext_sched_class)) return RETRY_TASK; keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; @@ -2386,6 +2489,11 @@ static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) return p; } +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) +{ + return do_pick_task_scx(rq, rf, false); +} + #ifdef CONFIG_SCHED_CORE /** * scx_prio_less - Task ordering for core-sched @@ -2842,7 +2950,7 @@ void init_scx_entity(struct sched_ext_entity *scx) INIT_LIST_HEAD(&scx->runnable_node); scx->runnable_at = jiffies; scx->ddsp_dsq_id = SCX_DSQ_INVALID; - scx->slice = SCX_SLICE_DFL; + scx->slice = READ_ONCE(scx_slice_dfl); } void scx_pre_fork(struct task_struct *p) @@ -2908,7 +3016,7 @@ void scx_cancel_fork(struct task_struct *p) percpu_up_read(&scx_fork_rwsem); } -void sched_ext_free(struct task_struct *p) +void sched_ext_dead(struct task_struct *p) { unsigned long flags; @@ -3012,6 +3120,7 @@ void scx_tg_init(struct task_group *tg) tg->scx.weight = CGROUP_WEIGHT_DFL; tg->scx.bw_period_us = default_bw_period_us(); tg->scx.bw_quota_us = RUNTIME_INF; + tg->scx.idle = false; } int scx_tg_online(struct task_group *tg) @@ -3160,7 +3269,18 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - /* TODO: Implement ops->cgroup_set_idle() */ + struct scx_sched *sch = scx_root; + + percpu_down_read(&scx_cgroup_ops_rwsem); + + if (scx_cgroup_enabled && SCX_HAS_OP(sch, cgroup_set_idle)) + SCX_CALL_OP(sch, SCX_KF_UNLOCKED, cgroup_set_idle, NULL, + tg_cgrp(tg), idle); + + /* Update the task group's idle state */ + tg->scx.idle = idle; + + percpu_up_read(&scx_cgroup_ops_rwsem); } void scx_group_set_bandwidth(struct task_group *tg, @@ -3575,38 +3695,55 @@ bool scx_allow_ttwu_queue(const struct task_struct *p) } /** - * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * handle_lockup - sched_ext common lockup handler + * @fmt: format string * - * While there are various reasons why RCU CPU stalls can occur on a system - * that may not be caused by the current BPF scheduler, try kicking out the - * current scheduler in an attempt to recover the system to a good state before - * issuing panics. + * Called on system stall or lockup condition and initiates abort of sched_ext + * if enabled, which may resolve the reported lockup. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the lockup. %false if sched_ext is not enabled or abort was already + * initiated by someone else. */ -bool scx_rcu_cpu_stall(void) +static __printf(1, 2) bool handle_lockup(const char *fmt, ...) { struct scx_sched *sch; + va_list args; + bool ret; - rcu_read_lock(); + guard(rcu)(); sch = rcu_dereference(scx_root); - if (unlikely(!sch)) { - rcu_read_unlock(); + if (unlikely(!sch)) return false; - } switch (scx_enable_state()) { case SCX_ENABLING: case SCX_ENABLED: - break; + va_start(args, fmt); + ret = scx_verror(sch, fmt, args); + va_end(args); + return ret; default: - rcu_read_unlock(); return false; } +} - scx_error(sch, "RCU CPU stall detected!"); - rcu_read_unlock(); - - return true; +/** + * scx_rcu_cpu_stall - sched_ext RCU CPU stall handler + * + * While there are various reasons why RCU CPU stalls can occur on a system + * that may not be caused by the current BPF scheduler, try kicking out the + * current scheduler in an attempt to recover the system to a good state before + * issuing panics. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported RCU stall. %false if sched_ext is not enabled or someone + * else already initiated abort. + */ +bool scx_rcu_cpu_stall(void) +{ + return handle_lockup("RCU CPU stall detected!"); } /** @@ -3617,50 +3754,240 @@ bool scx_rcu_cpu_stall(void) * live-lock the system by making many CPUs target the same DSQ to the point * where soft-lockup detection triggers. This function is called from * soft-lockup watchdog when the triggering point is close and tries to unjam - * the system by enabling the breather and aborting the BPF scheduler. + * the system and aborting the BPF scheduler. */ void scx_softlockup(u32 dur_s) { - struct scx_sched *sch; + if (!handle_lockup("soft lockup - CPU %d stuck for %us", smp_processor_id(), dur_s)) + return; - rcu_read_lock(); + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU %d stuck for %us, disabling BPF scheduler\n", + smp_processor_id(), dur_s); +} - sch = rcu_dereference(scx_root); - if (unlikely(!sch)) - goto out_unlock; +/** + * scx_hardlockup - sched_ext hardlockup handler + * + * A poorly behaving BPF scheduler can trigger hard lockup by e.g. putting + * numerous affinitized tasks in a single queue and directing all CPUs at it. + * Try kicking out the current scheduler in an attempt to recover the system to + * a good state before taking more drastic actions. + * + * Returns %true if sched_ext is enabled and abort was initiated, which may + * resolve the reported hardlockdup. %false if sched_ext is not enabled or + * someone else already initiated abort. + */ +bool scx_hardlockup(int cpu) +{ + if (!handle_lockup("hard lockup - CPU %d", cpu)) + return false; - switch (scx_enable_state()) { - case SCX_ENABLING: - case SCX_ENABLED: - break; - default: - goto out_unlock; + printk_deferred(KERN_ERR "sched_ext: Hard lockup - CPU %d, disabling BPF scheduler\n", + cpu); + return true; +} + +static u32 bypass_lb_cpu(struct scx_sched *sch, struct rq *rq, + struct cpumask *donee_mask, struct cpumask *resched_mask, + u32 nr_donor_target, u32 nr_donee_target) +{ + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + struct task_struct *p, *n; + struct scx_dsq_list_node cursor = INIT_DSQ_LIST_CURSOR(cursor, 0, 0); + s32 delta = READ_ONCE(donor_dsq->nr) - nr_donor_target; + u32 nr_balanced = 0, min_delta_us; + + /* + * All we want to guarantee is reasonable forward progress. No reason to + * fine tune. Assuming every task on @donor_dsq runs their full slice, + * consider offloading iff the total queued duration is over the + * threshold. + */ + min_delta_us = scx_bypass_lb_intv_us / SCX_BYPASS_LB_MIN_DELTA_DIV; + if (delta < DIV_ROUND_UP(min_delta_us, scx_slice_bypass_us)) + return 0; + + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + list_add(&cursor.node, &donor_dsq->list); +resume: + n = container_of(&cursor, struct task_struct, scx.dsq_list); + n = nldsq_next_task(donor_dsq, n, false); + + while ((p = n)) { + struct rq *donee_rq; + struct scx_dispatch_q *donee_dsq; + int donee; + + n = nldsq_next_task(donor_dsq, n, false); + + if (donor_dsq->nr <= nr_donor_target) + break; + + if (cpumask_empty(donee_mask)) + break; + + donee = cpumask_any_and_distribute(donee_mask, p->cpus_ptr); + if (donee >= nr_cpu_ids) + continue; + + donee_rq = cpu_rq(donee); + donee_dsq = &donee_rq->scx.bypass_dsq; + + /* + * $p's rq is not locked but $p's DSQ lock protects its + * scheduling properties making this test safe. + */ + if (!task_can_run_on_remote_rq(sch, p, donee_rq, false)) + continue; + + /* + * Moving $p from one non-local DSQ to another. The source rq + * and DSQ are already locked. Do an abbreviated dequeue and + * then perform enqueue without unlocking $donor_dsq. + * + * We don't want to drop and reacquire the lock on each + * iteration as @donor_dsq can be very long and potentially + * highly contended. Donee DSQs are less likely to be contended. + * The nested locking is safe as only this LB moves tasks + * between bypass DSQs. + */ + dispatch_dequeue_locked(p, donor_dsq); + dispatch_enqueue(sch, donee_dsq, p, SCX_ENQ_NESTED); + + /* + * $donee might have been idle and need to be woken up. No need + * to be clever. Kick every CPU that receives tasks. + */ + cpumask_set_cpu(donee, resched_mask); + + if (READ_ONCE(donee_dsq->nr) >= nr_donee_target) + cpumask_clear_cpu(donee, donee_mask); + + nr_balanced++; + if (!(nr_balanced % SCX_BYPASS_LB_BATCH) && n) { + list_move_tail(&cursor.node, &n->scx.dsq_list.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); + cpu_relax(); + raw_spin_rq_lock_irq(rq); + raw_spin_lock(&donor_dsq->lock); + goto resume; + } } - /* allow only one instance, cleared at the end of scx_bypass() */ - if (test_and_set_bit(0, &scx_in_softlockup)) - goto out_unlock; + list_del_init(&cursor.node); + raw_spin_unlock(&donor_dsq->lock); + raw_spin_rq_unlock_irq(rq); - printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", - smp_processor_id(), dur_s, scx_root->ops.name); + return nr_balanced; +} + +static void bypass_lb_node(struct scx_sched *sch, int node) +{ + const struct cpumask *node_mask = cpumask_of_node(node); + struct cpumask *donee_mask = scx_bypass_lb_donee_cpumask; + struct cpumask *resched_mask = scx_bypass_lb_resched_cpumask; + u32 nr_tasks = 0, nr_cpus = 0, nr_balanced = 0; + u32 nr_target, nr_donor_target; + u32 before_min = U32_MAX, before_max = 0; + u32 after_min = U32_MAX, after_max = 0; + int cpu; + + /* count the target tasks and CPUs */ + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + nr_tasks += nr; + nr_cpus++; + + before_min = min(nr, before_min); + before_max = max(nr, before_max); + } + + if (!nr_cpus) + return; /* - * Some CPUs may be trapped in the dispatch paths. Enable breather - * immediately; otherwise, we might even be able to get to scx_bypass(). + * We don't want CPUs to have more than $nr_donor_target tasks and + * balancing to fill donee CPUs upto $nr_target. Once targets are + * calculated, find the donee CPUs. */ - atomic_inc(&scx_breather_depth); + nr_target = DIV_ROUND_UP(nr_tasks, nr_cpus); + nr_donor_target = DIV_ROUND_UP(nr_target * SCX_BYPASS_LB_DONOR_PCT, 100); - scx_error(sch, "soft lockup - CPU#%d stuck for %us", smp_processor_id(), dur_s); -out_unlock: - rcu_read_unlock(); + cpumask_clear(donee_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + if (READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr) < nr_target) + cpumask_set_cpu(cpu, donee_mask); + } + + /* iterate !donee CPUs and see if they should be offloaded */ + cpumask_clear(resched_mask); + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + struct rq *rq = cpu_rq(cpu); + struct scx_dispatch_q *donor_dsq = &rq->scx.bypass_dsq; + + if (cpumask_empty(donee_mask)) + break; + if (cpumask_test_cpu(cpu, donee_mask)) + continue; + if (READ_ONCE(donor_dsq->nr) <= nr_donor_target) + continue; + + nr_balanced += bypass_lb_cpu(sch, rq, donee_mask, resched_mask, + nr_donor_target, nr_target); + } + + for_each_cpu(cpu, resched_mask) { + struct rq *rq = cpu_rq(cpu); + + raw_spin_rq_lock_irq(rq); + resched_curr(rq); + raw_spin_rq_unlock_irq(rq); + } + + for_each_cpu_and(cpu, cpu_online_mask, node_mask) { + u32 nr = READ_ONCE(cpu_rq(cpu)->scx.bypass_dsq.nr); + + after_min = min(nr, after_min); + after_max = max(nr, after_max); + + } + + trace_sched_ext_bypass_lb(node, nr_cpus, nr_tasks, nr_balanced, + before_min, before |
