| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:25:39 -0800 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:25:39 -0800 |
| commit | 02baaa67d9afc2e56c6e1ac6a1fb1f1dd2be366f | |
| tree | 13ae2fec8be92b2f774cfb3fd725c027740be3ac /tools/sched_ext/scx_qmap.bpf.c | |
| parent | 8449d3252c2603a51ffc7c36cb5bd94874378b7d | |
| parent | 1dd6c84f1c544e552848a8968599220bd464e338 | |
Merge tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext updates from Tejun Heo:
- Improve recovery from misbehaving BPF schedulers.
When a scheduler puts many tasks with varying affinity restrictions
on a shared DSQ, CPUs scanning through tasks they cannot run can
overwhelm the system, causing lockups.
Bypass mode now uses per-CPU DSQs with a load balancer to avoid this,
and hooks into the hardlockup detector to attempt recovery.
Add scx_cpu0 example scheduler to demonstrate this scenario.
- Add lockless peek operation for DSQs to reduce lock contention for
schedulers that need to query queue state during load balancing.
- Allow scx_bpf_reenqueue_local() to be called from anywhere in
preparation for deprecating cpu_acquire/release() callbacks in favor
of generic BPF hooks.
- Prepare for hierarchical scheduler support: add
scx_bpf_task_set_slice() and scx_bpf_task_set_dsq_vtime() kfuncs,
make scx_bpf_dsq_insert*() return bool, and wrap kfunc args in
structs for future aux__prog parameter (a minimal sketch follows this list).
- Implement cgroup_set_idle() callback to notify BPF schedulers when a
cgroup's idle state changes.
- Fix migration tasks being incorrectly downgraded from
stop_sched_class to rt_sched_class across sched_ext enable/disable.
Applied late as the fix is low risk and the bug, while subtle, needs
stable backporting.
- Various fixes and cleanups including cgroup exit ordering,
SCX_KICK_WAIT reliability, and backward compatibility improvements.
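As a rough illustration of the hierarchical-scheduler preparation item above, the sketch below shows how an enqueue path might combine the bool-returning insert with the new per-task setter. It is not taken from this tree: example_enqueue and MY_DSQ are hypothetical, and the scx_bpf_task_set_slice() signature (task pointer plus a u64 nanosecond value) is assumed from the description above; scx_bpf_task_set_dsq_vtime() would be used analogously for vtime.

```c
/* Hypothetical example; MY_DSQ stands in for a DSQ created in ops.init(). */
void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	/* bump the task's slice via the new kfunc (assumed signature) */
	scx_bpf_task_set_slice(p, SCX_SLICE_DFL * 2);

	/*
	 * scx_bpf_dsq_insert*() now returns bool; fall back to the global
	 * DSQ if inserting into the custom DSQ fails. A slice of 0 keeps
	 * the value set above.
	 */
	if (!scx_bpf_dsq_insert(p, MY_DSQ, 0, enq_flags))
		scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, 0, enq_flags);
}
```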
* tag 'sched_ext-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (44 commits)
sched_ext: Fix incorrect sched_class settings for per-cpu migration tasks
sched_ext: tools: Removing duplicate targets during non-cross compilation
sched_ext: Use kvfree_rcu() to release per-cpu ksyncs object
sched_ext: Pass locked CPU parameter to scx_hardlockup() and add docs
sched_ext: Update comments replacing breather with aborting mechanism
sched_ext: Implement load balancer for bypass mode
sched_ext: Factor out abbreviated dispatch dequeue into dispatch_dequeue_locked()
sched_ext: Factor out scx_dsq_list_node cursor initialization into INIT_DSQ_LIST_CURSOR
sched_ext: Add scx_cpu0 example scheduler
sched_ext: Hook up hardlockup detector
sched_ext: Make handle_lockup() propagate scx_verror() result
sched_ext: Refactor lockup handlers into handle_lockup()
sched_ext: Make scx_exit() and scx_vexit() return bool
sched_ext: Exit dispatch and move operations immediately when aborting
sched_ext: Simplify breather mechanism with scx_aborting flag
sched_ext: Use per-CPU DSQs instead of per-node global DSQs in bypass mode
sched_ext: Refactor do_enqueue_task() local and global DSQ paths
sched_ext: Use shorter slice in bypass mode
sched_ext: Mark racy bitfields to prevent adding fields that can't tolerate races
sched_ext: Minor cleanups to scx_task_iter
...
Diffstat (limited to 'tools/sched_ext/scx_qmap.bpf.c')
| -rw-r--r-- | tools/sched_ext/scx_qmap.bpf.c | 52 |
1 file changed, 33 insertions, 19 deletions
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 3072b593f898..df21fad0c438 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -202,6 +202,9 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags)
 	void *ring;
 	s32 cpu;
 
+	if (enq_flags & SCX_ENQ_REENQ)
+		__sync_fetch_and_add(&nr_reenqueued, 1);
+
 	if (p->flags & PF_KTHREAD) {
 		if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth))
 			return;
@@ -320,12 +323,9 @@ static bool dispatch_highpri(bool from_timer)
 
 		if (tctx->highpri) {
 			/* exercise the set_*() and vtime interface too */
-			__COMPAT_scx_bpf_dsq_move_set_slice(
-				BPF_FOR_EACH_ITER, slice_ns * 2);
-			__COMPAT_scx_bpf_dsq_move_set_vtime(
-				BPF_FOR_EACH_ITER, highpri_seq++);
-			__COMPAT_scx_bpf_dsq_move_vtime(
-				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
+			scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, slice_ns * 2);
+			scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, highpri_seq++);
+			scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
 		}
 	}
 
@@ -342,9 +342,8 @@ static bool dispatch_highpri(bool from_timer)
 		else
 			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
 
-		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
-					      SCX_DSQ_LOCAL_ON | cpu,
-					      SCX_ENQ_PREEMPT)) {
+		if (scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p, SCX_DSQ_LOCAL_ON | cpu,
+				     SCX_ENQ_PREEMPT)) {
 			if (cpu == this_cpu) {
 				dispatched = true;
 				__sync_fetch_and_add(&nr_expedited_local, 1);
@@ -533,20 +532,35 @@ bool BPF_STRUCT_OPS(qmap_core_sched_before,
 	return task_qdist(a) > task_qdist(b);
 }
 
-void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+SEC("tp_btf/sched_switch")
+int BPF_PROG(qmap_sched_switch, bool preempt, struct task_struct *prev,
+	     struct task_struct *next, unsigned long prev_state)
 {
-	u32 cnt;
+	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+		return 0;
 
 	/*
-	 * Called when @cpu is taken by a higher priority scheduling class. This
-	 * makes @cpu no longer available for executing sched_ext tasks. As we
-	 * don't want the tasks in @cpu's local dsq to sit there until @cpu
-	 * becomes available again, re-enqueue them into the global dsq. See
-	 * %SCX_ENQ_REENQ handling in qmap_enqueue().
+	 * If @cpu is taken by a higher priority scheduling class, it is no
+	 * longer available for executing sched_ext tasks. As we don't want the
+	 * tasks in @cpu's local dsq to sit there until @cpu becomes available
+	 * again, re-enqueue them into the global dsq. See %SCX_ENQ_REENQ
+	 * handling in qmap_enqueue().
 	 */
-	cnt = scx_bpf_reenqueue_local();
-	if (cnt)
-		__sync_fetch_and_add(&nr_reenqueued, cnt);
+	switch (next->policy) {
+	case 1:	/* SCHED_FIFO */
+	case 2:	/* SCHED_RR */
+	case 6:	/* SCHED_DEADLINE */
+		scx_bpf_reenqueue_local();
+	}
+
+	return 0;
+}
+
+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args)
+{
+	/* see qmap_sched_switch() to learn how to do this on newer kernels */
+	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
+		scx_bpf_reenqueue_local();
 }
 
 s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p,
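The numeric cases in qmap_sched_switch() above are the standard scheduling policy values. Purely as an illustration (the defines are not part of this patch; their values come from include/uapi/linux/sched.h), the same tracepoint handler could spell them out with named constants:

```c
/* Values match include/uapi/linux/sched.h; shown here for illustration only. */
#define SCHED_FIFO	1
#define SCHED_RR	2
#define SCHED_DEADLINE	6

SEC("tp_btf/sched_switch")
int BPF_PROG(example_sched_switch, bool preempt, struct task_struct *prev,
	     struct task_struct *next, unsigned long prev_state)
{
	/* only meaningful where reenqueue_local may be called from anywhere */
	if (!__COMPAT_scx_bpf_reenqueue_local_from_anywhere())
		return 0;

	/* a higher-priority class is taking the CPU; requeue sched_ext tasks */
	switch (next->policy) {
	case SCHED_FIFO:
	case SCHED_RR:
	case SCHED_DEADLINE:
		scx_bpf_reenqueue_local();
	}

	return 0;
}
```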