| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-01 21:04:45 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-01 21:04:45 -0800 |
| commit | 6d2c10e889db596938bb7b4d3cdd42d67208439a | |
| tree | c8adec6da2f3c89e60bf30397c7ebff7ee85e288 /kernel/sched/ext.c | |
| parent | 6c26fbe8c9d3e932dce6afe2505b19b4b261cae9 | |
| parent | c04507ac500e2cc8048000c2a849588227554e06 | |
Merge tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Scalability and load-balancing improvements:
- Enable scheduler feature NEXT_BUDDY (Mel Gorman)
- Reimplement NEXT_BUDDY to align with EEVDF goals (Mel Gorman)
- Skip sched_balance_running cmpxchg when balance is not due (Tim
Chen)
- Implement generic code for architecture specific sched domain NUMA
distances (Tim Chen)
- Optimize the NUMA distances of the sched-domains builds of Intel
Granite Rapids (GNR) and Clearwater Forest (CWF) platforms (Tim
Chen)
- Implement proportional newidle balance: a randomized algorithm that
runs newidle balancing proportional to its success rate. (Peter
Zijlstra)
Scheduler infrastructure changes:
- Implement the 'sched_change' scoped_guard() pattern for the entire
scheduler (Peter Zijlstra)
- More broadly utilize the sched_change guard (Peter Zijlstra)
- Add support to pick functions to take runqueue-flags (Joel
Fernandes)
- Provide and use set_need_resched_current() (Peter Zijlstra)
Fair scheduling enhancements:
- Forfeit vruntime on yield (Fernand Sieber)
- Only update stats for allowed CPUs when looking for dst group (Adam
Li)
CPU-core scheduling enhancements:
- Optimize core cookie matching check (Fernand Sieber)
Deadline scheduler fixes:
- Only set free_cpus for online runqueues (Doug Berger)
- Fix dl_server time accounting (Peter Zijlstra)
- Fix dl_server stop condition (Peter Zijlstra)
Proxy scheduling fixes:
- Yield the donor task (Fernand Sieber)
Fixes and cleanups:
- Fix do_set_cpus_allowed() locking (Peter Zijlstra)
- Fix migrate_disable_switch() locking (Peter Zijlstra)
- Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
(Hao Jia)
- Increase sched_tick_remote timeout (Phil Auld)
- sched/deadline: Use cpumask_weight_and() in dl_bw_cpus() (Shrikanth
Hegde)
- sched/deadline: Clean up select_task_rq_dl() (Shrikanth Hegde)"
* tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits)
sched: Provide and use set_need_resched_current()
sched/fair: Proportional newidle balance
sched/fair: Small cleanup to update_newidle_cost()
sched/fair: Small cleanup to sched_balance_newidle()
sched/fair: Revert max_newidle_lb_cost bump
sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals
sched/fair: Enable scheduler feature NEXT_BUDDY
sched: Increase sched_tick_remote timeout
sched/fair: Have SD_SERIALIZE affect newidle balancing
sched/fair: Skip sched_balance_running cmpxchg when balance is not due
sched/deadline: Minor cleanup in select_task_rq_dl()
sched/deadline: Use cpumask_weight_and() in dl_bw_cpus
sched/deadline: Document dl_server
sched/deadline: Fix dl_server stop condition
sched/deadline: Fix dl_server time accounting
sched/core: Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
sched/eevdf: Fix min_vruntime vs avg_vruntime
sched/core: Add comment explaining force-idle vruntime snapshots
sched/core: Optimize core cookie matching check
sched/proxy: Yield the donor task
...
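
A note on the 'sched_change' scoped_guard() pattern mentioned in the pull message and used throughout the kernel/sched/ext.c diff below: the guard dequeues a task on scope entry and re-enqueues it with its (possibly changed) scheduling attributes on scope exit, replacing the old sched_deq_and_put_task()/sched_enq_and_set_task() pairs. As a rough orientation only, here is a minimal userspace sketch of that shape; change_guard, change_begin(), change_end() and scoped_change() are invented names for this example, not the kernel's API (the kernel builds its guards from the cleanup-attribute machinery in include/linux/cleanup.h).

```c
/*
 * Userspace sketch of a "change guard": dequeue on entry, re-enqueue on
 * exit.  Illustration only -- not the kernel's sched_change implementation.
 * Relies on the GCC/Clang cleanup attribute, like the kernel's guards.
 */
#include <stdbool.h>
#include <stdio.h>

struct task { const char *name; int class_id; bool queued; };

static void dequeue(struct task *t) { t->queued = false; printf("dequeue %s\n", t->name); }
static void enqueue(struct task *t) { t->queued = true;  printf("enqueue %s\n", t->name); }

struct change_guard { struct task *t; };

static struct change_guard change_begin(struct task *t)
{
	dequeue(t);				/* take the task off its queue */
	return (struct change_guard){ .t = t };
}

static void change_end(struct change_guard *g)
{
	enqueue(g->t);				/* put it back with the new attributes */
}

/* Poor man's scoped_guard(): run the body once, release the guard on scope exit. */
#define scoped_change(task)						\
	for (struct change_guard __g __attribute__((cleanup(change_end))) =	\
		     change_begin(task), *__once = &__g;		\
	     __once; __once = NULL)

int main(void)
{
	struct task t = { .name = "demo", .class_id = 1, .queued = true };

	scoped_change(&t) {
		/* the task is dequeued here; safe to switch its class */
		t.class_id = 2;
	}

	printf("%s: class=%d queued=%d\n", t.name, t.class_id, t.queued);
	return 0;
}
```

In the conversions in the diff, the attribute change itself (for example `p->sched_class = new_class;`) simply moves inside the guarded block, and the dequeue/enqueue flags are passed to the guard as its arguments.
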
Diffstat (limited to 'kernel/sched/ext.c')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/sched/ext.c | 132 |
1 file changed, 36 insertions, 96 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 979484dab2d3..6827689a0966 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -1474,7 +1474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags
 static void yield_task_scx(struct rq *rq)
 {
 	struct scx_sched *sch = scx_root;
-	struct task_struct *p = rq->curr;
+	struct task_struct *p = rq->donor;
 
 	if (SCX_HAS_OP(sch, yield))
 		SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL);
@@ -1485,7 +1485,7 @@ static void yield_task_scx(struct rq *rq)
 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
 {
 	struct scx_sched *sch = scx_root;
-	struct task_struct *from = rq->curr;
+	struct task_struct *from = rq->donor;
 
 	if (SCX_HAS_OP(sch, yield))
 		return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq,
@@ -2047,7 +2047,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 
 	lockdep_assert_rq_held(rq);
 	rq->scx.flags |= SCX_RQ_IN_BALANCE;
-	rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
+	rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
 
 	if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) &&
 	    unlikely(rq->scx.cpu_released)) {
@@ -2153,42 +2153,6 @@ has_tasks:
 	return true;
 }
 
-static int balance_scx(struct rq *rq, struct task_struct *prev,
-		       struct rq_flags *rf)
-{
-	int ret;
-
-	rq_unpin_lock(rq, rf);
-
-	ret = balance_one(rq, prev);
-
-#ifdef CONFIG_SCHED_SMT
-	/*
-	 * When core-sched is enabled, this ops.balance() call will be followed
-	 * by pick_task_scx() on this CPU and the SMT siblings. Balance the
-	 * siblings too.
-	 */
-	if (sched_core_enabled(rq)) {
-		const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
-		int scpu;
-
-		for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
-			struct rq *srq = cpu_rq(scpu);
-			struct task_struct *sprev = srq->curr;
-
-			WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
-			update_rq_clock(srq);
-			balance_one(srq, sprev);
-		}
-	}
-#endif
-	rq_repin_lock(rq, rf);
-
-	maybe_queue_balance_callback(rq);
-
-	return ret;
-}
-
 static void process_ddsp_deferred_locals(struct rq *rq)
 {
 	struct task_struct *p;
@@ -2368,41 +2332,23 @@ static struct task_struct *first_local_task(struct rq *rq)
 					struct task_struct, scx.dsq_list.node);
 }
 
-static struct task_struct *pick_task_scx(struct rq *rq)
+static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf)
 {
 	struct task_struct *prev = rq->curr;
+	bool keep_prev, kick_idle = false;
 	struct task_struct *p;
-	bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
-	bool kick_idle = false;
 
-	/*
-	 * WORKAROUND:
-	 *
-	 * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
-	 * have gone through balance_scx(). Unfortunately, there currently is a
-	 * bug where fair could say yes on balance() but no on pick_task(),
-	 * which then ends up calling pick_task_scx() without preceding
-	 * balance_scx().
-	 *
-	 * Keep running @prev if possible and avoid stalling from entering idle
-	 * without balancing.
-	 *
-	 * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
-	 * if pick_task_scx() is called without preceding balance_scx().
-	 */
-	if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
-		if (prev->scx.flags & SCX_TASK_QUEUED) {
-			keep_prev = true;
-		} else {
-			keep_prev = false;
-			kick_idle = true;
-		}
-	} else if (unlikely(keep_prev &&
-			    prev->sched_class != &ext_sched_class)) {
-		/*
-		 * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is
-		 * conditional on scx_enabled() and may have been skipped.
-		 */
+	rq_modified_clear(rq);
+	rq_unpin_lock(rq, rf);
+	balance_one(rq, prev);
+	rq_repin_lock(rq, rf);
+	maybe_queue_balance_callback(rq);
+	if (rq_modified_above(rq, &ext_sched_class))
+		return RETRY_TASK;
+
+	keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
+	if (unlikely(keep_prev &&
+		     prev->sched_class != &ext_sched_class)) {
 		WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED);
 		keep_prev = false;
 	}
@@ -2997,7 +2943,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p,
 				    p, p->scx.weight);
 }
 
-static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
+static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio)
 {
 }
 
@@ -3270,6 +3216,8 @@ static void scx_cgroup_unlock(void) {}
  * their current sched_class. Call them directly from sched core instead.
  */
 DEFINE_SCHED_CLASS(ext) = {
+	.queue_mask		= 1,
+
 	.enqueue_task		= enqueue_task_scx,
 	.dequeue_task		= dequeue_task_scx,
 	.yield_task		= yield_task_scx,
@@ -3277,7 +3225,6 @@ DEFINE_SCHED_CLASS(ext) = {
 
 	.wakeup_preempt		= wakeup_preempt_scx,
 
-	.balance		= balance_scx,
 	.pick_task		= pick_task_scx,
 
 	.put_prev_task		= put_prev_task_scx,
@@ -3818,11 +3765,10 @@ static void scx_bypass(bool bypass)
 		 */
 		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
 						 scx.runnable_node) {
-			struct sched_enq_and_set_ctx ctx;
-
 			/* cycling deq/enq is enough, see the function comment */
-			sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
-			sched_enq_and_set_task(&ctx);
+			scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) {
+				/* nothing */ ;
+			}
 		}
 
 		/* resched to restore ticks and idle state */
@@ -3972,22 +3918,20 @@ static void scx_disable_workfn(struct kthread_work *work)
 
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
-		struct sched_enq_and_set_ctx ctx;
 
-		if (old_class != new_class && p->se.sched_delayed)
-			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+		update_rq_clock(task_rq(p));
 
-		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+		if (old_class != new_class)
+			queue_flags |= DEQUEUE_CLASS;
 
-		p->sched_class = new_class;
-		check_class_changing(task_rq(p), p, old_class);
-
-		sched_enq_and_set_task(&ctx);
+		scoped_guard (sched_change, p, queue_flags) {
+			p->sched_class = new_class;
+		}
 
-		check_class_changed(task_rq(p), p, old_class, p->prio);
 		scx_exit_task(p);
 	}
 	scx_task_iter_stop(&sti);
@@ -4751,26 +4695,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 	percpu_down_write(&scx_fork_rwsem);
 	scx_task_iter_start(&sti);
 	while ((p = scx_task_iter_next_locked(&sti))) {
+		unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
 		const struct sched_class *old_class = p->sched_class;
 		const struct sched_class *new_class =
 			__setscheduler_class(p->policy, p->prio);
-		struct sched_enq_and_set_ctx ctx;
 
 		if (!tryget_task_struct(p))
 			continue;
 
-		if (old_class != new_class && p->se.sched_delayed)
-			dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-
-		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
+		if (old_class != new_class)
+			queue_flags |= DEQUEUE_CLASS;
 
-		p->scx.slice = SCX_SLICE_DFL;
-		p->sched_class = new_class;
-		check_class_changing(task_rq(p), p, old_class);
-
-		sched_enq_and_set_task(&ctx);
+		scoped_guard (sched_change, p, queue_flags) {
+			p->scx.slice = SCX_SLICE_DFL;
+			p->sched_class = new_class;
+		}
 
-		check_class_changed(task_rq(p), p, old_class, p->prio);
 		put_task_struct(p);
 	}
 	scx_task_iter_stop(&sti);
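
For context on the pick_task_scx() change above: the separate .balance callback (balance_scx()) is gone, pick_task_scx() now runs balance_one() itself, and it returns RETRY_TASK when rq_modified_above() reports that the balance pass touched a higher-priority class, so that sched core restarts class selection. The toy program below sketches only that caller-side retry contract; apart from the RETRY_TASK idea, every name in it is invented for illustration and it is not scheduler core code.

```c
/*
 * Toy model of "pick may return RETRY_TASK after balancing": the lower
 * class balances, moves work to a higher class, and asks the caller to
 * restart class selection.  Illustration only.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct task { const char *name; };

#define RETRY_TASK ((struct task *)-1)	/* sentinel, mirroring the kernel's idea */

struct toy_class {
	const char *name;
	struct task *(*pick)(void);
};

static struct task task_a = { "A" };	/* ends up runnable in "fair" */
static struct task task_b = { "B" };	/* runnable in "ext" */

static bool first_attempt = true;

static struct task *pick_fair(void)
{
	/* Nothing to run until ext's balancing promotes work here. */
	return first_attempt ? NULL : &task_a;
}

static struct task *pick_ext(void)
{
	if (first_attempt) {
		first_attempt = false;
		return RETRY_TASK;	/* a higher class was modified: restart */
	}
	return &task_b;
}

static struct toy_class classes[] = {
	{ "fair", pick_fair },		/* higher priority, tried first */
	{ "ext",  pick_ext  },
};

int main(void)
{
again:
	for (size_t i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		struct task *p = classes[i].pick();

		if (p == RETRY_TASK)
			goto again;	/* restart from the highest class */
		if (p) {
			printf("picked task %s from class %s\n", p->name, classes[i].name);
			return 0;
		}
	}
	printf("idle\n");
	return 0;
}
```

The other ext.c changes in this diff follow the same themes as the pull message: rq->curr becomes rq->donor in the yield paths (proxy scheduling's "yield the donor task"), prio_changed_scx() and the new .queue_mask field track sched-core interface changes, and the class-switch loops in scx_enable()/scx_disable_workfn() are converted to the sched_change guard sketched earlier.
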