summaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 21:04:45 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-12-01 21:04:45 -0800
commit6d2c10e889db596938bb7b4d3cdd42d67208439a (patch)
treec8adec6da2f3c89e60bf30397c7ebff7ee85e288 /kernel/sched/core.c
parent6c26fbe8c9d3e932dce6afe2505b19b4b261cae9 (diff)
parentc04507ac500e2cc8048000c2a849588227554e06 (diff)
Merge tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar: "Scalability and load-balancing improvements: - Enable scheduler feature NEXT_BUDDY (Mel Gorman) - Reimplement NEXT_BUDDY to align with EEVDF goals (Mel Gorman) - Skip sched_balance_running cmpxchg when balance is not due (Tim Chen) - Implement generic code for architecture specific sched domain NUMA distances (Tim Chen) - Optimize the NUMA distances of the sched-domains builds of Intel Granite Rapids (GNR) and Clearwater Forest (CWF) platforms (Tim Chen) - Implement proportional newidle balance: a randomized algorithm that runs newidle balancing proportional to its success rate. (Peter Zijlstra) Scheduler infrastructure changes: - Implement the 'sched_change' scoped_guard() pattern for the entire scheduler (Peter Zijlstra) - More broadly utilize the sched_change guard (Peter Zijlstra) - Add support to pick functions to take runqueue-flags (Joel Fernandes) - Provide and use set_need_resched_current() (Peter Zijlstra) Fair scheduling enhancements: - Forfeit vruntime on yield (Fernand Sieber) - Only update stats for allowed CPUs when looking for dst group (Adam Li) CPU-core scheduling enhancements: - Optimize core cookie matching check (Fernand Sieber) Deadline scheduler fixes: - Only set free_cpus for online runqueues (Doug Berger) - Fix dl_server time accounting (Peter Zijlstra) - Fix dl_server stop condition (Peter Zijlstra) Proxy scheduling fixes: - Yield the donor task (Fernand Sieber) Fixes and cleanups: - Fix do_set_cpus_allowed() locking (Peter Zijlstra) - Fix migrate_disable_switch() locking (Peter Zijlstra) - Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked() (Hao Jia) - Increase sched_tick_remote timeout (Phil Auld) - sched/deadline: Use cpumask_weight_and() in dl_bw_cpus() (Shrikanth Hegde) - sched/deadline: Clean up select_task_rq_dl() (Shrikanth Hegde)" * tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits) sched: Provide and use set_need_resched_current() sched/fair: Proportional newidle balance sched/fair: Small cleanup to update_newidle_cost() sched/fair: Small cleanup to sched_balance_newidle() sched/fair: Revert max_newidle_lb_cost bump sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals sched/fair: Enable scheduler feature NEXT_BUDDY sched: Increase sched_tick_remote timeout sched/fair: Have SD_SERIALIZE affect newidle balancing sched/fair: Skip sched_balance_running cmpxchg when balance is not due sched/deadline: Minor cleanup in select_task_rq_dl() sched/deadline: Use cpumask_weight_and() in dl_bw_cpus sched/deadline: Document dl_server sched/deadline: Fix dl_server stop condition sched/deadline: Fix dl_server time accounting sched/core: Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked() sched/eevdf: Fix min_vruntime vs avg_vruntime sched/core: Add comment explaining force-idle vruntime snapshots sched/core: Optimize core cookie matching check sched/proxy: Yield the donor task ...
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c400
1 files changed, 162 insertions, 238 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f754a60de848..0c4ff93eeb78 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+DEFINE_PER_CPU(struct rnd_state, sched_rnd_state);
#ifdef CONFIG_SCHED_PROXY_EXEC
DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec);
@@ -583,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state);
*
* p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
*
- * is set by activate_task() and cleared by deactivate_task(), under
- * rq->lock. Non-zero indicates the task is runnable, the special
+ * is set by activate_task() and cleared by deactivate_task()/block_task(),
+ * under rq->lock. Non-zero indicates the task is runnable, the special
* ON_RQ_MIGRATING state is used for migration without holding both
* rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
*
@@ -2089,6 +2090,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
*/
uclamp_rq_inc(rq, p, flags);
+ rq->queue_mask |= p->sched_class->queue_mask;
p->sched_class->enqueue_task(rq, p, flags);
psi_enqueue(p, flags);
@@ -2121,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
* and mark the task ->sched_delayed.
*/
uclamp_rq_dec(rq, p);
+ rq->queue_mask |= p->sched_class->queue_mask;
return p->sched_class->dequeue_task(rq, p, flags);
}
@@ -2169,37 +2172,6 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
-/*
- * ->switching_to() is called with the pi_lock and rq_lock held and must not
- * mess with locking.
- */
-void check_class_changing(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class)
-{
- if (prev_class != p->sched_class && p->sched_class->switching_to)
- p->sched_class->switching_to(rq, p);
-}
-
-/*
- * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
- * use the balance_callback list if you want balancing.
- *
- * this means any call to check_class_changed() must be followed by a call to
- * balance_callback().
- */
-void check_class_changed(struct rq *rq, struct task_struct *p,
- const struct sched_class *prev_class,
- int oldprio)
-{
- if (prev_class != p->sched_class) {
- if (prev_class->switched_from)
- prev_class->switched_from(rq, p);
-
- p->sched_class->switched_to(rq, p);
- } else if (oldprio != p->prio || dl_task(p))
- p->sched_class->prio_changed(rq, p, oldprio);
-}
-
void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags)
{
struct task_struct *donor = rq->donor;
@@ -2362,7 +2334,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx);
static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
{
@@ -2377,10 +2349,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
if (p->cpus_ptr != &p->cpus_mask)
return;
- /*
- * Violates locking rules! See comment in __do_set_cpus_allowed().
- */
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
}
void ___migrate_enable(void)
@@ -2613,7 +2583,8 @@ static int migration_cpu_stop(void *data)
*/
WARN_ON_ONCE(!pending->stop_pending);
preempt_disable();
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
&pending->arg, &pending->stop_work);
preempt_enable();
@@ -2622,7 +2593,8 @@ static int migration_cpu_stop(void *data)
out:
if (pending)
pending->stop_pending = false;
- task_rq_unlock(rq, p, &rf);
+ rq_unlock(rq, &rf);
+ raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
if (complete)
complete_all(&pending->done);
@@ -2693,56 +2665,19 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
}
static void
-__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
+do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- struct rq *rq = task_rq(p);
- bool queued, running;
-
- /*
- * This here violates the locking rules for affinity, since we're only
- * supposed to change these variables while holding both rq->lock and
- * p->pi_lock.
- *
- * HOWEVER, it magically works, because ttwu() is the only code that
- * accesses these variables under p->pi_lock and only does so after
- * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
- * before finish_task().
- *
- * XXX do further audits, this smells like something putrid.
- */
- if (ctx->flags & SCA_MIGRATE_DISABLE)
- WARN_ON_ONCE(!p->on_cpu);
- else
- lockdep_assert_held(&p->pi_lock);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued) {
- /*
- * Because __kthread_bind() calls this on blocked tasks without
- * holding rq->lock.
- */
- lockdep_assert_rq_held(rq);
- dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE) {
+ p->sched_class->set_cpus_allowed(p, ctx);
+ mm_set_cpus_allowed(p->mm, ctx->new_mask);
}
- if (running)
- put_prev_task(rq, p);
-
- p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
}
/*
* Used for kthread_bind() and select_fallback_rq(), in both cases the user
* affinity (if any) should be destroyed too.
*/
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask)
{
struct affinity_context ac = {
.new_mask = new_mask,
@@ -2754,7 +2689,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
struct rcu_head rcu;
};
- __do_set_cpus_allowed(p, &ac);
+ scoped_guard (__task_rq_lock, p)
+ do_set_cpus_allowed(p, &ac);
/*
* Because this is called with p->pi_lock held, it is not possible
@@ -2792,7 +2728,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
* Use pi_lock to protect content of user_cpus_ptr
*
* Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent
- * do_set_cpus_allowed().
+ * set_cpus_allowed_force().
*/
raw_spin_lock_irqsave(&src->pi_lock, flags);
if (src->user_cpus_ptr) {
@@ -3064,8 +3000,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
unsigned int dest_cpu;
int ret = 0;
- update_rq_clock(rq);
-
if (kthread || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs,
@@ -3120,7 +3054,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
goto out;
}
- __do_set_cpus_allowed(p, ctx);
+ do_set_cpus_allowed(p, ctx);
return affine_move_task(rq, p, rf, dest_cpu, ctx->flags);
@@ -3529,13 +3463,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
fallthrough;
case possible:
- /*
- * XXX When called from select_task_rq() we only
- * hold p->pi_lock and again violate locking order.
- *
- * More yuck to audit.
- */
- do_set_cpus_allowed(p, task_cpu_fallback_mask(p));
+ set_cpus_allowed_force(p, task_cpu_fallback_mask(p));
state = fail;
break;
case fail:
@@ -3777,7 +3705,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
ttwu_do_wakeup(p);
ret = 1;
}
- __task_rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
return ret;
}
@@ -4231,7 +4159,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* __schedule(). See the comment for smp_mb__after_spinlock().
*
* Form a control-dep-acquire with p->on_rq == 0 above, to ensure
- * schedule()'s deactivate_task() has 'happened' and p will no longer
+ * schedule()'s block_task() has 'happened' and p will no longer
* care about it's own p->state. See the comment in __schedule().
*/
smp_acquire__after_ctrl_dep();
@@ -4370,7 +4298,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg)
ret = func(p, arg);
if (rq)
- rq_unlock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
return ret;
@@ -5692,7 +5620,7 @@ static void sched_tick_remote(struct work_struct *work)
* reasonable amount of time.
*/
u64 delta = rq_clock_task(rq) - curr->se.exec_start;
- WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30);
}
curr->sched_class->task_tick(rq, curr, 0);
@@ -5916,19 +5844,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
-#ifdef CONFIG_SCHED_CLASS_EXT
- /*
- * SCX requires a balance() call before every pick_task() including when
- * waking up from SCHED_IDLE. If @start_class is below SCX, start from
- * SCX instead. Also, set a flag to detect missing balance() call.
- */
- if (scx_enabled()) {
- rq->scx.flags |= SCX_RQ_BAL_PENDING;
- if (sched_class_above(&ext_sched_class, start_class))
- start_class = &ext_sched_class;
- }
-#endif
-
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
@@ -5972,7 +5887,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
/* Assume the next prioritized class is idle_sched_class */
if (!p) {
- p = pick_task_idle(rq);
+ p = pick_task_idle(rq, rf);
put_prev_set_next_task(rq, prev, p);
}
@@ -5984,11 +5899,15 @@ restart:
for_each_active_class(class) {
if (class->pick_next_task) {
- p = class->pick_next_task(rq, prev);
+ p = class->pick_next_task(rq, prev, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
if (p)
return p;
} else {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart;
if (p) {
put_prev_set_next_task(rq, prev, p);
return p;
@@ -6018,7 +5937,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
return a->core_cookie == b->core_cookie;
}
-static inline struct task_struct *pick_task(struct rq *rq)
+/*
+ * Careful; this can return RETRY_TASK, it does not include the retry-loop
+ * itself due to the whole SMT pick retry thing below.
+ */
+static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf)
{
const struct sched_class *class;
struct task_struct *p;
@@ -6026,7 +5949,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
rq->dl_server = NULL;
for_each_active_class(class) {
- p = class->pick_task(rq);
+ p = class->pick_task(rq, rf);
if (p)
return p;
}
@@ -6041,7 +5964,7 @@ static void queue_core_balance(struct rq *rq);
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
- struct task_struct *next, *p, *max = NULL;
+ struct task_struct *next, *p, *max;
const struct cpumask *smt_mask;
bool fi_before = false;
bool core_clock_updated = (rq == rq->core);
@@ -6126,7 +6049,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* and there are no cookied tasks running on siblings.
*/
if (!need_sync) {
- next = pick_task(rq);
+restart_single:
+ next = pick_task(rq, rf);
+ if (unlikely(next == RETRY_TASK))
+ goto restart_single;
if (!next->core_cookie) {
rq->core_pick = NULL;
rq->core_dl_server = NULL;
@@ -6146,6 +6072,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*
* Tie-break prio towards the current CPU
*/
+restart_multi:
+ max = NULL;
for_each_cpu_wrap(i, smt_mask, cpu) {
rq_i = cpu_rq(i);
@@ -6157,7 +6085,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (i != cpu && (rq_i != rq->core || !core_clock_updated))
update_rq_clock(rq_i);
- rq_i->core_pick = p = pick_task(rq_i);
+ p = pick_task(rq_i, rf);
+ if (unlikely(p == RETRY_TASK))
+ goto restart_multi;
+
+ rq_i->core_pick = p;
rq_i->core_dl_server = rq_i->dl_server;
if (!max || prio_less(max, p, fi_before))
@@ -6179,7 +6111,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (cookie)
p = sched_core_find(rq_i, cookie);
if (!p)
- p = idle_sched_class.pick_task(rq_i);
+ p = idle_sched_class.pick_task(rq_i, rf);
}
rq_i->core_pick = p;
@@ -6812,6 +6744,7 @@ static void __sched notrace __schedule(int sched_mode)
local_irq_disable();
rcu_note_context_switch(preempt);
+ migrate_disable_switch(rq, prev);
/*
* Make sure that signal_pending_state()->signal_pending() below
@@ -6918,7 +6851,6 @@ keep_resched:
*/
++*switch_count;
- migrate_disable_switch(rq, prev);
psi_account_irqtime(rq, prev, next);
psi_sched_switch(prev, next, !task_on_rq_queued(prev) ||
prev->se.sched_delayed);
@@ -7326,7 +7258,7 @@ void rt_mutex_post_schedule(void)
*/
void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
{
- int prio, oldprio, queued, running, queue_flag =
+ int prio, oldprio, queue_flag =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
const struct sched_class *prev_class, *next_class;
struct rq_flags rf;
@@ -7388,64 +7320,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
prev_class = p->sched_class;
next_class = __setscheduler_class(p->policy, prio);
- if (prev_class != next_class && p->se.sched_delayed)
- dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
-
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
- if (queued)
- dequeue_task(rq, p, queue_flag);
- if (running)
- put_prev_task(rq, p);
+ if (prev_class != next_class)
+ queue_flag |= DEQUEUE_CLASS;
- /*
- * Boosting condition are:
- * 1. -rt task is running and holds mutex A
- * --> -dl task blocks on mutex A
- *
- * 2. -dl task is running and holds mutex A
- * --> -dl task blocks on mutex A and could preempt the
- * running task
- */
- if (dl_prio(prio)) {
- if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_prio(pi_task->prio) &&
- dl_entity_preempt(&pi_task->dl, &p->dl))) {
- p->dl.pi_se = pi_task->dl.pi_se;
- queue_flag |= ENQUEUE_REPLENISH;
+ scoped_guard (sched_change, p, queue_flag) {
+ /*
+ * Boosting condition are:
+ * 1. -rt task is running and holds mutex A
+ * --> -dl task blocks on mutex A
+ *
+ * 2. -dl task is running and holds mutex A
+ * --> -dl task blocks on mutex A and could preempt the
+ * running task
+ */
+ if (dl_prio(prio)) {
+ if (!dl_prio(p->normal_prio) ||
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ p->dl.pi_se = pi_task->dl.pi_se;
+ scope->flags |= ENQUEUE_REPLENISH;
+ } else {
+ p->dl.pi_se = &p->dl;
+ }
+ } else if (rt_prio(prio)) {
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (oldprio < prio)
+ scope->flags |= ENQUEUE_HEAD;
} else {
- p->dl.pi_se = &p->dl;
+ if (dl_prio(oldprio))
+ p->dl.pi_se = &p->dl;
+ if (rt_prio(oldprio))
+ p->rt.timeout = 0;
}
- } else if (rt_prio(prio)) {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (oldprio < prio)
- queue_flag |= ENQUEUE_HEAD;
- } else {
- if (dl_prio(oldprio))
- p->dl.pi_se = &p->dl;
- if (rt_prio(oldprio))
- p->rt.timeout = 0;
- }
- p->sched_class = next_class;
- p->prio = prio;
-
- check_class_changing(rq, p, prev_class);
-
- if (queued)
- enqueue_task(rq, p, queue_flag);
- if (running)
- set_next_task(rq, p);
-
- check_class_changed(rq, p, prev_class, oldprio);
+ p->sched_class = next_class;
+ p->prio = prio;
+ }
out_unlock:
/* Avoid rq from going away on us: */
preempt_disable();
rq_unpin_lock(rq, &rf);
__balance_callbacks(rq);
- raw_spin_rq_unlock(rq);
+ rq_repin_lock(rq, &rf);
+ __task_rq_unlock(rq, p, &rf);
preempt_enable();
}
@@ -8084,26 +8003,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
*/
void sched_setnuma(struct task_struct *p, int nid)
{
- bool queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(p, &rf);
- queued = task_on_rq_queued(p);
- running = task_current_donor(rq, p);
-
- if (queued)
- dequeue_task(rq, p, DEQUEUE_SAVE);
- if (running)
- put_prev_task(rq, p);
-
- p->numa_preferred_nid = nid;
-
- if (queued)
- enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
- if (running)
- set_next_task(rq, p);
- task_rq_unlock(rq, p, &rf);
+ guard(task_rq_lock)(p);
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
+ p->numa_preferred_nid = nid;
}
#endif /* CONFIG_NUMA_BALANCING */
@@ -8141,18 +8043,15 @@ static int __balance_push_cpu_stop(void *arg)
struct rq_flags rf;
int cpu;
- raw_spin_lock_irq(&p->pi_lock);
- rq_lock(rq, &rf);
-
- update_rq_clock(rq);
-
- if (task_rq(p) == rq && task_on_rq_queued(p)) {
+ scoped_guard (raw_spinlock_irq, &p->pi_lock) {
cpu = select_fallback_rq(rq->cpu, p);
- rq = __migrate_task(rq, &rf, p, cpu);
- }
- rq_unlock(rq, &rf);
- raw_spin_unlock_irq(&p->pi_lock);
+ rq_lock(rq, &rf);
+ update_rq_clock(rq);
+ if (task_rq(p) == rq && task_on_rq_queued(p))
+ rq = __migrate_task(rq, &rf, p, cpu);
+ rq_unlock(rq, &rf);
+ }
put_task_struct(p);
@@ -8591,6 +8490,8 @@ void __init sched_init_smp(void)
{
sched_init_numa(NUMA_NO_NODE);
+ prandom_init_once(&sched_rnd_state);
+
/*
* There's no userspace yet to cause hotplug operations; hence all the
* CPU masks are stable and all blatant races in the below code cannot
@@ -9207,38 +9108,23 @@ static void sched_change_group(struct task_struct *tsk)
*/
void sched_move_task(struct task_struct *tsk, bool for_autogroup)
{
- int queued, running, queue_flags =
- DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
+ bool resched = false;
struct rq *rq;
CLASS(task_rq_lock, rq_guard)(tsk);
rq = rq_guard.rq;
- update_rq_clock(rq);
-
- running = task_current_donor(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, queue_flags);
- if (running)
- put_prev_task(rq, tsk);
-
- sched_change_group(tsk);
- if (!for_autogroup)
- scx_cgroup_move_task(tsk);
+ scoped_guard (sched_change, tsk, queue_flags) {
+ sched_change_group(tsk);
+ if (!for_autogroup)
+ scx_cgroup_move_task(tsk);
+ if (scope->running)
+ resched = true;
+ }
- if (queued)
- enqueue_task(rq, tsk, queue_flags);
- if (running) {
- set_next_task(rq, tsk);
- /*
- * After changing group, the running task may have joined a
- * throttled one but it's still the running task. Trigger a
- * resched to make sure that task can still run.
- */
+ if (resched)
resched_curr(rq);
- }
}
static struct cgroup_subsys_state *
@@ -10894,37 +10780,75 @@ void sched_mm_cid_fork(struct task_struct *t)
}
#endif /* CONFIG_SCHED_MM_CID */
-#ifdef CONFIG_SCHED_CLASS_EXT
-void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
- struct sched_enq_and_set_ctx *ctx)
+static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
+
+struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags)
{
+ struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx);
struct rq *rq = task_rq(p);
+ /*
+ * Must exclusively use matched flags since this is both dequeue and
+ * enqueue.
+ */
+ WARN_ON_ONCE(flags & 0xFFFF0000);
+
lockdep_assert_rq_held(rq);
- *ctx = (struct sched_enq_and_set_ctx){
+ if (!(flags & DEQUEUE_NOCLOCK)) {
+ update_rq_clock(rq);
+ flags |= DEQUEUE_NOCLOCK;
+ }
+
+ if (flags & DEQUEUE_CLASS) {
+ if (p->sched_class->switching_from)
+ p->sched_class->switching_from(rq, p);
+ }
+
+ *ctx = (struct sched_change_ctx){
.p = p,
- .queue_flags = queue_flags,
+ .flags = flags,
.queued = task_on_rq_queued(p),
- .running = task_current(rq, p),
+ .running = task_current_donor(rq, p),
};
- update_rq_clock(rq);
+ if (!(flags & DEQUEUE_CLASS)) {
+ if (p->sched_class->get_prio)
+ ctx->prio = p->sched_class->get_prio(rq, p);
+ else
+ ctx->prio = p->prio;
+ }
+
if (ctx->queued)
- dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ dequeue_task(rq, p, flags);
if (ctx->running)
put_prev_task(rq, p);
+
+ if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from)
+ p->sched_class->switched_from(rq, p);
+
+ return ctx;
}
-void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+void sched_change_end(struct sched_change_ctx *ctx)
{
- struct rq *rq = task_rq(ctx->p);
+ struct task_struct *p = ctx->p;
+ struct rq *rq = task_rq(p);
lockdep_assert_rq_held(rq);
+ if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+
if (ctx->queued)
- enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ enqueue_task(rq, p, ctx->flags);
if (ctx->running)
- set_next_task(rq, ctx->p);
+ set_next_task(rq, p);
+
+ if (ctx->flags & ENQUEUE_CLASS) {
+ if (p->sched_class->switched_to)
+ p->sched_class->switched_to(rq, p);
+ } else {
+ p->sched_class->prio_changed(rq, p, ctx->prio);
+ }
}
-#endif /* CONFIG_SCHED_CLASS_EXT */