author     Linus Torvalds <torvalds@linux-foundation.org>  2025-12-01 21:04:45 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2025-12-01 21:04:45 -0800
commit     6d2c10e889db596938bb7b4d3cdd42d67208439a (patch)
tree       c8adec6da2f3c89e60bf30397c7ebff7ee85e288 /arch
parent     6c26fbe8c9d3e932dce6afe2505b19b4b261cae9 (diff)
parent     c04507ac500e2cc8048000c2a849588227554e06 (diff)
Merge tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 "Scalability and load-balancing improvements:

   - Enable scheduler feature NEXT_BUDDY (Mel Gorman)

   - Reimplement NEXT_BUDDY to align with EEVDF goals (Mel Gorman)

   - Skip sched_balance_running cmpxchg when balance is not due (Tim Chen)

   - Implement generic code for architecture specific sched domain NUMA
     distances (Tim Chen)

   - Optimize the NUMA distances of the sched-domains builds of Intel
     Granite Rapids (GNR) and Clearwater Forest (CWF) platforms (Tim Chen)

   - Implement proportional newidle balance: a randomized algorithm that
     runs newidle balancing proportional to its success rate (Peter Zijlstra)

  Scheduler infrastructure changes:

   - Implement the 'sched_change' scoped_guard() pattern for the entire
     scheduler (Peter Zijlstra)

   - More broadly utilize the sched_change guard (Peter Zijlstra)

   - Add support for pick functions to take runqueue flags (Joel Fernandes)

   - Provide and use set_need_resched_current() (Peter Zijlstra)

  Fair scheduling enhancements:

   - Forfeit vruntime on yield (Fernand Sieber)

   - Only update stats for allowed CPUs when looking for the dst group
     (Adam Li)

  CPU-core scheduling enhancements:

   - Optimize the core cookie matching check (Fernand Sieber)

  Deadline scheduler fixes:

   - Only set free_cpus for online runqueues (Doug Berger)

   - Fix dl_server time accounting (Peter Zijlstra)

   - Fix dl_server stop condition (Peter Zijlstra)

  Proxy scheduling fixes:

   - Yield the donor task (Fernand Sieber)

  Fixes and cleanups:

   - Fix do_set_cpus_allowed() locking (Peter Zijlstra)

   - Fix migrate_disable_switch() locking (Peter Zijlstra)

   - Remove a double update_rq_clock() in __set_cpus_allowed_ptr_locked()
     (Hao Jia)

   - Increase the sched_tick_remote timeout (Phil Auld)

   - sched/deadline: Use cpumask_weight_and() in dl_bw_cpus() (Shrikanth Hegde)

   - sched/deadline: Clean up select_task_rq_dl() (Shrikanth Hegde)"

* tag 'sched-core-2025-12-01' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (44 commits)
  sched: Provide and use set_need_resched_current()
  sched/fair: Proportional newidle balance
  sched/fair: Small cleanup to update_newidle_cost()
  sched/fair: Small cleanup to sched_balance_newidle()
  sched/fair: Revert max_newidle_lb_cost bump
  sched/fair: Reimplement NEXT_BUDDY to align with EEVDF goals
  sched/fair: Enable scheduler feature NEXT_BUDDY
  sched: Increase sched_tick_remote timeout
  sched/fair: Have SD_SERIALIZE affect newidle balancing
  sched/fair: Skip sched_balance_running cmpxchg when balance is not due
  sched/deadline: Minor cleanup in select_task_rq_dl()
  sched/deadline: Use cpumask_weight_and() in dl_bw_cpus
  sched/deadline: Document dl_server
  sched/deadline: Fix dl_server stop condition
  sched/deadline: Fix dl_server time accounting
  sched/core: Remove double update_rq_clock() in __set_cpus_allowed_ptr_locked()
  sched/eevdf: Fix min_vruntime vs avg_vruntime
  sched/core: Add comment explaining force-idle vruntime snapshots
  sched/core: Optimize core cookie matching check
  sched/proxy: Yield the donor task
  ...
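As background for the 'sched_change' scoped_guard() pattern named above, here is a minimal usage sketch. It assumes the guard class follows the scoped_guard() convention from <linux/cleanup.h>; the DEQUEUE_SAVE flag and the body are illustrative, not the exact upstream conversion:

	/*
	 * Sketch only: a sched_change guard of this shape would dequeue the
	 * task on scope entry and re-enqueue it on scope exit, so scheduling
	 * attributes can be changed while the task is off the runqueue.
	 */
	scoped_guard (sched_change, p, DEQUEUE_SAVE) {
		p->static_prio = NICE_TO_PRIO(nice);
	}
	/* scope exit: p is enqueued again and preemption is re-evaluated */

The appeal of the guard form is that the dequeue/enqueue pairing can no longer be broken by an early return between the two halves.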
Diffstat (limited to 'arch')
-rw-r--r--  arch/s390/mm/pfault.c             3
-rw-r--r--  arch/x86/include/asm/topology.h   2
-rw-r--r--  arch/x86/kernel/smpboot.c        70
3 files changed, 73 insertions(+), 2 deletions(-)
diff --git a/arch/s390/mm/pfault.c b/arch/s390/mm/pfault.c
index e6175d75e4b0..2f829448c719 100644
--- a/arch/s390/mm/pfault.c
+++ b/arch/s390/mm/pfault.c
@@ -199,8 +199,7 @@ block:
* return to userspace schedule() to block.
*/
__set_current_state(TASK_UNINTERRUPTIBLE);
- set_tsk_need_resched(tsk);
- set_preempt_need_resched();
+ set_need_resched_current();
}
}
out:
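The helper introduced by this hunk folds together exactly the two calls it replaces. A minimal sketch of such a helper, inferred from this conversion (the irqs-disabled assertion is an assumption, not shown in this diff):

	/* Sketch inferred from the conversion above: flag the current task
	 * for rescheduling and arm the preempt-need-resched fold in one call. */
	static inline void set_need_resched_current(void)
	{
		lockdep_assert_irqs_disabled();	/* assumed precondition */
		set_tsk_need_resched(current);
		set_preempt_need_resched();
	}

Note that the old code passed tsk to set_tsk_need_resched(); the conversion is valid presumably because tsk is the current task on this path.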
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21041898157a..df943881f15b 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -325,4 +325,6 @@ static inline void freq_invariance_set_perf_ratio(u64 ratio, bool turbo_disabled
extern void arch_scale_freq_tick(void);
#define arch_scale_freq_tick arch_scale_freq_tick

+extern int arch_sched_node_distance(int from, int to);
+
#endif /* _ASM_X86_TOPOLOGY_H */
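The extern added above is the x86 override hook for the generic sched-domains code mentioned in the merge log. The generic side is not part of this diff; a plausible fallback, assuming the kernel's usual __weak-override convention, would be:

	/* Assumed shape of the generic fallback (e.g. in kernel/sched/topology.c):
	 * default to the firmware SLIT distance; architectures override it. */
	int __weak arch_sched_node_distance(int from, int to)
	{
		return node_distance(from, to);
	}

The x86 definition in smpboot.c below would then replace this default at link time.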
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index a4ba735842a8..c2107047dc14 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -515,6 +515,76 @@ static void __init build_sched_topology(void)
set_sched_topology(topology);
}
+#ifdef CONFIG_NUMA
+static int sched_avg_remote_distance;
+static int avg_remote_numa_distance(void)
+{
+ int i, j;
+ int distance, nr_remote, total_distance;
+
+ if (sched_avg_remote_distance > 0)
+ return sched_avg_remote_distance;
+
+ nr_remote = 0;
+ total_distance = 0;
+ for_each_node_state(i, N_CPU) {
+ for_each_node_state(j, N_CPU) {
+ distance = node_distance(i, j);
+
+ if (distance >= REMOTE_DISTANCE) {
+ nr_remote++;
+ total_distance += distance;
+ }
+ }
+ }
+ if (nr_remote)
+ sched_avg_remote_distance = total_distance / nr_remote;
+ else
+ sched_avg_remote_distance = REMOTE_DISTANCE;
+
+ return sched_avg_remote_distance;
+}
+
+int arch_sched_node_distance(int from, int to)
+{
+ int d = node_distance(from, to);
+
+ switch (boot_cpu_data.x86_vfm) {
+ case INTEL_GRANITERAPIDS_X:
+ case INTEL_ATOM_DARKMONT_X:
+
+ if (!x86_has_numa_in_package || topology_max_packages() == 1 ||
+ d < REMOTE_DISTANCE)
+ return d;
+
+	/*
+	 * With SNC enabled there can be several distinct remote NUMA
+	 * node distances, which creates extra NUMA domain levels that
+	 * mix local nodes with only a subset of the remote nodes.
+	 *
+	 * For the purpose of building sched domains, drop the finer
+	 * distance distinctions between NUMA nodes in a remote package
+	 * and put all of that package's nodes in the same sched group.
+	 * This simplifies the NUMA domains and avoids the extra levels
+	 * spanning local nodes and partial remote packages.
+	 *
+	 * GNR and CWF systems are not expected to have more than 2
+	 * packages or more than 2 hops between packages. A single
+	 * average remote distance would not be appropriate with more
+	 * than 2 packages, since the average distance to each remote
+	 * package could differ.
+	 */
+ WARN_ONCE(topology_max_packages() > 2,
+ "sched: Expect only up to 2 packages for GNR or CWF, "
+ "but saw %d packages when building sched domains.",
+ topology_max_packages());
+
+ d = avg_remote_numa_distance();
+ }
+ return d;
+}
+#endif /* CONFIG_NUMA */
+
void set_cpu_sibling_map(int cpu)
{
bool has_smt = __max_threads_per_core > 1;
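To make the arithmetic in avg_remote_numa_distance() concrete, here is a small standalone sketch using an assumed 4-node, 2-package SNC distance table (the values are illustrative, not from real hardware):

	#include <stdio.h>

	#define REMOTE_DISTANCE 20	/* same threshold value the kernel uses */

	int main(void)
	{
		/* Assumed SLIT: nodes 0-1 in package A, nodes 2-3 in package B. */
		static const int dist[4][4] = {
			{ 10, 12, 21, 24 },
			{ 12, 10, 24, 21 },
			{ 21, 24, 10, 12 },
			{ 24, 21, 12, 10 },
		};
		int nr_remote = 0, total = 0;

		for (int i = 0; i < 4; i++) {
			for (int j = 0; j < 4; j++) {
				if (dist[i][j] >= REMOTE_DISTANCE) {
					nr_remote++;
					total += dist[i][j];
				}
			}
		}

		/* 8 remote pairs, total 4 * (21 + 24) = 180 -> prints 22 */
		printf("avg remote distance: %d\n", total / nr_remote);
		return 0;
	}

With this table the two distinct cross-package distances (21 and 24) collapse to the single value 22, which is exactly the simplification the GNR/CWF quirk is after: one sched group per remote package instead of one extra NUMA level per distinct remote distance.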