diff options
Diffstat (limited to 'kernel')
79 files changed, 2899 insertions, 1312 deletions
diff --git a/kernel/Kconfig.kexec b/kernel/Kconfig.kexec index 422270d64820..54e581072617 100644 --- a/kernel/Kconfig.kexec +++ b/kernel/Kconfig.kexec @@ -109,6 +109,15 @@ config KEXEC_HANDOVER to keep data or state alive across the kexec. For this to work, both source and target kernels need to have this option enabled. +config KEXEC_HANDOVER_DEBUG + bool "Enable Kexec Handover debug checks" + depends on KEXEC_HANDOVER + help + This option enables extra sanity checks for the Kexec Handover + subsystem. Since, KHO performance is crucial in live update + scenarios and the extra code might be adding overhead it is + only optionally enabled. + config CRASH_DUMP bool "kernel crash dumps" default ARCH_DEFAULT_CRASH_DUMP diff --git a/kernel/Makefile b/kernel/Makefile index df3dd8291bb6..9fe722305c9b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -83,6 +83,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o obj-$(CONFIG_KEXEC_ELF) += kexec_elf.o obj-$(CONFIG_KEXEC_HANDOVER) += kexec_handover.o +obj-$(CONFIG_KEXEC_HANDOVER_DEBUG) += kexec_handover_debug.o obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup/ diff --git a/kernel/acct.c b/kernel/acct.c index 61630110e29d..2a2b3c874acd 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -520,26 +520,23 @@ static void fill_ac(struct bsd_acct_struct *acct) static void acct_write_process(struct bsd_acct_struct *acct) { struct file *file = acct->file; - const struct cred *cred; acct_t *ac = &acct->ac; /* Perform file operations on behalf of whoever enabled accounting */ - cred = override_creds(file->f_cred); - - /* - * First check to see if there is enough free_space to continue - * the process accounting system. Then get freeze protection. If - * the fs is frozen, just skip the write as we could deadlock - * the system otherwise. - */ - if (check_free_space(acct) && file_start_write_trylock(file)) { - /* it's been opened O_APPEND, so position is irrelevant */ - loff_t pos = 0; - __kernel_write(file, ac, sizeof(acct_t), &pos); - file_end_write(file); + scoped_with_creds(file->f_cred) { + /* + * First check to see if there is enough free_space to continue + * the process accounting system. Then get freeze protection. If + * the fs is frozen, just skip the write as we could deadlock + * the system otherwise. + */ + if (check_free_space(acct) && file_start_write_trylock(file)) { + /* it's been opened O_APPEND, so position is irrelevant */ + loff_t pos = 0; + __kernel_write(file, ac, sizeof(acct_t), &pos); + file_end_write(file); + } } - - revert_creds(cred); } static void do_acct_process(struct bsd_acct_struct *acct) diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 6ac35430c573..eec60b57bd3d 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -634,37 +634,24 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { struct bpf_iter_link *iter_link; - struct file *file; unsigned int flags; - int err, fd; + int err; if (link->ops != &bpf_iter_link_lops) return -EINVAL; flags = O_RDONLY | O_CLOEXEC; - fd = get_unused_fd_flags(flags); - if (fd < 0) - return fd; - - file = anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags); - if (IS_ERR(file)) { - err = PTR_ERR(file); - goto free_fd; - } + + FD_PREPARE(fdf, flags, anon_inode_getfile("bpf_iter", &bpf_iter_fops, NULL, flags)); + if (fdf.err) + return fdf.err; iter_link = container_of(link, struct bpf_iter_link, link); - err = prepare_seq_file(file, iter_link); + err = prepare_seq_file(fd_prepare_file(fdf), iter_link); if (err) - goto free_file; + return err; /* Automatic cleanup handles fput */ - fd_install(fd, file); - return fd; - -free_file: - fput(file); -free_fd: - put_unused_fd(fd); - return err; + return fd_publish(fdf); } struct bpf_prog *bpf_iter_get_info(struct bpf_iter_meta *meta, bool in_stop) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index eb25e70e0bdc..e4007fea4909 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -4169,7 +4169,8 @@ release_prog: } /** - * bpf_task_work_schedule_signal - Schedule BPF callback using task_work_add with TWA_SIGNAL mode + * bpf_task_work_schedule_signal_impl - Schedule BPF callback using task_work_add with TWA_SIGNAL + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4178,15 +4179,17 @@ release_prog: * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_signal_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_SIGNAL); } /** - * bpf_task_work_schedule_resume - Schedule BPF callback using task_work_add with TWA_RESUME mode + * bpf_task_work_schedule_resume_impl - Schedule BPF callback using task_work_add with TWA_RESUME + * mode * @task: Task struct for which callback should be scheduled * @tw: Pointer to struct bpf_task_work in BPF map value for internal bookkeeping * @map__map: bpf_map that embeds struct bpf_task_work in the values @@ -4195,9 +4198,10 @@ __bpf_kfunc int bpf_task_work_schedule_signal(struct task_struct *task, struct b * * Return: 0 if task work has been scheduled successfully, negative error code otherwise */ -__bpf_kfunc int bpf_task_work_schedule_resume(struct task_struct *task, struct bpf_task_work *tw, - void *map__map, bpf_task_work_callback_t callback, - void *aux__prog) +__bpf_kfunc int bpf_task_work_schedule_resume_impl(struct task_struct *task, + struct bpf_task_work *tw, void *map__map, + bpf_task_work_callback_t callback, + void *aux__prog) { return bpf_task_work_schedule(task, tw, map__map, callback, aux__prog, TWA_RESUME); } @@ -4376,9 +4380,9 @@ BTF_ID_FLAGS(func, bpf_strnstr); #if defined(CONFIG_BPF_LSM) && defined(CONFIG_CGROUPS) BTF_ID_FLAGS(func, bpf_cgroup_read_xattr, KF_RCU) #endif -BTF_ID_FLAGS(func, bpf_stream_vprintk, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_signal, KF_TRUSTED_ARGS) -BTF_ID_FLAGS(func, bpf_task_work_schedule_resume, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_stream_vprintk_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_signal_impl, KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_task_work_schedule_resume_impl, KF_TRUSTED_ARGS) BTF_KFUNCS_END(common_btf_ids) static const struct btf_kfunc_id_set common_kfunc_set = { diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4d53cdd1374c..8f1dacaf01fe 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -315,7 +315,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, max_depth = sysctl_perf_event_max_stack; trace = get_perf_callchain(regs, kernel, user, max_depth, - false, false); + false, false, 0); if (unlikely(!trace)) /* couldn't fetch the stack trace */ @@ -452,7 +452,7 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, trace = get_callchain_entry_for_task(task, max_depth); else trace = get_perf_callchain(regs, kernel, user, max_depth, - crosstask, false); + crosstask, false, 0); if (unlikely(!trace) || trace->nr < skip) { if (may_fault) diff --git a/kernel/bpf/stream.c b/kernel/bpf/stream.c index eb6c5a21c2ef..ff16c631951b 100644 --- a/kernel/bpf/stream.c +++ b/kernel/bpf/stream.c @@ -355,7 +355,8 @@ __bpf_kfunc_start_defs(); * Avoid using enum bpf_stream_id so that kfunc users don't have to pull in the * enum in headers. */ -__bpf_kfunc int bpf_stream_vprintk(int stream_id, const char *fmt__str, const void *args, u32 len__sz, void *aux__prog) +__bpf_kfunc int bpf_stream_vprintk_impl(int stream_id, const char *fmt__str, const void *args, + u32 len__sz, void *aux__prog) { struct bpf_bprintf_data data = { .get_bin_args = true, diff --git a/kernel/bpf/token.c b/kernel/bpf/token.c index 0bbe412f854e..feecd8f4dbf9 100644 --- a/kernel/bpf/token.c +++ b/kernel/bpf/token.c @@ -110,16 +110,15 @@ const struct file_operations bpf_token_fops = { int bpf_token_create(union bpf_attr *attr) { + struct bpf_token *token __free(kfree) = NULL; struct bpf_mount_opts *mnt_opts; - struct bpf_token *token = NULL; struct user_namespace *userns; struct inode *inode; - struct file *file; CLASS(fd, f)(attr->token_create.bpffs_fd); struct path path; struct super_block *sb; umode_t mode; - int err, fd; + int err; if (fd_empty(f)) return -EBADF; @@ -166,23 +165,20 @@ int bpf_token_create(union bpf_attr *attr) inode->i_fop = &bpf_token_fops; clear_nlink(inode); /* make sure it is unlinked */ - file = alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, O_RDWR, &bpf_token_fops); - if (IS_ERR(file)) { - iput(inode); - return PTR_ERR(file); - } + FD_PREPARE(fdf, O_CLOEXEC, + alloc_file_pseudo(inode, path.mnt, BPF_TOKEN_INODE_NAME, + O_RDWR, &bpf_token_fops)); + if (fdf.err) + return fdf.err; token = kzalloc(sizeof(*token), GFP_USER); - if (!token) { - err = -ENOMEM; - goto out_file; - } + if (!token) + return -ENOMEM; atomic64_set(&token->refcnt, 1); - /* remember bpffs owning userns for future ns_capable() checks */ - token->userns = get_user_ns(userns); - + /* remember bpffs owning userns for future ns_capable() checks. */ + token->userns = userns; token->allowed_cmds = mnt_opts->delegate_cmds; token->allowed_maps = mnt_opts->delegate_maps; token->allowed_progs = mnt_opts->delegate_progs; @@ -190,24 +186,11 @@ int bpf_token_create(union bpf_attr *attr) err = security_bpf_token_create(token, attr, &path); if (err) - goto out_token; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_token; - } - - file->private_data = token; - fd_install(fd, file); - - return fd; + return err; -out_token: - bpf_token_free(token); -out_file: - fput(file); - return err; + get_user_ns(token->userns); + fd_prepare_file(fdf)->private_data = no_free_ptr(token); + return fd_publish(fdf); } int bpf_token_get_info_by_fd(struct bpf_token *token, diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5949095e51c3..f2cb0b097093 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -479,11 +479,6 @@ again: * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the * trampoline again, and retry register. */ - /* reset fops->func and fops->trampoline for re-register */ - tr->fops->func = NULL; - tr->fops->trampoline = 0; - - /* free im memory and reallocate later */ bpf_tramp_image_free(im); goto again; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ff40e5e65c43..fbe4bb91c564 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8866,7 +8866,7 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, struct bpf_verifier_state *cur) { struct bpf_func_state *fold, *fcur; - int i, fr; + int i, fr, num_slots; reset_idmap_scratch(env); for (fr = old->curframe; fr >= 0; fr--) { @@ -8879,7 +8879,9 @@ static int widen_imprecise_scalars(struct bpf_verifier_env *env, &fcur->regs[i], &env->idmap_scratch); - for (i = 0; i < fold->allocated_stack / BPF_REG_SIZE; i++) { + num_slots = min(fold->allocated_stack / BPF_REG_SIZE, + fcur->allocated_stack / BPF_REG_SIZE); + for (i = 0; i < num_slots; i++) { if (!is_spilled_reg(&fold->stack[i]) || !is_spilled_reg(&fcur->stack[i])) continue; @@ -12259,8 +12261,8 @@ enum special_kfunc_type { KF_bpf_res_spin_lock_irqsave, KF_bpf_res_spin_unlock_irqrestore, KF___bpf_trap, - KF_bpf_task_work_schedule_signal, - KF_bpf_task_work_schedule_resume, + KF_bpf_task_work_schedule_signal_impl, + KF_bpf_task_work_schedule_resume_impl, }; BTF_ID_LIST(special_kfunc_list) @@ -12331,13 +12333,13 @@ BTF_ID(func, bpf_res_spin_unlock) BTF_ID(func, bpf_res_spin_lock_irqsave) BTF_ID(func, bpf_res_spin_unlock_irqrestore) BTF_ID(func, __bpf_trap) -BTF_ID(func, bpf_task_work_schedule_signal) -BTF_ID(func, bpf_task_work_schedule_resume) +BTF_ID(func, bpf_task_work_schedule_signal_impl) +BTF_ID(func, bpf_task_work_schedule_resume_impl) static bool is_task_work_add_kfunc(u32 func_id) { - return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal] || - func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume]; + return func_id == special_kfunc_list[KF_bpf_task_work_schedule_signal_impl] || + func_id == special_kfunc_list[KF_bpf_task_work_schedule_resume_impl]; } static bool is_kfunc_ret_null(struct bpf_kfunc_call_arg_meta *meta) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fdee387f0d6b..ae1eb7a85eb4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -250,12 +250,9 @@ bool cgroup_enable_per_threadgroup_rwsem __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_cgroup_ns), .user_ns = &init_user_ns, - .ns.ops = &cgroupns_operations, - .ns.inum = ns_init_inum(&init_cgroup_ns), .root_cset = &init_css_set, - .ns.ns_type = ns_common_type(&init_cgroup_ns), }; static struct file_system_type cgroup2_fs_type; @@ -1522,9 +1519,9 @@ static struct cgroup *current_cgns_cgroup_dfl(void) } else { /* * NOTE: This function may be called from bpf_cgroup_from_id() - * on a task which has already passed exit_task_namespaces() and - * nsproxy == NULL. Fall back to cgrp_dfl_root which will make all - * cgroups visible for lookups. + * on a task which has already passed exit_nsproxy_namespaces() + * and nsproxy == NULL. Fall back to cgrp_dfl_root which will + * make all cgroups visible for lookups. */ return &cgrp_dfl_root.cgrp; } @@ -5363,7 +5360,6 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; - const struct cred *saved_cred; ssize_t ret; enum cgroup_attach_lock_mode lock_mode; @@ -5386,11 +5382,10 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, * permissions using the credentials from file open to protect against * inherited fd attacks. */ - saved_cred = override_creds(of->file->f_cred); - ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, - threadgroup, ctx->ns); - revert_creds(saved_cred); + scoped_with_creds(of->file->f_cred) + ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); if (ret) goto out_finish; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 52468d2c178a..185e820cd1df 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4180,7 +4180,7 @@ bool cpuset_cpus_allowed_fallback(struct task_struct *tsk) rcu_read_lock(); cs_mask = task_cs(tsk)->cpus_allowed; if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) { - do_set_cpus_allowed(tsk, cs_mask); + set_cpus_allowed_force(tsk, cs_mask); changed = true; } rcu_read_unlock(); diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index fdbe57578e68..db9617556dd7 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -30,7 +30,6 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) ret = ns_common_init(new_ns); if (ret) return ERR_PTR(ret); - ns_tree_add(new_ns); return no_free_ptr(new_ns); } @@ -86,6 +85,7 @@ struct cgroup_namespace *copy_cgroup_ns(u64 flags, new_ns->ucounts = ucounts; new_ns->root_cset = cset; + ns_tree_add(new_ns); return new_ns; } diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 3b1c43382eec..99dac1aa972a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -373,7 +373,7 @@ static int __crash_shrink_memory(struct resource *old_res, old_res->start = 0; old_res->end = 0; } else { - crashk_res.end = ram_res->start - 1; + old_res->end = ram_res->start - 1; } crash_free_reserved_phys_range(ram_res->start, ram_res->end); diff --git a/kernel/cred.c b/kernel/cred.c index dbf6b687dc5c..a6f686b30da1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -35,33 +35,6 @@ do { \ static struct kmem_cache *cred_jar; -/* init to 2 - one for init_task, one to ensure it is never freed */ -static struct group_info init_groups = { .usage = REFCOUNT_INIT(2) }; - -/* - * The initial credentials for the initial task - */ -struct cred init_cred = { - .usage = ATOMIC_INIT(4), - .uid = GLOBAL_ROOT_UID, - .gid = GLOBAL_ROOT_GID, - .suid = GLOBAL_ROOT_UID, - .sgid = GLOBAL_ROOT_GID, - .euid = GLOBAL_ROOT_UID, - .egid = GLOBAL_ROOT_GID, - .fsuid = GLOBAL_ROOT_UID, - .fsgid = GLOBAL_ROOT_GID, - .securebits = SECUREBITS_DEFAULT, - .cap_inheritable = CAP_EMPTY_SET, - .cap_permitted = CAP_FULL_SET, - .cap_effective = CAP_FULL_SET, - .cap_bset = CAP_FULL_SET, - .user = INIT_USER, - .user_ns = &init_user_ns, - .group_info = &init_groups, - .ucounts = &init_ucounts, -}; - /* * The RCU callback to actually dispose of a set of credentials */ @@ -306,6 +279,7 @@ int copy_creds(struct task_struct *p, u64 clone_flags) kdebug("share_creds(%p{%ld})", p->cred, atomic_long_read(&p->cred->usage)); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); return 0; } @@ -343,6 +317,8 @@ int copy_creds(struct task_struct *p, u64 clone_flags) p->cred = p->real_cred = get_cred(new); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + get_cred_namespaces(p); + return 0; error_put: @@ -435,10 +411,13 @@ int commit_creds(struct cred *new) */ if (new->user != old->user || new->user_ns != old->user_ns) inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1); + rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->cred, new); if (new->user != old->user || new->user_ns != old->user_ns) dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1); + if (new->user_ns != old->user_ns) + switch_cred_namespaces(old, new); /* send notifications */ if (!uid_eq(new->uid, old->uid) || diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 1f9ee9759426..f973e7e73c90 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -481,6 +481,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents, case PCI_P2PDMA_MAP_BUS_ADDR: sg->dma_address = pci_p2pdma_bus_addr_map(&p2pdma_state, sg_phys(sg)); + sg_dma_len(sg) = sg->length; sg_dma_mark_bus_address(sg); continue; default: diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 808c0d7a31fa..b9c7e00725d6 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -218,7 +218,7 @@ static void fixup_uretprobe_trampoline_entries(struct perf_callchain_entry *entr struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, - u32 max_stack, bool crosstask, bool add_mark) + u32 max_stack, bool crosstask, bool add_mark, u64 defer_cookie) { struct perf_callchain_entry *entry; struct perf_callchain_entry_ctx ctx; @@ -251,6 +251,18 @@ get_perf_callchain(struct pt_regs *regs, bool kernel, bool user, regs = task_pt_regs(current); } + if (defer_cookie) { + /* + * Foretell the coming of PERF_RECORD_CALLCHAIN_DEFERRED + * which can be stitched to this one, and add + * the cookie after it (it will be cut off when the + * user stack is copied to the callchain). + */ + perf_callchain_store_context(&ctx, PERF_CONTEXT_USER_DEFERRED); + perf_callchain_store_context(&ctx, defer_cookie); + goto exit_put; + } + if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); diff --git a/kernel/events/core.c b/kernel/events/core.c index 177e57c1a362..ece716879cbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -56,6 +56,7 @@ #include <linux/buildid.h> #include <linux/task_work.h> #include <linux/percpu-rwsem.h> +#include <linux/unwind_deferred.h> #include "internal.h" @@ -8200,6 +8201,8 @@ static u64 perf_get_page_size(unsigned long addr) static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; +static struct unwind_work perf_unwind_work; + struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { @@ -8208,8 +8211,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) !(current->flags & (PF_KTHREAD | PF_USER_WORKER)); /* Disallow cross-task user callchains. */ bool crosstask = event->ctx->task && event->ctx->task != current; + bool defer_user = IS_ENABLED(CONFIG_UNWIND_USER) && user && + event->attr.defer_callchain; const u32 max_stack = event->attr.sample_max_stack; struct perf_callchain_entry *callchain; + u64 defer_cookie; if (!current->mm) user = false; @@ -8217,8 +8223,13 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!kernel && !user) return &__empty_callchain; - callchain = get_perf_callchain(regs, kernel, user, - max_stack, crosstask, true); + if (!(user && defer_user && !crosstask && + unwind_deferred_request(&perf_unwind_work, &defer_cookie) >= 0)) + defer_cookie = 0; + + callchain = get_perf_callchain(regs, kernel, user, max_stack, + crosstask, true, defer_cookie); + return callchain ?: &__empty_callchain; } @@ -10003,6 +10014,66 @@ void perf_event_bpf_event(struct bpf_prog *prog, perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); } +struct perf_callchain_deferred_event { + struct unwind_stacktrace *trace; + struct { + struct perf_event_header header; + u64 cookie; + u64 nr; + u64 ips[]; + } event; +}; + +static void perf_callchain_deferred_output(struct perf_event *event, void *data) +{ + struct perf_callchain_deferred_event *deferred_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret, size = deferred_event->event.header.size; + + if (!event->attr.defer_output) + return; + + /* XXX do we really need sample_id_all for this ??? */ + perf_event_header__init_id(&deferred_event->event.header, &sample, event); + + ret = perf_output_begin(&handle, &sample, event, + deferred_event->event.header.size); + if (ret) + goto out; + + perf_output_put(&handle, deferred_event->event); + for (int i = 0; i < deferred_event->trace->nr; i++) { + u64 entry = deferred_event->trace->entries[i]; + perf_output_put(&handle, entry); + } + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + deferred_event->event.header.size = size; +} + +static void perf_unwind_deferred_callback(struct unwind_work *work, + struct unwind_stacktrace *trace, u64 cookie) +{ + struct perf_callchain_deferred_event deferred_event = { + .trace = trace, + .event = { + .header = { + .type = PERF_RECORD_CALLCHAIN_DEFERRED, + .misc = PERF_RECORD_MISC_USER, + .size = sizeof(deferred_event.event) + + (trace->nr * sizeof(u64)), + }, + .cookie = cookie, + .nr = trace->nr, + }, + }; + + perf_iterate_sb(perf_callchain_deferred_output, &deferred_event, NULL); +} + struct perf_text_poke_event { const void *old_bytes; const void *new_bytes; @@ -11773,7 +11844,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event = container_of(hrtimer, struct perf_event, hw.hrtimer); - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state != PERF_EVENT_STATE_ACTIVE || + event->hw.state & PERF_HES_STOPPED) return HRTIMER_NORESTART; event->pmu->read(event); @@ -11819,15 +11891,20 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; /* - * The throttle can be triggered in the hrtimer handler. - * The HRTIMER_NORESTART should be used to stop the timer, - * rather than hrtimer_cancel(). See perf_swevent_hrtimer() + * Careful: this function can be triggered in the hrtimer handler, + * for cpu-clock events, so hrtimer_cancel() would cause a + * deadlock. + * + * So use hrtimer_try_to_cancel() to try to stop the hrtimer, + * and the cpu-clock handler also sets the PERF_HES_STOPPED flag, + * which guarantees that perf_swevent_hrtimer() will stop the + * hrtimer once it sees the PERF_HES_STOPPED flag. */ if (is_sampling_event(event) && (hwc->interrupts != MAX_INTERRUPTS)) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); local64_set(&hwc->period_left, ktime_to_ns(remaining)); - hrtimer_cancel(&hwc->hrtimer); + hrtimer_try_to_cancel(&hwc->hrtimer); } } @@ -11871,12 +11948,14 @@ static void cpu_clock_event_update(struct perf_event *event) static void cpu_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); } static void cpu_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) cpu_clock_event_update(event); @@ -11893,7 +11972,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) static void cpu_clock_event_del(struct perf_event *event, int flags) { - cpu_clock_event_stop(event, flags); + cpu_clock_event_stop(event, PERF_EF_UPDATE); } static void cpu_clock_event_read(struct perf_event *event) @@ -11950,12 +12029,14 @@ static void task_clock_event_update(struct perf_event *event, u64 now) static void task_clock_event_start(struct perf_event *event, int flags) { + event->hw.state = 0; local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); } static void task_clock_event_stop(struct perf_event *event, int flags) { + event->hw.state = PERF_HES_STOPPED; perf_swevent_cancel_hrtimer(event); if (flags & PERF_EF_UPDATE) task_clock_event_update(event, event->ctx->time); @@ -14799,6 +14880,9 @@ void __init perf_event_init(void) idr_init(&pmu_idr); + unwind_deferred_init(&perf_unwind_work, + perf_unwind_deferred_callback); + perf_event_init_all_cpus(); init_srcu_struct(&pmus_srcu); perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); diff --git a/kernel/exit.c b/kernel/exit.c index 9f74e8f1c431..fdfd05d1826c 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -291,6 +291,7 @@ repeat: write_unlock_irq(&tasklist_lock); /* @thread_pid can't go away until free_pids() below */ proc_flush_pid(thread_pid); + exit_cred_namespaces(p); add_device_randomness(&p->se.sum_exec_runtime, sizeof(p->se.sum_exec_runtime)); free_pids(post.pids); @@ -939,7 +940,6 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead); - unwind_deferred_task_exit(tsk); trace_sched_process_exit(tsk, group_dead); /* @@ -950,6 +950,12 @@ void __noreturn do_exit(long code) * gets woken up by child-exit notifications. */ perf_event_exit_task(tsk); + /* + * PF_EXITING (above) ensures unwind_deferred_request() will no + * longer add new unwinds. While exit_mm() (below) will destroy the + * abaility to do unwinds. So flush any pending unwinds here. + */ + unwind_deferred_task_exit(tsk); exit_mm(); @@ -962,7 +968,7 @@ void __noreturn do_exit(long code) exit_fs(tsk); if (group_dead) disassociate_ctty(1); - exit_task_namespaces(tsk); + exit_nsproxy_namespaces(tsk); exit_task_work(tsk); exit_thread(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 3da0f08615a9..f1857672426e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2453,7 +2453,7 @@ bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: - exit_task_namespaces(p); + exit_nsproxy_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { mm_clear_owner(p->mm, p); @@ -2487,6 +2487,7 @@ bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); + exit_cred_namespaces(p); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); diff --git a/kernel/futex/core.c b/kernel/futex/core.c index ebcccb16ae0b..cf7e610eac42 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -1680,10 +1680,10 @@ static bool futex_ref_get(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; - guard(rcu)(); + guard(preempt)(); - if (smp_load_acquire(&fph->state) == FR_PERCPU) { - this_cpu_inc(*mm->futex_ref); + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_inc(*mm->futex_ref); return true; } @@ -1694,10 +1694,10 @@ static bool futex_ref_put(struct futex_private_hash *fph) { struct mm_struct *mm = fph->mm; - guard(rcu)(); + guard(preempt)(); - if (smp_load_acquire(&fph->state) == FR_PERCPU) { - this_cpu_dec(*mm->futex_ref); + if (READ_ONCE(fph->state) == FR_PERCPU) { + __this_cpu_dec(*mm->futex_ref); return false; } diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index a08cc076f332..ffde93d051a4 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include <linux/mm.h> #include "gcov.h" -#if (__GNUC__ >= 14) +#if (__GNUC__ >= 15) +#define GCOV_COUNTERS 10 +#elif (__GNUC__ >= 14) #define GCOV_COUNTERS 9 #elif (__GNUC__ >= 10) #define GCOV_COUNTERS 8 diff --git a/kernel/kexec_handover.c b/kernel/kexec_handover.c index 76f0940fb485..03d12e27189f 100644 --- a/kernel/kexec_handover.c +++ b/kernel/kexec_handover.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "KHO: " fmt +#include <linux/cleanup.h> #include <linux/cma.h> #include <linux/count_zeros.h> #include <linux/debugfs.h> @@ -22,6 +23,7 @@ #include <asm/early_ioremap.h> +#include "kexec_handover_internal.h" /* * KHO is tightly coupled with mm init and needs access to some of mm * internal APIs. @@ -67,10 +69,10 @@ early_param("kho", kho_parse_enable); * Keep track of memory that is to be preserved across KHO. * * The serializing side uses two levels of xarrays to manage chunks of per-order - * 512 byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order of a - * 1TB system would fit inside a single 512 byte bitmap. For order 0 allocations - * each bitmap will cover 16M of address space. Thus, for 16G of memory at most - * 512K of bitmap memory will be needed for order 0. + * PAGE_SIZE byte bitmaps. For instance if PAGE_SIZE = 4096, the entire 1G order + * of a 8TB system would fit inside a single 4096 byte bitmap. For order 0 + * allocations each bitmap will cover 128M of address space. Thus, for 16G of + * memory at most 512K of bitmap memory will be needed for order 0. * * This approach is fully incremental, as the serialization progresses folios * can continue be aggregated to the tracker. The final step, immediately prior @@ -78,12 +80,14 @@ early_param("kho", kho_parse_enable); * successor kernel to parse. */ -#define PRESERVE_BITS (512 * 8) +#define PRESERVE_BITS (PAGE_SIZE * 8) struct kho_mem_phys_bits { DECLARE_BITMAP(preserve, PRESERVE_BITS); }; +static_assert(sizeof(struct kho_mem_phys_bits) == PAGE_SIZE); + struct kho_mem_phys { /* * Points to kho_mem_phys_bits, a sparse bitmap array. Each bit is sized @@ -131,28 +135,28 @@ static struct kho_out kho_out = { .finalized = false, }; -static void *xa_load_or_alloc(struct xarray *xa, unsigned long index, size_t sz) +static void *xa_load_or_alloc(struct xarray *xa, unsigned long index) { - void *elm, *res; + void *res = xa_load(xa, index); + + if (res) + return res; - elm = xa_load(xa, index); - if (elm) - return elm; + void *elm __free(free_page) = (void *)get_zeroed_page(GFP_KERNEL); - elm = kzalloc(sz, GFP_KERNEL); if (!elm) return ERR_PTR(-ENOMEM); + if (WARN_ON(kho_scratch_overlap(virt_to_phys(elm), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + res = xa_cmpxchg(xa, index, NULL, elm, GFP_KERNEL); if (xa_is_err(res)) - res = ERR_PTR(xa_err(res)); - - if (res) { - kfree(elm); + return ERR_PTR(xa_err(res)); + else if (res) return res; - } - return elm; + return no_free_ptr(elm); } static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, @@ -167,12 +171,12 @@ static void __kho_unpreserve(struct kho_mem_track *track, unsigned long pfn, const unsigned long pfn_high = pfn >> order; physxa = xa_load(&track->orders, order); - if (!physxa) - continue; + if (WARN_ON_ONCE(!physxa)) + return; bits = xa_load(&physxa->phys_bits, pfn_high / PRESERVE_BITS); - if (!bits) - continue; + if (WARN_ON_ONCE(!bits)) + return; clear_bit(pfn_high % PRESERVE_BITS, bits->preserve); @@ -216,8 +220,7 @@ static int __kho_preserve_order(struct kho_mem_track *track, unsigned long pfn, } } - bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS, - sizeof(*bits)); + bits = xa_load_or_alloc(&physxa->phys_bits, pfn_high / PRESERVE_BITS); if (IS_ERR(bits)) return PTR_ERR(bits); @@ -345,15 +348,19 @@ static_assert(sizeof(struct khoser_mem_chunk) == PAGE_SIZE); static struct khoser_mem_chunk *new_chunk(struct khoser_mem_chunk *cur_chunk, unsigned long order) { - struct khoser_mem_chunk *chunk; + struct khoser_mem_chunk *chunk __free(free_page) = NULL; - chunk = kzalloc(PAGE_SIZE, GFP_KERNEL); + chunk = (void *)get_zeroed_page(GFP_KERNEL); if (!chunk) - return NULL; + return ERR_PTR(-ENOMEM); + + if (WARN_ON(kho_scratch_overlap(virt_to_phys(chunk), PAGE_SIZE))) + return ERR_PTR(-EINVAL); + chunk->hdr.order = order; if (cur_chunk) KHOSER_STORE_PTR(cur_chunk->hdr.next, chunk); - return chunk; + return no_free_ptr(chunk); } static void kho_mem_ser_free(struct khoser_mem_chunk *first_chunk) @@ -374,14 +381,17 @@ static int kho_mem_serialize(struct kho_serialization *ser) struct khoser_mem_chunk *chunk = NULL; struct kho_mem_phys *physxa; unsigned long order; + int err = -ENOMEM; xa_for_each(&ser->track.orders, order, physxa) { struct kho_mem_phys_bits *bits; unsigned long phys; chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } if (!first_chunk) first_chunk = chunk; @@ -391,8 +401,10 @@ static int kho_mem_serialize(struct kho_serialization *ser) if (chunk->hdr.num_elms == ARRAY_SIZE(chunk->bitmaps)) { chunk = new_chunk(chunk, order); - if (!chunk) + if (IS_ERR(chunk)) { + err = PTR_ERR(chunk); goto err_free; + } } elm = &chunk->bitmaps[chunk->hdr.num_elms]; @@ -409,7 +421,7 @@ static int kho_mem_serialize(struct kho_serialization *ser) err_free: kho_mem_ser_free(first_chunk); - return -ENOMEM; + return err; } static void __init deserialize_bitmap(unsigned int order, @@ -465,8 +477,8 @@ static void __init kho_mem_deserialize(const void *fdt) * area for early allocations that happen before page allocator is * initialized. */ -static struct kho_scratch *kho_scratch; -static unsigned int kho_scratch_cnt; +struct kho_scratch *kho_scratch; +unsigned int kho_scratch_cnt; /* * The scratch areas are scaled by default as percent of memory allocated from @@ -752,6 +764,9 @@ int kho_preserve_folio(struct folio *folio) const unsigned int order = folio_order(folio); struct kho_mem_track *track = &kho_out.ser.track; + if (WARN_ON(kho_scratch_overlap(pfn << PAGE_SHIFT, PAGE_SIZE << order))) + return -EINVAL; + return __kho_preserve_order(track, pfn, order); } EXPORT_SYMBOL_GPL(kho_preserve_folio); @@ -775,6 +790,11 @@ int kho_preserve_pages(struct page *page, unsigned int nr_pages) unsigned long failed_pfn = 0; int err = 0; + if (WARN_ON(kho_scratch_overlap(start_pfn << PAGE_SHIFT, + nr_pages << PAGE_SHIFT))) { + return -EINVAL; + } + while (pfn < end_pfn) { const unsigned int order = min(count_trailing_zeros(pfn), ilog2(end_pfn - pfn)); @@ -862,16 +882,17 @@ err_free: return NULL; } -static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk) +static void kho_vmalloc_unpreserve_chunk(struct kho_vmalloc_chunk *chunk, + unsigned short order) { struct kho_mem_track *track = &kho_out.ser.track; unsigned long pfn = PHYS_PFN(virt_to_phys(chunk)); __kho_unpreserve(track, pfn, pfn + 1); - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { pfn = PHYS_PFN(chunk->phys[i]); - __kho_unpreserve(track, pfn, pfn + 1); + __kho_unpreserve(track, pfn, pfn + (1 << order)); } } @@ -882,7 +903,7 @@ static void kho_vmalloc_free_chunks(struct kho_vmalloc *kho_vmalloc) while (chunk) { struct kho_vmalloc_chunk *tmp = chunk; - kho_vmalloc_unpreserve_chunk(chunk); + kho_vmalloc_unpreserve_chunk(chunk, kho_vmalloc->order); chunk = KHOSER_LOAD_PTR(chunk->hdr.next); free_page((unsigned long)tmp); @@ -992,7 +1013,7 @@ void *kho_restore_vmalloc(const struct kho_vmalloc *preservation) while (chunk) { struct page *page; - for (int i = 0; chunk->phys[i]; i++) { + for (int i = 0; i < ARRAY_SIZE(chunk->phys) && chunk->phys[i]; i++) { phys_addr_t phys = chunk->phys[i]; if (idx + contig_pages > total_pages) diff --git a/kernel/kexec_handover_debug.c b/kernel/kexec_handover_debug.c new file mode 100644 index 000000000000..6efb696f5426 --- /dev/null +++ b/kernel/kexec_handover_debug.c @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kexec_handover_debug.c - kexec handover optional debug functionality + * Copyright (C) 2025 Google LLC, Pasha Tatashin <pasha.tatashin@soleen.com> + */ + +#define pr_fmt(fmt) "KHO: " fmt + +#include "kexec_handover_internal.h" + +bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + phys_addr_t scratch_start, scratch_end; + unsigned int i; + + for (i = 0; i < kho_scratch_cnt; i++) { + scratch_start = kho_scratch[i].addr; + scratch_end = kho_scratch[i].addr + kho_scratch[i].size; + + if (phys < scratch_end && (phys + size) > scratch_start) + return true; + } + + return false; +} diff --git a/kernel/kexec_handover_internal.h b/kernel/kexec_handover_internal.h new file mode 100644 index 000000000000..3c3c7148ceed --- /dev/null +++ b/kernel/kexec_handover_internal.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_KEXEC_HANDOVER_INTERNAL_H +#define LINUX_KEXEC_HANDOVER_INTERNAL_H + +#include <linux/kexec_handover.h> +#include <linux/types.h> + +extern struct kho_scratch *kho_scratch; +extern unsigned int kho_scratch_cnt; + +#ifdef CONFIG_KEXEC_HANDOVER_DEBUG +bool kho_scratch_overlap(phys_addr_t phys, size_t size); +#else +static inline bool kho_scratch_overlap(phys_addr_t phys, size_t size) +{ + return false; +} +#endif /* CONFIG_KEXEC_HANDOVER_DEBUG */ + +#endif /* LINUX_KEXEC_HANDOVER_INTERNAL_H */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 31b072e8d427..99a3808d086f 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -593,18 +593,16 @@ EXPORT_SYMBOL(kthread_create_on_node); static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, unsigned int state) { - unsigned long flags; - if (!wait_task_inactive(p, state)) { WARN_ON(1); return; } + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, mask); + /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, mask); p->flags |= PF_NO_SETAFFINITY; - raw_spin_unlock_irqrestore(&p->pi_lock, flags); } static void __kthread_bind(struct task_struct *p, unsigned int cpu, unsigned int state) @@ -857,7 +855,6 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) { struct kthread *kthread = to_kthread(p); cpumask_var_t affinity; - unsigned long flags; int ret = 0; if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE) || kthread->started) { @@ -882,10 +879,8 @@ int kthread_affine_preferred(struct task_struct *p, const struct cpumask *mask) list_add_tail(&kthread->hotplug_node, &kthreads_hotplug); kthread_fetch_affinity(kthread, affinity); - /* It's safe because the task is inactive. */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - do_set_cpus_allowed(p, affinity); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + scoped_guard (raw_spinlock_irqsave, &p->pi_lock) + set_cpus_allowed_force(p, affinity); mutex_unlock(&kthreads_hotplug_lock); out: diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 53d51ed619a3..4c0a9c18d0b2 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -18,3 +18,15 @@ config LIVEPATCH module uses the interface provided by this option to register a patch, causing calls to patched functions to be redirected to new function code contained in the patch module. + +config HAVE_KLP_BUILD + bool + help + Arch supports klp-build + +config KLP_BUILD + def_bool y + depends on LIVEPATCH && HAVE_KLP_BUILD + select OBJTOOL + help + Enable klp-build support diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 0e73fac55f8e..0044a8125013 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -217,14 +217,14 @@ static int klp_resolve_symbols(Elf_Shdr *sechdrs, const char *strtab, for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { sym = (Elf_Sym *)sechdrs[symndx].sh_addr + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { - pr_err("symbol %s is not marked as a livepatch symbol\n", - strtab + sym->st_name); + pr_err("symbol %s at rela sec %u idx %d is not marked as a livepatch symbol\n", + strtab + sym->st_name, symndx, i); return -EINVAL; } /* Format: .klp.sym.sym_objname.sym_name,sympos */ cnt = sscanf(strtab + sym->st_name, - ".klp.sym.%55[^.].%511[^,],%lu", + KLP_SYM_PREFIX "%55[^.].%511[^,],%lu", sym_objname, sym_name, &sympos); if (cnt != 3) { pr_err("symbol %s has an incorrectly formatted name\n", @@ -303,7 +303,7 @@ static int klp_write_section_relocs(struct module *pmod, Elf_Shdr *sechdrs, * See comment in klp_resolve_symbols() for an explanation * of the selected field width value. */ - cnt = sscanf(shstrtab + sec->sh_name, ".klp.rela.%55[^.]", + cnt = sscanf(shstrtab + sec->sh_name, KLP_RELOC_SEC_PREFIX "%55[^.]", sec_objname); if (cnt != 1) { pr_err("section %s has an incorrectly formatted name\n", diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 949103fd8e9b..2c6b02d4699b 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -78,16 +78,8 @@ void debug_mutex_unlock(struct mutex *lock) } } -void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key) +void debug_mutex_init(struct mutex *lock) { -#ifdef CONFIG_DEBUG_LOCK_ALLOC - /* - * Make sure we are not reinitializing a held lock: - */ - debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); -#endif lock->magic = lock; } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index de7d6702cd96..2a1d165b3167 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -43,8 +43,7 @@ # define MUTEX_WARN_ON(cond) #endif -void -__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +static void __mutex_init_generic(struct mutex *lock) { atomic_long_set(&lock->owner, 0); raw_spin_lock_init(&lock->wait_lock); @@ -52,10 +51,8 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif - - debug_mutex_init(lock, name, key); + debug_mutex_init(lock); } -EXPORT_SYMBOL(__mutex_init); static inline struct task_struct *__owner_task(unsigned long owner) { @@ -142,6 +139,11 @@ static inline bool __mutex_trylock(struct mutex *lock) * There is nothing that would stop spreading the lockdep annotations outwards * except more code. */ +void mutex_init_generic(struct mutex *lock) +{ + __mutex_init_generic(lock); +} +EXPORT_SYMBOL(mutex_init_generic); /* * Optimistic trylock that only works in the uncontended case. Make sure to @@ -166,7 +168,21 @@ static __always_inline bool __mutex_unlock_fast(struct mutex *lock) return atomic_long_try_cmpxchg_release(&lock->owner, &curr, 0UL); } -#endif + +#else /* !CONFIG_DEBUG_LOCK_ALLOC */ + +void mutex_init_lockep(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + __mutex_init_generic(lock); + + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map_wait(&lock->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_init_lockep); +#endif /* !CONFIG_DEBUG_LOCK_ALLOC */ static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) { diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 2e8080a9bee3..9ad4da8cea00 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -59,8 +59,7 @@ extern void debug_mutex_add_waiter(struct mutex *lock, extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); -extern void debug_mutex_init(struct mutex *lock, const char *name, - struct lock_class_key *key); +extern void debug_mutex_init(struct mutex *lock); #else /* CONFIG_DEBUG_MUTEXES */ # define debug_mutex_lock_common(lock, waiter) do { } while (0) # define debug_mutex_wake_waiter(lock, waiter) do { } while (0) @@ -68,6 +67,6 @@ extern void debug_mutex_init(struct mutex *lock, const char *name, # define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) # define debug_mutex_unlock(lock) do { } while (0) -# define debug_mutex_init(lock, name, key) do { } while (0) +# define debug_mutex_init(lock) do { } while (0) #endif /* !CONFIG_DEBUG_MUTEXES */ #endif /* CONFIG_PREEMPT_RT */ diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c index bafd5af98eae..59dbd29cb219 100644 --- a/kernel/locking/rtmutex_api.c +++ b/kernel/locking/rtmutex_api.c @@ -515,13 +515,11 @@ void rt_mutex_debug_task_free(struct task_struct *task) #ifdef CONFIG_PREEMPT_RT /* Mutexes */ -void __mutex_rt_init(struct mutex *mutex, const char *name, - struct lock_class_key *key) +static void __mutex_rt_init_generic(struct mutex *mutex) { + rt_mutex_base_init(&mutex->rtmutex); debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); - lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); } -EXPORT_SYMBOL(__mutex_rt_init); static __always_inline int __mutex_lock_common(struct mutex *lock, unsigned int state, @@ -542,6 +540,13 @@ static __always_inline int __mutex_lock_common(struct mutex *lock, } #ifdef CONFIG_DEBUG_LOCK_ALLOC +void mutex_rt_init_lockdep(struct mutex *mutex, const char *name, struct lock_class_key *key) +{ + __mutex_rt_init_generic(mutex); + lockdep_init_map_wait(&mutex->dep_map, name, key, 0, LD_WAIT_SLEEP); +} +EXPORT_SYMBOL(mutex_rt_init_lockdep); + void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); @@ -598,6 +603,12 @@ int __sched _mutex_trylock_nest_lock(struct mutex *lock, EXPORT_SYMBOL_GPL(_mutex_trylock_nest_lock); #else /* CONFIG_DEBUG_LOCK_ALLOC */ +void mutex_rt_init_generic(struct mutex *mutex) +{ + __mutex_rt_init_generic(mutex); +} +EXPORT_SYMBOL(mutex_rt_init_generic); + void __sched mutex_lock(struct mutex *lock) { __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index 87b03d2e41db..2338b3adfb55 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -184,8 +184,8 @@ void do_raw_read_unlock(rwlock_t *lock) static inline void debug_write_lock_before(rwlock_t *lock) { RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic"); - RWLOCK_BUG_ON(lock->owner == current, lock, "recursion"); - RWLOCK_BUG_ON(lock->owner_cpu == raw_smp_processor_id(), + RWLOCK_BUG_ON(READ_ONCE(lock->owner) == current, lock, "recursion"); + RWLOCK_BUG_ON(READ_ONCE(lock->owner_cpu) == raw_smp_processor_id(), lock, "cpu recursion"); } diff --git a/kernel/nscommon.c b/kernel/nscommon.c index c1fb2bad6d72..bdc3c86231d3 100644 --- a/kernel/nscommon.c +++ b/kernel/nscommon.c @@ -1,7 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/ns_common.h> +#include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/user_namespace.h> #include <linux/vfsdebug.h> #ifdef CONFIG_DEBUG_VFS @@ -52,26 +55,257 @@ static void ns_debug(struct ns_common *ns, const struct proc_ns_operations *ops) int __ns_common_init(struct ns_common *ns, u32 ns_type, const struct proc_ns_operations *ops, int inum) { + int ret = 0; + refcount_set(&ns->__ns_ref, 1); ns->stashed = NULL; ns->ops = ops; ns->ns_id = 0; ns->ns_type = ns_type; - RB_CLEAR_NODE(&ns->ns_tree_node); - INIT_LIST_HEAD(&ns->ns_list_node); + ns_tree_node_init(&ns->ns_tree_node); + ns_tree_node_init(&ns->ns_unified_node); + ns_tree_node_init(&ns->ns_owner_node); + ns_tree_root_init(&ns->ns_owner_root); #ifdef CONFIG_DEBUG_VFS ns_debug(ns, ops); #endif - if (inum) { + if (inum) ns->inum = inum; - return 0; - } - return proc_alloc_inum(&ns->inum); + else + ret = proc_alloc_inum(&ns->inum); + if (ret) + return ret; + /* + * Tree ref starts at 0. It's incremented when namespace enters + * active use (installed in nsproxy) and decremented when all + * active uses are gone. Initial namespaces are always active. + */ + if (is_ns_init_inum(ns)) + atomic_set(&ns->__ns_ref_active, 1); + else + atomic_set(&ns->__ns_ref_active, 0); + return 0; } void __ns_common_free(struct ns_common *ns) { proc_free_inum(ns->inum); } + +struct ns_common *__must_check ns_owner(struct ns_common *ns) +{ + struct user_namespace *owner; + + if (unlikely(!ns->ops)) + return NULL; + VFS_WARN_ON_ONCE(!ns->ops->owner); + owner = ns->ops->owner(ns); + VFS_WARN_ON_ONCE(!owner && ns != to_ns_common(&init_user_ns)); + if (!owner) + return NULL; + /* Skip init_user_ns as it's always active */ + if (owner == &init_user_ns) + return NULL; + return to_ns_common(owner); +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * The iteration stops once we reach a namespace that still has active + * references. + */ +void __ns_ref_active_put(struct ns_common *ns) +{ + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + VFS_WARN_ON_ONCE(!__ns_ref_read(ns)); + + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + if (!atomic_dec_and_test(&ns->__ns_ref_active)) { + VFS_WARN_ON_ONCE(__ns_ref_active_read(ns) < 0); + return; + } + } +} + +/* + * The active reference count works by having each namespace that gets + * created take a single active reference on its owning user namespace. + * That single reference is only released once the child namespace's + * active count itself goes down. This makes it possible to efficiently + * resurrect a namespace tree: + * + * A regular namespace tree might look as follow: + * Legend: + * + : adding active reference + * - : dropping active reference + * x : always active (initial namespace) + * + * + * net_ns pid_ns + * \ / + * + + + * user_ns1 (2) + * | + * ipc_ns | uts_ns + * \ | / + * + + + + * user_ns2 (3) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If both net_ns and pid_ns put their last active reference on + * themselves it will cascade to user_ns1 dropping its own active + * reference and dropping one active reference on user_ns2: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * + - + + * user_ns2 (2) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Assume the whole tree is dead but all namespaces are still active: + * + * net_ns pid_ns + * \ / + * - - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - - - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * Now assume the net_ns gets resurrected (.e.g., via the SIOCGSKNS ioctl()): + * + * net_ns pid_ns + * \ / + * + - + * user_ns1 (0) + * | + * ipc_ns | uts_ns + * \ | / + * - + - + * user_ns2 (0) + * | + * cgroup_ns | mnt_ns + * \ | / + * x x x + * init_user_ns (1) + * + * If net_ns had a zero reference count and we bumped it we also need to + * take another reference on its owning user namespace. Similarly, if + * pid_ns had a zero reference count it also needs to take another + * reference on its owning user namespace. So both net_ns and pid_ns + * will each have their own reference on the owning user namespace. + * + * If the owning user namespace user_ns1 had a zero reference count then + * it also needs to take another reference on its owning user namespace + * and so on. + */ +void __ns_ref_active_get(struct ns_common *ns) +{ + int prev; + + /* Initial namespaces are always active. */ + if (is_ns_init_id(ns)) + return; + + /* If we didn't resurrect the namespace we're done. */ + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + + /* + * We did resurrect it. Walk the ownership hierarchy upwards + * until we found an owning user namespace that is active. + */ + for (;;) { + ns = ns_owner(ns); + if (!ns) + return; + + VFS_WARN_ON_ONCE(is_ns_init_id(ns)); + prev = atomic_fetch_add(1, &ns->__ns_ref_active); + VFS_WARN_ON_ONCE(prev < 0); + if (likely(prev)) + return; + } +} diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 19aa64ab08c8..259c4b4f1eeb 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -26,6 +26,7 @@ #include <linux/syscalls.h> #include <linux/cgroup.h> #include <linux/perf_event.h> +#include <linux/nstree.h> static struct kmem_cache *nsproxy_cachep; @@ -59,6 +60,25 @@ static inline struct nsproxy *create_nsproxy(void) return nsproxy; } +static inline void nsproxy_free(struct nsproxy *ns) +{ + put_mnt_ns(ns->mnt_ns); + put_uts_ns(ns->uts_ns); + put_ipc_ns(ns->ipc_ns); + put_pid_ns(ns->pid_ns_for_children); + put_time_ns(ns->time_ns); + put_time_ns(ns->time_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); + put_net(ns->net_ns); + kmem_cache_free(nsproxy_cachep, ns); +} + +void deactivate_nsproxy(struct nsproxy *ns) +{ + nsproxy_ns_active_put(ns); + nsproxy_free(ns); +} + /* * Create new nsproxy and all of its the associated namespaces. * Return the newly created nsproxy. Do not attach this to the task, @@ -179,23 +199,11 @@ int copy_namespaces(u64 flags, struct task_struct *tsk) if ((flags & CLONE_VM) == 0) timens_on_fork(new_ns, tsk); + nsproxy_ns_active_get(new_ns); tsk->nsproxy = new_ns; return 0; } -void free_nsproxy(struct nsproxy *ns) -{ - put_mnt_ns(ns->mnt_ns); - put_uts_ns(ns->uts_ns); - put_ipc_ns(ns->ipc_ns); - put_pid_ns(ns->pid_ns_for_children); - put_time_ns(ns->time_ns); - put_time_ns(ns->time_ns_for_children); - put_cgroup_ns(ns->cgroup_ns); - put_net(ns->net_ns); - kmem_cache_free(nsproxy_cachep, ns); -} - /* * Called from unshare. Unshare all the namespaces part of nsproxy. * On success, returns the new nsproxy. @@ -232,6 +240,9 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) might_sleep(); + if (new) + nsproxy_ns_active_get(new); + task_lock(p); ns = p->nsproxy; p->nsproxy = new; @@ -241,11 +252,27 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) put_nsproxy(ns); } -void exit_task_namespaces(struct task_struct *p) +void exit_nsproxy_namespaces(struct task_struct *p) { switch_task_namespaces(p, NULL); } +void switch_cred_namespaces(const struct cred *old, const struct cred *new) +{ + ns_ref_active_get(new->user_ns); + ns_ref_active_put(old->user_ns); +} + +void get_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_get(tsk->real_cred->user_ns); +} + +void exit_cred_namespaces(struct task_struct *tsk) +{ + ns_ref_active_put(tsk->real_cred->user_ns); +} + int exec_task_namespaces(void) { struct task_struct *tsk = current; @@ -315,7 +342,7 @@ static void put_nsset(struct nsset *nsset) if (nsset->fs && (flags & CLONE_NEWNS) && (flags & ~CLONE_NEWNS)) free_fs_struct(nsset->fs); if (nsset->nsproxy) - free_nsproxy(nsset->nsproxy); + nsproxy_free(nsset->nsproxy); } static int prepare_nsset(unsigned flags, struct nsset *nsset) diff --git a/kernel/nstree.c b/kernel/nstree.c index b24a320a11a6..f36c59e6951d 100644 --- a/kernel/nstree.c +++ b/kernel/nstree.c @@ -1,140 +1,261 @@ // SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2025 Christian Brauner <brauner@kernel.org> */ #include <linux/nstree.h> #include <linux/proc_ns.h> +#include <linux/rculist.h> #include <linux/vfsdebug.h> +#include <linux/syscalls.h> +#include <linux/user_namespace.h> -/** - * struct ns_tree - Namespace tree - * @ns_tree: Rbtree of namespaces of a particular type - * @ns_list: Sequentially walkable list of all namespaces of this type - * @ns_tree_lock: Seqlock to protect the tree and list - * @type: type of namespaces in this tree - */ -struct ns_tree { - struct rb_root ns_tree; - struct list_head ns_list; - seqlock_t ns_tree_lock; - int type; +static __cacheline_aligned_in_smp DEFINE_SEQLOCK(ns_tree_lock); + +DEFINE_LOCK_GUARD_0(ns_tree_writer, + write_seqlock(&ns_tree_lock), + write_sequnlock(&ns_tree_lock)) + +DEFINE_LOCK_GUARD_0(ns_tree_locked_reader, + read_seqlock_excl(&ns_tree_lock), + read_sequnlock_excl(&ns_tree_lock)) + +static struct ns_tree_root ns_unified_root = { /* protected by ns_tree_lock */ + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ns_unified_root.ns_list_head), }; -struct ns_tree mnt_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(mnt_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(mnt_ns_tree.ns_tree_lock), - .type = CLONE_NEWNS, +struct ns_tree_root mnt_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(mnt_ns_tree.ns_list_head), }; -struct ns_tree net_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(net_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(net_ns_tree.ns_tree_lock), - .type = CLONE_NEWNET, +struct ns_tree_root net_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(net_ns_tree.ns_list_head), }; EXPORT_SYMBOL_GPL(net_ns_tree); -struct ns_tree uts_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(uts_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(uts_ns_tree.ns_tree_lock), - .type = CLONE_NEWUTS, +struct ns_tree_root uts_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(uts_ns_tree.ns_list_head), }; -struct ns_tree user_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(user_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(user_ns_tree.ns_tree_lock), - .type = CLONE_NEWUSER, +struct ns_tree_root user_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(user_ns_tree.ns_list_head), }; -struct ns_tree ipc_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(ipc_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(ipc_ns_tree.ns_tree_lock), - .type = CLONE_NEWIPC, +struct ns_tree_root ipc_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(ipc_ns_tree.ns_list_head), }; -struct ns_tree pid_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(pid_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(pid_ns_tree.ns_tree_lock), - .type = CLONE_NEWPID, +struct ns_tree_root pid_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(pid_ns_tree.ns_list_head), }; -struct ns_tree cgroup_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(cgroup_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(cgroup_ns_tree.ns_tree_lock), - .type = CLONE_NEWCGROUP, +struct ns_tree_root cgroup_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(cgroup_ns_tree.ns_list_head), }; -struct ns_tree time_ns_tree = { - .ns_tree = RB_ROOT, - .ns_list = LIST_HEAD_INIT(time_ns_tree.ns_list), - .ns_tree_lock = __SEQLOCK_UNLOCKED(time_ns_tree.ns_tree_lock), - .type = CLONE_NEWTIME, +struct ns_tree_root time_ns_tree = { + .ns_rb = RB_ROOT, + .ns_list_head = LIST_HEAD_INIT(time_ns_tree.ns_list_head), }; -DEFINE_COOKIE(namespace_cookie); +/** + * ns_tree_node_init - Initialize a namespace tree node + * @node: The node to initialize + * + * Initializes both the rbtree node and list entry. + */ +void ns_tree_node_init(struct ns_tree_node *node) +{ + RB_CLEAR_NODE(&node->ns_node); + INIT_LIST_HEAD(&node->ns_list_entry); +} + +/** + * ns_tree_root_init - Initialize a namespace tree root + * @root: The root to initialize + * + * Initializes both the rbtree root and list head. + */ +void ns_tree_root_init(struct ns_tree_root *root) +{ + root->ns_rb = RB_ROOT; + INIT_LIST_HEAD(&root->ns_list_head); +} + +/** + * ns_tree_node_empty - Check if a namespace tree node is empty + * @node: The node to check + * + * Returns true if the node is not in any tree. + */ +bool ns_tree_node_empty(const struct ns_tree_node *node) +{ + return RB_EMPTY_NODE(&node->ns_node); +} + +/** + * ns_tree_node_add - Add a node to a namespace tree + * @node: The node to add + * @root: The tree root to add to + * @cmp: Comparison function for rbtree insertion + * + * Adds the node to both the rbtree and the list, maintaining sorted order. + * The list is maintained in the same order as the rbtree to enable efficient + * iteration. + * + * Returns: NULL if insertion succeeded, existing node if duplicate found + */ +struct rb_node *ns_tree_node_add(struct ns_tree_node *node, + struct ns_tree_root *root, + int (*cmp)(struct rb_node *, const struct rb_node *)) +{ + struct rb_node *ret, *prev; + + /* Add to rbtree */ + ret = rb_find_add_rcu(&node->ns_node, &root->ns_rb, cmp); + + /* Add to list in sorted order */ + prev = rb_prev(&node->ns_node); + if (!prev) { + /* No previous node, add at head */ + list_add_rcu(&node->ns_list_entry, &root->ns_list_head); + } else { + /* Add after previous node */ + struct ns_tree_node *prev_node; + prev_node = rb_entry(prev, struct ns_tree_node, ns_node); + list_add_rcu(&node->ns_list_entry, &prev_node->ns_list_entry); + } + + return ret; +} + +/** + * ns_tree_node_del - Remove a node from a namespace tree + * @node: The node to remove + * @root: The tree root to remove from + * + * Removes the node from both the rbtree and the list atomically. + */ +void ns_tree_node_del(struct ns_tree_node *node, struct ns_tree_root *root) +{ + rb_erase(&node->ns_node, &root->ns_rb); + RB_CLEAR_NODE(&node->ns_node); + list_bidir_del_rcu(&node->ns_list_entry); +} static inline struct ns_common *node_to_ns(const struct rb_node *node) { if (!node) return NULL; - return rb_entry(node, struct ns_common, ns_tree_node); + return rb_entry(node, struct ns_common, ns_tree_node.ns_node); } -static inline int ns_cmp(struct rb_node *a, const struct rb_node *b) +static inline struct ns_common *node_to_ns_unified(const struct rb_node *node) { - struct ns_common *ns_a = node_to_ns(a); - struct ns_common *ns_b = node_to_ns(b); - u64 ns_id_a = ns_a->ns_id; - u64 ns_id_b = ns_b->ns_id; + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_unified_node.ns_node); +} - if (ns_id_a < ns_id_b) +static inline struct ns_common *node_to_ns_owner(const struct rb_node *node) +{ + if (!node) + return NULL; + return rb_entry(node, struct ns_common, ns_owner_node.ns_node); +} + +static int ns_id_cmp(u64 id_a, u64 id_b) +{ + if (id_a < id_b) return -1; - if (ns_id_a > ns_id_b) + if (id_a > id_b) return 1; return 0; } -void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree *ns_tree) +static int ns_cmp(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns(a)->ns_id, node_to_ns(b)->ns_id); +} + +static int ns_cmp_unified(struct rb_node *a, const struct rb_node *b) +{ + return ns_id_cmp(node_to_ns_unified(a)->ns_id, node_to_ns_unified(b)->ns_id); +} + +static int ns_cmp_owner(struct rb_node *a, const struct rb_node *b) { - struct rb_node *node, *prev; + return ns_id_cmp(node_to_ns_owner(a)->ns_id, node_to_ns_owner(b)->ns_id); +} + +void __ns_tree_add_raw(struct ns_common *ns, struct ns_tree_root *ns_tree) +{ + struct rb_node *node; + const struct proc_ns_operations *ops = ns->ops; VFS_WARN_ON_ONCE(!ns->ns_id); - write_seqlock(&ns_tree->ns_tree_lock); + guard(ns_tree_writer)(); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + /* Add to per-type tree and list */ + node = ns_tree_node_add(&ns->ns_tree_node, ns_tree, ns_cmp); - node = rb_find_add_rcu(&ns->ns_tree_node, &ns_tree->ns_tree, ns_cmp); - /* - * If there's no previous entry simply add it after the - * head and if there is add it after the previous entry. - */ - prev = rb_prev(&ns->ns_tree_node); - if (!prev) - list_add_rcu(&ns->ns_list_node, &ns_tree->ns_list); - else - list_add_rcu(&ns->ns_list_node, &node_to_ns(prev)->ns_list_node); + /* Add to unified tree and list */ + ns_tree_node_add(&ns->ns_unified_node, &ns_unified_root, ns_cmp_unified); + + /* Add to owner's tree if applicable */ + if (ops) { + struct user_namespace *user_ns; - write_sequnlock(&ns_tree->ns_tree_lock); + VFS_WARN_ON_ONCE(!ops->owner); + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + /* Insert into owner's tree and list */ + ns_tree_node_add(&ns->ns_owner_node, &owner->ns_owner_root, ns_cmp_owner); + } else { + /* Only the initial user namespace doesn't have an owner. */ + VFS_WARN_ON_ONCE(ns != to_ns_common(&init_user_ns)); + } + } VFS_WARN_ON_ONCE(node); } -void __ns_tree_remove(struct ns_common *ns, struct ns_tree *ns_tree) +void __ns_tree_remove(struct ns_common *ns, struct ns_tree_root *ns_tree) { - VFS_WARN_ON_ONCE(RB_EMPTY_NODE(&ns->ns_tree_node)); - VFS_WARN_ON_ONCE(list_empty(&ns->ns_list_node)); - VFS_WARN_ON_ONCE(ns->ns_type != ns_tree->type); + const struct proc_ns_operations *ops = ns->ops; + struct user_namespace *user_ns; + + VFS_WARN_ON_ONCE(ns_tree_node_empty(&ns->ns_tree_node)); + VFS_WARN_ON_ONCE(list_empty(&ns->ns_tree_node.ns_list_entry)); + + write_seqlock(&ns_tree_lock); + + /* Remove from per-type tree and list */ + ns_tree_node_del(&ns->ns_tree_node, ns_tree); + + /* Remove from unified tree and list */ + ns_tree_node_del(&ns->ns_unified_node, &ns_unified_root); - write_seqlock(&ns_tree->ns_tree_lock); - rb_erase(&ns->ns_tree_node, &ns_tree->ns_tree); - list_bidir_del_rcu(&ns->ns_list_node); - RB_CLEAR_NODE(&ns->ns_tree_node); - write_sequnlock(&ns_tree->ns_tree_lock); + /* Remove from owner's tree if applicable */ + if (ops) { + user_ns = ops->owner(ns); + if (user_ns) { + struct ns_common *owner = &user_ns->ns; + ns_tree_node_del(&ns->ns_owner_node, &owner->ns_owner_root); + } + } + + write_sequnlock(&ns_tree_lock); } EXPORT_SYMBOL_GPL(__ns_tree_remove); @@ -150,8 +271,19 @@ static int ns_find(const void *key, const struct rb_node *node) return 0; } +static int ns_find_unified(const void *key, const struct rb_node *node) +{ + const u64 ns_id = *(u64 *)key; + const struct ns_common *ns = node_to_ns_unified(node); -static struct ns_tree *ns_tree_from_type(int ns_type) + if (ns_id < ns->ns_id) + return -1; + if (ns_id > ns->ns_id) + return 1; + return 0; +} + +static struct ns_tree_root *ns_tree_from_type(int ns_type) { switch (ns_type) { case CLONE_NEWCGROUP: @@ -175,73 +307,507 @@ static struct ns_tree *ns_tree_from_type(int ns_type) return NULL; } -struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +static struct ns_common *__ns_unified_tree_lookup_rcu(u64 ns_id) { - struct ns_tree *ns_tree; struct rb_node *node; unsigned int seq; - RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); + do { + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_unified_root.ns_rb, ns_find_unified); + if (node) + break; + } while (read_seqretry(&ns_tree_lock, seq)); + + return node_to_ns_unified(node); +} + +static struct ns_common *__ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + struct ns_tree_root *ns_tree; + struct rb_node *node; + unsigned int seq; ns_tree = ns_tree_from_type(ns_type); if (!ns_tree) return NULL; do { - seq = read_seqbegin(&ns_tree->ns_tree_lock); - node = rb_find_rcu(&ns_id, &ns_tree->ns_tree, ns_find); + seq = read_seqbegin(&ns_tree_lock); + node = rb_find_rcu(&ns_id, &ns_tree->ns_rb, ns_find); if (node) break; - } while (read_seqretry(&ns_tree->ns_tree_lock, seq)); + } while (read_seqretry(&ns_tree_lock, seq)); - if (!node) - return NULL; + return node_to_ns(node); +} - VFS_WARN_ON_ONCE(node_to_ns(node)->ns_type != ns_type); +struct ns_common *ns_tree_lookup_rcu(u64 ns_id, int ns_type) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_lookup_rcu() usage"); - return node_to_ns(node); + if (ns_type) + return __ns_tree_lookup_rcu(ns_id, ns_type); + + return __ns_unified_tree_lookup_rcu(ns_id); } /** - * ns_tree_adjoined_rcu - find the next/previous namespace in the same + * __ns_tree_adjoined_rcu - find the next/previous namespace in the same * tree * @ns: namespace to start from + * @ns_tree: namespace tree to search in * @previous: if true find the previous namespace, otherwise the next * * Find the next or previous namespace in the same tree as @ns. If * there is no next/previous namespace, -ENOENT is returned. */ struct ns_common *__ns_tree_adjoined_rcu(struct ns_common *ns, - struct ns_tree *ns_tree, bool previous) + struct ns_tree_root *ns_tree, bool previous) { struct list_head *list; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious ns_tree_adjoined_rcu() usage"); if (previous) - list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_list_node)); + list = rcu_dereference(list_bidir_prev_rcu(&ns->ns_tree_node.ns_list_entry)); else - list = rcu_dereference(list_next_rcu(&ns->ns_list_node)); - if (list_is_head(list, &ns_tree->ns_list)) + list = rcu_dereference(list_next_rcu(&ns->ns_tree_node.ns_list_entry)); + if (list_is_head(list, &ns_tree->ns_list_head)) return ERR_PTR(-ENOENT); - VFS_WARN_ON_ONCE(list_entry_rcu(list, struct ns_common, ns_list_node)->ns_type != ns_tree->type); - - return list_entry_rcu(list, struct ns_common, ns_list_node); + return list_entry_rcu(list, struct ns_common, ns_tree_node.ns_list_entry); } /** - * ns_tree_gen_id - generate a new namespace id + * __ns_tree_gen_id - generate a new namespace id * @ns: namespace to generate id for + * @id: if non-zero, this is the initial namespace and this is a fixed id * * Generates a new namespace id and assigns it to the namespace. All * namespaces types share the same id space and thus can be compared * directly. IOW, when two ids of two namespace are equal, they are * identical. */ -u64 ns_tree_gen_id(struct ns_common *ns) +u64 __ns_tree_gen_id(struct ns_common *ns, u64 id) { - guard(preempt)(); - ns->ns_id = gen_cookie_next(&namespace_cookie); + static atomic64_t namespace_cookie = ATOMIC64_INIT(NS_LAST_INIT_ID + 1); + + if (id) + ns->ns_id = id; + else + ns->ns_id = atomic64_inc_return(&namespace_cookie); return ns->ns_id; } + +struct klistns { + u64 __user *uns_ids; + u32 nr_ns_ids; + u64 last_ns_id; + u64 user_ns_id; + u32 ns_type; + struct user_namespace *user_ns; + bool userns_capable; + struct ns_common *first_ns; +}; + +static void __free_klistns_free(const struct klistns *kls) +{ + if (kls->user_ns_id != LISTNS_CURRENT_USER) + put_user_ns(kls->user_ns); + if (kls->first_ns && kls->first_ns->ops) + kls->first_ns->ops->put(kls->first_ns); +} + +#define NS_ALL (PID_NS | USER_NS | MNT_NS | UTS_NS | IPC_NS | NET_NS | CGROUP_NS | TIME_NS) + +static int copy_ns_id_req(const struct ns_id_req __user *req, + struct ns_id_req *kreq) +{ + int ret; + size_t usize; + + BUILD_BUG_ON(sizeof(struct ns_id_req) != NS_ID_REQ_SIZE_VER0); + + ret = get_user(usize, &req->size); + if (ret) + return -EFAULT; + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < NS_ID_REQ_SIZE_VER0)) + return -EINVAL; + memset(kreq, 0, sizeof(*kreq)); + ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize); + if (ret) + return ret; + if (kreq->spare != 0) + return -EINVAL; + if (kreq->ns_type & ~NS_ALL) + return -EOPNOTSUPP; + return 0; +} + +static inline int prepare_klistns(struct klistns *kls, struct ns_id_req *kreq, + u64 __user *ns_ids, size_t nr_ns_ids) +{ + kls->last_ns_id = kreq->ns_id; + kls->user_ns_id = kreq->user_ns_id; + kls->nr_ns_ids = nr_ns_ids; + kls->ns_type = kreq->ns_type; + kls->uns_ids = ns_ids; + return 0; +} + +/* + * Lookup a namespace owned by owner with id >= ns_id. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_owner_at(u64 ns_id, struct ns_common *owner) +{ + struct ns_common *ret = NULL; + struct rb_node *node; + + VFS_WARN_ON_ONCE(owner->ns_type != CLONE_NEWUSER); + + guard(ns_tree_locked_reader)(); + + node = owner->ns_owner_root.ns_rb.rb_node; + while (node) { + struct ns_common *ns; + + ns = node_to_ns_owner(node); + if (ns_id <= ns->ns_id) { + ret = ns; + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static struct ns_common *lookup_ns_id(u64 mnt_ns_id, int ns_type) +{ + struct ns_common *ns; + + guard(rcu)(); + ns = ns_tree_lookup_rcu(mnt_ns_id, ns_type); + if (!ns) + return NULL; + + if (!ns_get_unless_inactive(ns)) + return NULL; + + return ns; +} + +static inline bool __must_check ns_requested(const struct klistns *kls, + const struct ns_common *ns) +{ + return !kls->ns_type || (kls->ns_type & ns->ns_type); +} + +static inline bool __must_check may_list_ns(const struct klistns *kls, + struct ns_common *ns) +{ + if (kls->user_ns) { + if (kls->userns_capable) + return true; + } else { + struct ns_common *owner; + struct user_namespace *user_ns; + + owner = ns_owner(ns); + if (owner) + user_ns = to_user_ns(owner); + else + user_ns = &init_user_ns; + if (ns_capable_noaudit(user_ns, CAP_SYS_ADMIN)) + return true; + } + + if (is_current_namespace(ns)) + return true; + + if (ns->ns_type != CLONE_NEWUSER) + return false; + + if (ns_capable_noaudit(to_user_ns(ns), CAP_SYS_ADMIN)) + return true; + + return false; +} + +static inline void ns_put(struct ns_common *ns) +{ + if (ns && ns->ops) + ns->ops->put(ns); +} + +DEFINE_FREE(ns_put, struct ns_common *, if (!IS_ERR_OR_NULL(_T)) ns_put(_T)) + +static inline struct ns_common *__must_check legitimize_ns(const struct klistns *kls, + struct ns_common *candidate) +{ + struct ns_common *ns __free(ns_put) = NULL; + + if (!ns_requested(kls, candidate)) + return NULL; + + ns = ns_get_unless_inactive(candidate); + if (!ns) + return NULL; + + if (!may_list_ns(kls, ns)) + return NULL; + + return no_free_ptr(ns); +} + +static ssize_t do_listns_userns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns = NULL, *first_ns = NULL, *prev = NULL; + const struct list_head *head; + ssize_t ret; + + VFS_WARN_ON_ONCE(!kls->user_ns_id); + + if (kls->user_ns_id == LISTNS_CURRENT_USER) + ns = to_ns_common(current_user_ns()); + else if (kls->user_ns_id) + ns = lookup_ns_id(kls->user_ns_id, CLONE_NEWUSER); + if (!ns) + return -EINVAL; + kls->user_ns = to_user_ns(ns); + + /* + * Use the rbtree to find the first namespace we care about and + * then use it's list entry to iterate from there. + */ + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_owner_at(kls->last_ns_id + 1, ns); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + head = &to_ns_common(kls->user_ns)->ns_owner_root.ns_list_head; + kls->userns_capable = ns_capable_noaudit(kls->user_ns, CAP_SYS_ADMIN); + + rcu_read_lock(); + + if (!first_ns) + first_ns = list_entry_rcu(head->next, typeof(*first_ns), ns_owner_node.ns_list_entry); + + ns = first_ns; + list_for_each_entry_from_rcu(ns, head, ns_owner_node.ns_list_entry) { + struct ns_common *valid; + + if (!nr_ns_ids) + break; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +/* + * Lookup a namespace with id >= ns_id in either the unified tree or a type-specific tree. + * Returns the namespace with the smallest id that is >= ns_id. + */ +static struct ns_common *lookup_ns_id_at(u64 ns_id, int ns_type) +{ + struct ns_common *ret = NULL; + struct ns_tree_root *ns_tree = NULL; + struct rb_node *node; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return NULL; + } + + guard(ns_tree_locked_reader)(); + + if (ns_tree) + node = ns_tree->ns_rb.rb_node; + else + node = ns_unified_root.ns_rb.rb_node; + + while (node) { + struct ns_common *ns; + + if (ns_type) + ns = node_to_ns(node); + else + ns = node_to_ns_unified(node); + + if (ns_id <= ns->ns_id) { + if (ns_type) + ret = node_to_ns(node); + else + ret = node_to_ns_unified(node); + if (ns_id == ns->ns_id) + break; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + + if (ret) + ret = ns_get_unless_inactive(ret); + return ret; +} + +static inline struct ns_common *first_ns_common(const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(head->next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(head->next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline struct ns_common *next_ns_common(struct ns_common *ns, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return list_entry_rcu(ns->ns_tree_node.ns_list_entry.next, struct ns_common, ns_tree_node.ns_list_entry); + return list_entry_rcu(ns->ns_unified_node.ns_list_entry.next, struct ns_common, ns_unified_node.ns_list_entry); +} + +static inline bool ns_common_is_head(struct ns_common *ns, + const struct list_head *head, + struct ns_tree_root *ns_tree) +{ + if (ns_tree) + return &ns->ns_tree_node.ns_list_entry == head; + return &ns->ns_unified_node.ns_list_entry == head; +} + +static ssize_t do_listns(struct klistns *kls) +{ + u64 __user *ns_ids = kls->uns_ids; + size_t nr_ns_ids = kls->nr_ns_ids; + struct ns_common *ns, *first_ns = NULL, *prev = NULL; + struct ns_tree_root *ns_tree = NULL; + const struct list_head *head; + u32 ns_type; + ssize_t ret; + + if (hweight32(kls->ns_type) == 1) + ns_type = kls->ns_type; + else + ns_type = 0; + + if (ns_type) { + ns_tree = ns_tree_from_type(ns_type); + if (!ns_tree) + return -EINVAL; + } + + if (kls->last_ns_id) { + kls->first_ns = lookup_ns_id_at(kls->last_ns_id + 1, ns_type); + if (!kls->first_ns) + return -ENOENT; + first_ns = kls->first_ns; + } + + ret = 0; + if (ns_tree) + head = &ns_tree->ns_list_head; + else + head = &ns_unified_root.ns_list_head; + + rcu_read_lock(); + + if (!first_ns) + first_ns = first_ns_common(head, ns_tree); + + for (ns = first_ns; !ns_common_is_head(ns, head, ns_tree) && nr_ns_ids; + ns = next_ns_common(ns, ns_tree)) { + struct ns_common *valid; + + valid = legitimize_ns(kls, ns); + if (!valid) + continue; + + rcu_read_unlock(); + + ns_put(prev); + prev = valid; + + if (put_user(valid->ns_id, ns_ids + ret)) { + ns_put(prev); + return -EFAULT; + } + + nr_ns_ids--; + ret++; + + rcu_read_lock(); + } + + rcu_read_unlock(); + ns_put(prev); + return ret; +} + +SYSCALL_DEFINE4(listns, const struct ns_id_req __user *, req, + u64 __user *, ns_ids, size_t, nr_ns_ids, unsigned int, flags) +{ + struct klistns klns __free(klistns_free) = {}; + const size_t maxcount = 1000000; + struct ns_id_req kreq; + ssize_t ret; + + if (flags) + return -EINVAL; + + if (unlikely(nr_ns_ids > maxcount)) + return -EOVERFLOW; + + if (!access_ok(ns_ids, nr_ns_ids * sizeof(*ns_ids))) + return -EFAULT; + + ret = copy_ns_id_req(req, &kreq); + if (ret) + return ret; + + ret = prepare_klistns(&klns, &kreq, ns_ids, nr_ns_ids); + if (ret) + return ret; + + if (kreq.user_ns_id) + return do_listns_userns(&klns); + + return do_listns(&klns); +} diff --git a/kernel/panic.c b/kernel/panic.c index 24cc3eec1805..b2f2470af7e5 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -873,13 +873,15 @@ void __warn(const char *file, int line, void *caller, unsigned taint, disable_trace_on_warning(); - if (file) - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", - raw_smp_processor_id(), current->pid, file, line, - caller); - else - pr_warn("WARNING: CPU: %d PID: %d at %pS\n", - raw_smp_processor_id(), current->pid, caller); + if (file) { + pr_warn("WARNING: %s:%d at %pS, CPU#%d: %s/%d\n", + file, line, caller, + raw_smp_processor_id(), current->comm, current->pid); + } else { + pr_warn("WARNING: at %pS, CPU#%d: %s/%d\n", + caller, + raw_smp_processor_id(), current->comm, current->pid); + } #pragma GCC diagnostic push #ifndef __clang__ diff --git a/kernel/pid.c b/kernel/pid.c index 4fffec767a63..a31771bc89c1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -71,21 +71,16 @@ static int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .ns.__ns_ref = REFCOUNT_INIT(2), + .ns = NS_COMMON_INIT(init_pid_ns), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_pid_ns), -#ifdef CONFIG_PID_NS - .ns.ops = &pidns_operations, -#endif .pid_max = PID_MAX_DEFAULT, #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif - .ns.ns_type = ns_common_type(&init_pid_ns), }; EXPORT_SYMBOL_GPL(init_pid_ns); @@ -117,9 +112,13 @@ static void delayed_put_pid(struct rcu_head *rhp) void free_pid(struct pid *pid) { int i; + struct pid_namespace *active_ns; lockdep_assert_not_held(&tasklist_lock); + active_ns = pid->numbers[pid->level].ns; + ns_ref_active_put(active_ns); + spin_lock(&pidmap_lock); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; @@ -283,6 +282,7 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, } spin_unlock(&pidmap_lock); idr_preload_end(); + ns_ref_active_get(ns); return pid; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 650be58d8d18..e48f5de41361 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -184,7 +184,7 @@ struct pid_namespace *copy_pid_ns(u64 flags, void put_pid_ns(struct pid_namespace *ns) { - if (ns && ns != &init_pid_ns && ns_ref_put(ns)) + if (ns && ns_ref_put(ns)) schedule_work(&ns->work); } EXPORT_SYMBOL_GPL(put_pid_ns); diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 53166ef86ba4..26e45f86b955 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -821,8 +821,7 @@ int hibernate(void) goto Restore; ksys_sync_helper(); - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -928,8 +927,7 @@ int hibernate_quiet_exec(int (*func)(void *data), void *data) if (error) goto restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); error = freeze_processes(); if (error) @@ -1079,8 +1077,7 @@ static int software_resume(void) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); pm_pr_dbg("Preparing processes for hibernation restore.\n"); error = freeze_processes(); diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b4ca17c2fecf..3d4ebedad69f 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -375,8 +375,7 @@ static int suspend_prepare(suspend_state_t state) if (error) goto Restore; - if (filesystem_freeze_enabled) - filesystems_freeze(); + filesystems_freeze(filesystem_freeze_enabled); trace_suspend_resume(TPS("freeze_processes"), 0, true); error = suspend_freeze_processes(); trace_suspend_resume(TPS("freeze_processes"), 0, false); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 0beff7eeaaba..70ae21f7370d 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -635,7 +635,7 @@ struct cmp_data { }; /* Indicates the image size after compression */ -static atomic_t compressed_size = ATOMIC_INIT(0); +static atomic64_t compressed_size = ATOMIC_INIT(0); /* * Compression function that runs in its own thread. @@ -664,7 +664,7 @@ static int compress_threadfn(void *data) d->ret = crypto_acomp_compress(d->cr); d->cmp_len = d->cr->dlen; - atomic_set(&compressed_size, atomic_read(&compressed_size) + d->cmp_len); + atomic64_add(d->cmp_len, &compressed_size); atomic_set_release(&d->stop, 1); wake_up(&d->done); } @@ -689,14 +689,14 @@ static int save_compressed_image(struct swap_map_handle *handle, ktime_t start; ktime_t stop; size_t off; - unsigned thr, run_threads, nr_threads; + unsigned int thr, run_threads, nr_threads; unsigned char *page = NULL; struct cmp_data *data = NULL; struct crc_data *crc = NULL; hib_init_batch(&hb); - atomic_set(&compressed_size, 0); + atomic64_set(&compressed_size, 0); /* * We'll limit the number of threads for compression to limit memory @@ -877,11 +877,14 @@ out_finish: stop = ktime_get(); if (!ret) ret = err2; - if (!ret) + if (!ret) { + swsusp_show_speed(start, stop, nr_to_write, "Wrote"); + pr_info("Image size after compression: %lld kbytes\n", + (atomic64_read(&compressed_size) / 1024)); pr_info("Image saving done\n"); - swsusp_show_speed(start, stop, nr_to_write, "Wrote"); - pr_info("Image size after compression: %d kbytes\n", - (atomic_read(&compressed_size) / 1024)); + } else { + pr_err("Image saving failed: %d\n", ret); + } out_clean: hib_finish_batch(&hb); @@ -899,7 +902,8 @@ out_clean: } vfree(data); } - if (page) free_page((unsigned long)page); + if (page) + free_page((unsigned long)page); return ret; } diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index c1ebfd51768b..585cade21010 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -70,12 +70,10 @@ void rcu_qs(void) */ void rcu_sched_clock_irq(int user) { - if (user) { + if (user) rcu_qs(); - } else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + else if (rcu_ctrlblk.donetail != rcu_ctrlblk.curtail) + set_need_resched_current(); } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8293bae1dec1..85b82a7007b9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2696,10 +2696,8 @@ void rcu_sched_clock_irq(int user) /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { /* Idle and userspace execution already are quiescent states. */ - if (!rcu_is_cpu_rrupt_from_idle() && !user) { - set_tsk_need_resched(current); - set_preempt_need_resched(); - } + if (!rcu_is_cpu_rrupt_from_idle() && !user) + set_need_resched_current(); __this_cpu_write(rcu_data.rcu_urgent_qs, false); } rcu_flavor_sched_clock_irq(user); @@ -2824,7 +2822,6 @@ static void strict_work_handler(struct work_struct *work) /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { - unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; @@ -2837,8 +2834,8 @@ static __latent_entropy void rcu_core(void) if (IS_ENABLED(CONFIG_PREEMPT_COUNT) && (!(preempt_count() & PREEMPT_MASK))) { rcu_preempt_deferred_qs(current); } else if (rcu_preempt_need_deferred_qs(current)) { - set_tsk_need_resched(current); - set_preempt_need_resched(); + guard(irqsave)(); + set_need_resched_current(); } /* Update RCU state based on any recent quiescent states. */ @@ -2847,10 +2844,9 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? */ if (!rcu_gp_in_progress() && rcu_segcblist_is_enabled(&rdp->cblist) && !rcu_rdp_is_offloaded(rdp)) { - local_irq_save(flags); + guard(irqsave)(); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6058a734090c..96c49c56fc14 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -729,8 +729,7 @@ static void rcu_exp_need_qs(void) __this_cpu_write(rcu_data.cpu_no_qs.b.exp, true); /* Store .exp before .rcu_urgent_qs. */ smp_store_release(this_cpu_ptr(&rcu_data.rcu_urgent_qs), true); - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d85763336b3c..dbe2d02be824 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -753,8 +753,7 @@ static void rcu_read_unlock_special(struct task_struct *t) // Also if no expediting and no possible deboosting, // slow is OK. Plus nohz_full CPUs eventually get // tick enabled. - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && needs_exp && rdp->defer_qs_iw_pending != DEFER_QS_PENDING && cpu_online(rdp->cpu)) { @@ -813,10 +812,8 @@ static void rcu_flavor_sched_clock_irq(int user) if (rcu_preempt_depth() > 0 || (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) { /* No QS, force context switch if deferred. */ - if (rcu_preempt_need_deferred_qs(t)) { - set_tsk_need_resched(t); - set_preempt_need_resched(); - } + if (rcu_preempt_need_deferred_qs(t)) + set_need_resched_current(); } else if (rcu_preempt_need_deferred_qs(t)) { rcu_preempt_deferred_qs(t); /* Report deferred QS. */ return; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index d16afeb11506..b67532cb8770 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -763,8 +763,7 @@ static void print_cpu_stall(unsigned long gp_seq, unsigned long gps) * progress and it could be we're stuck in kernel space without context * switches for an entirely unreasonable amount of time. */ - set_tsk_need_resched(current); - set_preempt_need_resched(); + set_need_resched_current(); } static bool csd_lock_suppress_rcu_stall; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f1ebf67b48e2..0c4ff93eeb78 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -121,6 +121,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DEFINE_PER_CPU(struct rnd_state, sched_rnd_state); #ifdef CONFIG_SCHED_PROXY_EXEC DEFINE_STATIC_KEY_TRUE(__sched_proxy_exec); @@ -583,8 +584,8 @@ EXPORT_SYMBOL(__trace_set_current_state); * * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: * - * is set by activate_task() and cleared by deactivate_task(), under - * rq->lock. Non-zero indicates the task is runnable, the special + * is set by activate_task() and cleared by deactivate_task()/block_task(), + * under rq->lock. Non-zero indicates the task is runnable, the special * ON_RQ_MIGRATING state is used for migration without holding both * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). * @@ -2089,6 +2090,7 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) */ uclamp_rq_inc(rq, p, flags); + rq->queue_mask |= p->sched_class->queue_mask; p->sched_class->enqueue_task(rq, p, flags); psi_enqueue(p, flags); @@ -2121,6 +2123,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) * and mark the task ->sched_delayed. */ uclamp_rq_dec(rq, p); + rq->queue_mask |= p->sched_class->queue_mask; return p->sched_class->dequeue_task(rq, p, flags); } @@ -2169,37 +2172,6 @@ inline int task_curr(const struct task_struct *p) return cpu_curr(task_cpu(p)) == p; } -/* - * ->switching_to() is called with the pi_lock and rq_lock held and must not - * mess with locking. - */ -void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class) -{ - if (prev_class != p->sched_class && p->sched_class->switching_to) - p->sched_class->switching_to(rq, p); -} - -/* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. - * - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ -void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio) -{ - if (prev_class != p->sched_class) { - if (prev_class->switched_from) - prev_class->switched_from(rq, p); - - p->sched_class->switched_to(rq, p); - } else if (oldprio != p->prio || dl_task(p)) - p->sched_class->prio_changed(rq, p, oldprio); -} - void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { struct task_struct *donor = rq->donor; @@ -2362,7 +2334,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { @@ -2377,10 +2349,8 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) if (p->cpus_ptr != &p->cpus_mask) return; - /* - * Violates locking rules! See comment in __do_set_cpus_allowed(). - */ - __do_set_cpus_allowed(p, &ac); + scoped_guard (task_rq_lock, p) + do_set_cpus_allowed(p, &ac); } void ___migrate_enable(void) @@ -2613,7 +2583,8 @@ static int migration_cpu_stop(void *data) */ WARN_ON_ONCE(!pending->stop_pending); preempt_disable(); - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); preempt_enable(); @@ -2622,7 +2593,8 @@ static int migration_cpu_stop(void *data) out: if (pending) pending->stop_pending = false; - task_rq_unlock(rq, p, &rf); + rq_unlock(rq, &rf); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); if (complete) complete_all(&pending->done); @@ -2693,56 +2665,19 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx } static void -__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) +do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { - struct rq *rq = task_rq(p); - bool queued, running; - - /* - * This here violates the locking rules for affinity, since we're only - * supposed to change these variables while holding both rq->lock and - * p->pi_lock. - * - * HOWEVER, it magically works, because ttwu() is the only code that - * accesses these variables under p->pi_lock and only does so after - * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() - * before finish_task(). - * - * XXX do further audits, this smells like something putrid. - */ - if (ctx->flags & SCA_MIGRATE_DISABLE) - WARN_ON_ONCE(!p->on_cpu); - else - lockdep_assert_held(&p->pi_lock); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) { - /* - * Because __kthread_bind() calls this on blocked tasks without - * holding rq->lock. - */ - lockdep_assert_rq_held(rq); - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); + scoped_guard (sched_change, p, DEQUEUE_SAVE) { + p->sched_class->set_cpus_allowed(p, ctx); + mm_set_cpus_allowed(p->mm, ctx->new_mask); } - if (running) - put_prev_task(rq, p); - - p->sched_class->set_cpus_allowed(p, ctx); - mm_set_cpus_allowed(p->mm, ctx->new_mask); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); } /* * Used for kthread_bind() and select_fallback_rq(), in both cases the user * affinity (if any) should be destroyed too. */ -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_force(struct task_struct *p, const struct cpumask *new_mask) { struct affinity_context ac = { .new_mask = new_mask, @@ -2754,7 +2689,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) struct rcu_head rcu; }; - __do_set_cpus_allowed(p, &ac); + scoped_guard (__task_rq_lock, p) + do_set_cpus_allowed(p, &ac); /* * Because this is called with p->pi_lock held, it is not possible @@ -2792,7 +2728,7 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, * Use pi_lock to protect content of user_cpus_ptr * * Though unlikely, user_cpus_ptr can be reset to NULL by a concurrent - * do_set_cpus_allowed(). + * set_cpus_allowed_force(). */ raw_spin_lock_irqsave(&src->pi_lock, flags); if (src->user_cpus_ptr) { @@ -3064,8 +3000,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, unsigned int dest_cpu; int ret = 0; - update_rq_clock(rq); - if (kthread || is_migration_disabled(p)) { /* * Kernel threads are allowed on online && !active CPUs, @@ -3120,7 +3054,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, goto out; } - __do_set_cpus_allowed(p, ctx); + do_set_cpus_allowed(p, ctx); return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); @@ -3529,13 +3463,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: - /* - * XXX When called from select_task_rq() we only - * hold p->pi_lock and again violate locking order. - * - * More yuck to audit. - */ - do_set_cpus_allowed(p, task_cpu_fallback_mask(p)); + set_cpus_allowed_force(p, task_cpu_fallback_mask(p)); state = fail; break; case fail: @@ -3777,7 +3705,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) ttwu_do_wakeup(p); ret = 1; } - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); return ret; } @@ -4231,7 +4159,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * __schedule(). See the comment for smp_mb__after_spinlock(). * * Form a control-dep-acquire with p->on_rq == 0 above, to ensure - * schedule()'s deactivate_task() has 'happened' and p will no longer + * schedule()'s block_task() has 'happened' and p will no longer * care about it's own p->state. See the comment in __schedule(). */ smp_acquire__after_ctrl_dep(); @@ -4370,7 +4298,7 @@ int task_call_func(struct task_struct *p, task_call_f func, void *arg) ret = func(p, arg); if (rq) - rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; @@ -5692,7 +5620,7 @@ static void sched_tick_remote(struct work_struct *work) * reasonable amount of time. */ u64 delta = rq_clock_task(rq) - curr->se.exec_start; - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 30); } curr->sched_class->task_tick(rq, curr, 0); @@ -5916,19 +5844,6 @@ static void prev_balance(struct rq *rq, struct task_struct *prev, const struct sched_class *start_class = prev->sched_class; const struct sched_class *class; -#ifdef CONFIG_SCHED_CLASS_EXT - /* - * SCX requires a balance() call before every pick_task() including when - * waking up from SCHED_IDLE. If @start_class is below SCX, start from - * SCX instead. Also, set a flag to detect missing balance() call. - */ - if (scx_enabled()) { - rq->scx.flags |= SCX_RQ_BAL_PENDING; - if (sched_class_above(&ext_sched_class, start_class)) - start_class = &ext_sched_class; - } -#endif - /* * We must do the balancing pass before put_prev_task(), such * that when we release the rq->lock the task is in the same @@ -5972,7 +5887,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* Assume the next prioritized class is idle_sched_class */ if (!p) { - p = pick_task_idle(rq); + p = pick_task_idle(rq, rf); put_prev_set_next_task(rq, prev, p); } @@ -5984,11 +5899,15 @@ restart: for_each_active_class(class) { if (class->pick_next_task) { - p = class->pick_next_task(rq, prev); + p = class->pick_next_task(rq, prev, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) return p; } else { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); + if (unlikely(p == RETRY_TASK)) + goto restart; if (p) { put_prev_set_next_task(rq, prev, p); return p; @@ -6018,7 +5937,11 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b) return a->core_cookie == b->core_cookie; } -static inline struct task_struct *pick_task(struct rq *rq) +/* + * Careful; this can return RETRY_TASK, it does not include the retry-loop + * itself due to the whole SMT pick retry thing below. + */ +static inline struct task_struct *pick_task(struct rq *rq, struct rq_flags *rf) { const struct sched_class *class; struct task_struct *p; @@ -6026,7 +5949,7 @@ static inline struct task_struct *pick_task(struct rq *rq) rq->dl_server = NULL; for_each_active_class(class) { - p = class->pick_task(rq); + p = class->pick_task(rq, rf); if (p) return p; } @@ -6041,7 +5964,7 @@ static void queue_core_balance(struct rq *rq); static struct task_struct * pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - struct task_struct *next, *p, *max = NULL; + struct task_struct *next, *p, *max; const struct cpumask *smt_mask; bool fi_before = false; bool core_clock_updated = (rq == rq->core); @@ -6126,7 +6049,10 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * and there are no cookied tasks running on siblings. */ if (!need_sync) { - next = pick_task(rq); +restart_single: + next = pick_task(rq, rf); + if (unlikely(next == RETRY_TASK)) + goto restart_single; if (!next->core_cookie) { rq->core_pick = NULL; rq->core_dl_server = NULL; @@ -6146,6 +6072,8 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * Tie-break prio towards the current CPU */ +restart_multi: + max = NULL; for_each_cpu_wrap(i, smt_mask, cpu) { rq_i = cpu_rq(i); @@ -6157,7 +6085,11 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (i != cpu && (rq_i != rq->core || !core_clock_updated)) update_rq_clock(rq_i); - rq_i->core_pick = p = pick_task(rq_i); + p = pick_task(rq_i, rf); + if (unlikely(p == RETRY_TASK)) + goto restart_multi; + + rq_i->core_pick = p; rq_i->core_dl_server = rq_i->dl_server; if (!max || prio_less(max, p, fi_before)) @@ -6179,7 +6111,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) if (cookie) p = sched_core_find(rq_i, cookie); if (!p) - p = idle_sched_class.pick_task(rq_i); + p = idle_sched_class.pick_task(rq_i, rf); } rq_i->core_pick = p; @@ -6812,6 +6744,7 @@ static void __sched notrace __schedule(int sched_mode) local_irq_disable(); rcu_note_context_switch(preempt); + migrate_disable_switch(rq, prev); /* * Make sure that signal_pending_state()->signal_pending() below @@ -6918,7 +6851,6 @@ keep_resched: */ ++*switch_count; - migrate_disable_switch(rq, prev); psi_account_irqtime(rq, prev, next); psi_sched_switch(prev, next, !task_on_rq_queued(prev) || prev->se.sched_delayed); @@ -7326,7 +7258,7 @@ void rt_mutex_post_schedule(void) */ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int prio, oldprio, queued, running, queue_flag = + int prio, oldprio, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class, *next_class; struct rq_flags rf; @@ -7388,64 +7320,51 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) prev_class = p->sched_class; next_class = __setscheduler_class(p->policy, prio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flag); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flag |= DEQUEUE_CLASS; - /* - * Boosting condition are: - * 1. -rt task is running and holds mutex A - * --> -dl task blocks on mutex A - * - * 2. -dl task is running and holds mutex A - * --> -dl task blocks on mutex A and could preempt the - * running task - */ - if (dl_prio(prio)) { - if (!dl_prio(p->normal_prio) || - (pi_task && dl_prio(pi_task->prio) && - dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.pi_se = pi_task->dl.pi_se; - queue_flag |= ENQUEUE_REPLENISH; + scoped_guard (sched_change, p, queue_flag) { + /* + * Boosting condition are: + * 1. -rt task is running and holds mutex A + * --> -dl task blocks on mutex A + * + * 2. -dl task is running and holds mutex A + * --> -dl task blocks on mutex A and could preempt the + * running task + */ + if (dl_prio(prio)) { + if (!dl_prio(p->normal_prio) || + (pi_task && dl_prio(pi_task->prio) && + dl_entity_preempt(&pi_task->dl, &p->dl))) { + p->dl.pi_se = pi_task->dl.pi_se; + scope->flags |= ENQUEUE_REPLENISH; + } else { + p->dl.pi_se = &p->dl; + } + } else if (rt_prio(prio)) { + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (oldprio < prio) + scope->flags |= ENQUEUE_HEAD; } else { - p->dl.pi_se = &p->dl; + if (dl_prio(oldprio)) + p->dl.pi_se = &p->dl; + if (rt_prio(oldprio)) + p->rt.timeout = 0; } - } else if (rt_prio(prio)) { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (oldprio < prio) - queue_flag |= ENQUEUE_HEAD; - } else { - if (dl_prio(oldprio)) - p->dl.pi_se = &p->dl; - if (rt_prio(oldprio)) - p->rt.timeout = 0; - } - p->sched_class = next_class; - p->prio = prio; - - check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); + p->sched_class = next_class; + p->prio = prio; + } out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); rq_unpin_lock(rq, &rf); __balance_callbacks(rq); - raw_spin_rq_unlock(rq); + rq_repin_lock(rq, &rf); + __task_rq_unlock(rq, p, &rf); preempt_enable(); } @@ -8084,26 +8003,9 @@ int migrate_task_to(struct task_struct *p, int target_cpu) */ void sched_setnuma(struct task_struct *p, int nid) { - bool queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(p, &rf); - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); - if (running) - put_prev_task(rq, p); - - p->numa_preferred_nid = nid; - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - task_rq_unlock(rq, p, &rf); + guard(task_rq_lock)(p); + scoped_guard (sched_change, p, DEQUEUE_SAVE) + p->numa_preferred_nid = nid; } #endif /* CONFIG_NUMA_BALANCING */ @@ -8141,18 +8043,15 @@ static int __balance_push_cpu_stop(void *arg) struct rq_flags rf; int cpu; - raw_spin_lock_irq(&p->pi_lock); - rq_lock(rq, &rf); - - update_rq_clock(rq); - - if (task_rq(p) == rq && task_on_rq_queued(p)) { + scoped_guard (raw_spinlock_irq, &p->pi_lock) { cpu = select_fallback_rq(rq->cpu, p); - rq = __migrate_task(rq, &rf, p, cpu); - } - rq_unlock(rq, &rf); - raw_spin_unlock_irq(&p->pi_lock); + rq_lock(rq, &rf); + update_rq_clock(rq); + if (task_rq(p) == rq && task_on_rq_queued(p)) + rq = __migrate_task(rq, &rf, p, cpu); + rq_unlock(rq, &rf); + } put_task_struct(p); @@ -8591,6 +8490,8 @@ void __init sched_init_smp(void) { sched_init_numa(NUMA_NO_NODE); + prandom_init_once(&sched_rnd_state); + /* * There's no userspace yet to cause hotplug operations; hence all the * CPU masks are stable and all blatant races in the below code cannot @@ -9207,38 +9108,23 @@ static void sched_change_group(struct task_struct *tsk) */ void sched_move_task(struct task_struct *tsk, bool for_autogroup) { - int queued, running, queue_flags = - DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + bool resched = false; struct rq *rq; CLASS(task_rq_lock, rq_guard)(tsk); rq = rq_guard.rq; - update_rq_clock(rq); - - running = task_current_donor(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, queue_flags); - if (running) - put_prev_task(rq, tsk); - - sched_change_group(tsk); - if (!for_autogroup) - scx_cgroup_move_task(tsk); + scoped_guard (sched_change, tsk, queue_flags) { + sched_change_group(tsk); + if (!for_autogroup) + scx_cgroup_move_task(tsk); + if (scope->running) + resched = true; + } - if (queued) - enqueue_task(rq, tsk, queue_flags); - if (running) { - set_next_task(rq, tsk); - /* - * After changing group, the running task may have joined a - * throttled one but it's still the running task. Trigger a - * resched to make sure that task can still run. - */ + if (resched) resched_curr(rq); - } } static struct cgroup_subsys_state * @@ -9606,7 +9492,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; - cfs_rq->runtime_remaining = 0; + cfs_rq->runtime_remaining = 1; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); @@ -10894,37 +10780,75 @@ void sched_mm_cid_fork(struct task_struct *t) } #endif /* CONFIG_SCHED_MM_CID */ -#ifdef CONFIG_SCHED_CLASS_EXT -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx) +static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx); + +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags) { + struct sched_change_ctx *ctx = this_cpu_ptr(&sched_change_ctx); struct rq *rq = task_rq(p); + /* + * Must exclusively use matched flags since this is both dequeue and + * enqueue. + */ + WARN_ON_ONCE(flags & 0xFFFF0000); + lockdep_assert_rq_held(rq); - *ctx = (struct sched_enq_and_set_ctx){ + if (!(flags & DEQUEUE_NOCLOCK)) { + update_rq_clock(rq); + flags |= DEQUEUE_NOCLOCK; + } + + if (flags & DEQUEUE_CLASS) { + if (p->sched_class->switching_from) + p->sched_class->switching_from(rq, p); + } + + *ctx = (struct sched_change_ctx){ .p = p, - .queue_flags = queue_flags, + .flags = flags, .queued = task_on_rq_queued(p), - .running = task_current(rq, p), + .running = task_current_donor(rq, p), }; - update_rq_clock(rq); + if (!(flags & DEQUEUE_CLASS)) { + if (p->sched_class->get_prio) + ctx->prio = p->sched_class->get_prio(rq, p); + else + ctx->prio = p->prio; + } + if (ctx->queued) - dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); + dequeue_task(rq, p, flags); if (ctx->running) put_prev_task(rq, p); + + if ((flags & DEQUEUE_CLASS) && p->sched_class->switched_from) + p->sched_class->switched_from(rq, p); + + return ctx; } -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) +void sched_change_end(struct sched_change_ctx *ctx) { - struct rq *rq = task_rq(ctx->p); + struct task_struct *p = ctx->p; + struct rq *rq = task_rq(p); lockdep_assert_rq_held(rq); + if ((ctx->flags & ENQUEUE_CLASS) && p->sched_class->switching_to) + p->sched_class->switching_to(rq, p); + if (ctx->queued) - enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); + enqueue_task(rq, p, ctx->flags); if (ctx->running) - set_next_task(rq, ctx->p); + set_next_task(rq, p); + + if (ctx->flags & ENQUEUE_CLASS) { + if (p->sched_class->switched_to) + p->sched_class->switched_to(rq, p); + } else { + p->sched_class->prio_changed(rq, p, ctx->prio); + } } -#endif /* CONFIG_SCHED_CLASS_EXT */ diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index cdd740b3f774..37b572cc8aca 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -166,12 +166,13 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, * cpudl_clear - remove a CPU from the cpudl max-heap * @cp: the cpudl max-heap context * @cpu: the target CPU + * @online: the online state of the deadline runqueue * * Notes: assumes cpu_rq(cpu)->lock is locked * * Returns: (void) */ -void cpudl_clear(struct cpudl *cp, int cpu) +void cpudl_clear(struct cpudl *cp, int cpu, bool online) { int old_idx, new_cpu; unsigned long flags; @@ -184,7 +185,7 @@ void cpudl_clear(struct cpudl *cp, int cpu) if (old_idx == IDX_INVALID) { /* * Nothing to remove if old_idx was invalid. - * This could happen if a rq_offline_dl is + * This could happen if rq_online_dl or rq_offline_dl is * called for a CPU without -dl tasks running. */ } else { @@ -195,9 +196,12 @@ void cpudl_clear(struct cpudl *cp, int cpu) cp->elements[new_cpu].idx = old_idx; cp->elements[cpu].idx = IDX_INVALID; cpudl_heapify(cp, old_idx); - - cpumask_set_cpu(cpu, cp->free_cpus); } + if (likely(online)) + __cpumask_set_cpu(cpu, cp->free_cpus); + else + __cpumask_clear_cpu(cpu, cp->free_cpus); + raw_spin_unlock_irqrestore(&cp->lock, flags); } @@ -228,7 +232,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) cp->elements[new_idx].cpu = cpu; cp->elements[cpu].idx = new_idx; cpudl_heapify_up(cp, new_idx); - cpumask_clear_cpu(cpu, cp->free_cpus); + __cpumask_clear_cpu(cpu, cp->free_cpus); } else { cp->elements[old_idx].dl = dl; cpudl_heapify(cp, old_idx); @@ -238,26 +242,6 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) } /* - * cpudl_set_freecpu - Set the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_set_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_set_cpu(cpu, cp->free_cpus); -} - -/* - * cpudl_clear_freecpu - Clear the cpudl.free_cpus - * @cp: the cpudl max-heap context - * @cpu: rd attached CPU - */ -void cpudl_clear_freecpu(struct cpudl *cp, int cpu) -{ - cpumask_clear_cpu(cpu, cp->free_cpus); -} - -/* * cpudl_init - initialize the cpudl structure * @cp: the cpudl max-heap context */ diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index 11c0f1faa7e1..d7699468eedd 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h @@ -19,8 +19,6 @@ struct cpudl { int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); void cpudl_set(struct cpudl *cp, int cpu, u64 dl); -void cpudl_clear(struct cpudl *cp, int cpu); +void cpudl_clear(struct cpudl *cp, int cpu, bool online); int cpudl_init(struct cpudl *cp); -void cpudl_set_freecpu(struct cpudl *cp, int cpu); -void cpudl_clear_freecpu(struct cpudl *cp, int cpu); void cpudl_cleanup(struct cpudl *cp); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7097de2c8cda..4f97896887ec 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -313,10 +313,8 @@ static u64 read_sum_exec_runtime(struct task_struct *t) void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) { struct signal_struct *sig = tsk->signal; - u64 utime, stime; struct task_struct *t; - unsigned int seq, nextseq; - unsigned long flags; + u64 utime, stime; /* * Update current task runtime to account pending time since last @@ -329,27 +327,19 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) if (same_thread_group(current, tsk)) (void) task_sched_runtime(current); - rcu_read_lock(); - /* Attempt a lockless read on the first round. */ - nextseq = 0; - do { - seq = nextseq; - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); + guard(rcu)(); + scoped_seqlock_read (&sig->stats_lock, ss_lock_irqsave) { times->utime = sig->utime; times->stime = sig->stime; times->sum_exec_runtime = sig->sum_sched_runtime; - for_each_thread(tsk, t) { + __for_each_thread(sig, t) { task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += read_sum_exec_runtime(t); } - /* If lockless access failed, take the lock. */ - nextseq = 1; - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); + } } #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 7b7671060bf9..67f540c23717 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -125,20 +125,11 @@ static inline struct dl_bw *dl_bw_of(int i) static inline int dl_bw_cpus(int i) { struct root_domain *rd = cpu_rq(i)->rd; - int cpus; RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), "sched RCU must be held"); - if (cpumask_subset(rd->span, cpu_active_mask)) - return cpumask_weight(rd->span); - - cpus = 0; - - for_each_cpu_and(i, rd->span, cpu_active_mask) - cpus++; - - return cpus; + return cpumask_weight_and(rd->span, cpu_active_mask); } static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) @@ -405,7 +396,7 @@ static void __dl_clear_params(struct sched_dl_entity *dl_se); * up, and checks if the task is still in the "ACTIVE non contending" * state or not (in the second case, it updates running_bw). */ -static void task_non_contending(struct sched_dl_entity *dl_se) +static void task_non_contending(struct sched_dl_entity *dl_se, bool dl_task) { struct hrtimer *timer = &dl_se->inactive_timer; struct rq *rq = rq_of_dl_se(dl_se); @@ -444,10 +435,10 @@ static void task_non_contending(struct sched_dl_entity *dl_se) } else { struct task_struct *p = dl_task_of(dl_se); - if (dl_task(p)) + if (dl_task) sub_running_bw(dl_se, dl_rq); - if (!dl_task(p) || READ_ONCE(p->__state) == TASK_DEAD) { + if (!dl_task || READ_ONCE(p->__state) == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (READ_ONCE(p->__state) == TASK_DEAD) @@ -1166,8 +1157,17 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_ sched_clock_tick(); update_rq_clock(rq); - if (!dl_se->dl_runtime) + /* + * Make sure current has propagated its pending runtime into + * any relevant server through calling dl_server_update() and + * friends. + */ + rq->donor->sched_class->update_curr(rq); + + if (dl_se->dl_defer_idle) { + dl_server_stop(dl_se); return HRTIMER_NORESTART; + } if (dl_se->dl_defer_armed) { /* @@ -1416,10 +1416,11 @@ s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta } static inline void -update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, - int flags); +update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, int flags); + static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) { + bool idle = rq->curr == rq->idle; s64 scaled_delta_exec; if (unlikely(delta_exec <= 0)) { @@ -1440,6 +1441,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 dl_se->runtime -= scaled_delta_exec; + if (dl_se->dl_defer_idle && !idle) + dl_se->dl_defer_idle = 0; + /* * The fair server can consume its runtime while throttled (not queued/ * running as regular CFS). @@ -1450,6 +1454,29 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 */ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { /* + * Non-servers would never get time accounted while throttled. + */ + WARN_ON_ONCE(!dl_server(dl_se)); + + /* + * While the server is marked idle, do not push out the + * activation further, instead wait for the period timer + * to lapse and stop the server. + */ + if (dl_se->dl_defer_idle && idle) { + /* + * The timer is at the zero-laxity point, this means + * dl_server_stop() / dl_server_start() can happen + * while now < deadline. This means update_dl_entity() + * will not replenish. Additionally start_dl_timer() + * will be set for 'deadline - runtime'. Negative + * runtime will not do. + */ + dl_se->runtime = 0; + return; + } + + /* * If the server was previously activated - the starving condition * took place, it this point it went away because the fair scheduler * was able to get runtime in background. So return to the initial @@ -1461,6 +1488,9 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 replenish_dl_new_period(dl_se, dl_se->rq); + if (idle) + dl_se->dl_defer_idle = 1; + /* * Not being able to start the timer seems problematic. If it could not * be started for whatever reason, we need to "unthrottle" the DL server @@ -1543,38 +1573,213 @@ throttle: * as time available for the fair server, avoiding a penalty for the * rt scheduler that did not consumed that time. */ -void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) +void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec) { - s64 delta_exec; - - if (!rq->fair_server.dl_defer) - return; - - /* no need to discount more */ - if (rq->fair_server.runtime < 0) - return; - - delta_exec = rq_clock_task(rq) - p->se.exec_start; - if (delta_exec < 0) - return; - - rq->fair_server.runtime -= delta_exec; - - if (rq->fair_server.runtime < 0) { - rq->fair_server.dl_defer_running = 0; - rq->fair_server.runtime = 0; - } - - p->se.exec_start = rq_clock_task(rq); + if (dl_se->dl_server_active && dl_se->dl_runtime && dl_se->dl_defer) + update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) { /* 0 runtime = fair server disabled */ - if (dl_se->dl_runtime) + if (dl_se->dl_server_active && dl_se->dl_runtime) update_curr_dl_se(dl_se->rq, dl_se, delta_exec); } +/* + * dl_server && dl_defer: + * + * 6 + * +--------------------+ + * v | + * +-------------+ 4 +-----------+ 5 +------------------+ + * +-> | A:init | <--- | D:running | -----> | E:replenish-wait | + * | +-------------+ +-----------+ +------------------+ + * | | | 1 ^ ^ | + * | | 1 +----------+ | 3 | + * | v | | + * | +--------------------------------+ 2 | + * | | | ----+ | + * | 8 | B:zero_laxity-wait | | | + * | | | <---+ | + * | +--------------------------------+ | + * | | ^ ^ 2 | + * | | 7 | 2 +--------------------+ + * | v | + * | +-------------+ | + * +-- | C:idle-wait | -+ + * +-------------+ + * ^ 7 | + * +---------+ + * + * + * [A] - init + * dl_server_active = 0 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 0/1 + * dl_defer_idle = 0 + * + * [B] - zero_laxity-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 0 + * + * [C] - idle-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 1 + * dl_defer_running = 0 + * dl_defer_idle = 1 + * + * [D] - running + * dl_server_active = 1 + * dl_throttled = 0 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * [E] - replenish-wait + * dl_server_active = 1 + * dl_throttled = 1 + * dl_defer_armed = 0 + * dl_defer_running = 1 + * dl_defer_idle = 0 + * + * + * [1] A->B, A->D + * dl_server_start() + * dl_server_active = 1; + * enqueue_dl_entity() + * update_dl_entity(WAKEUP) + * if (!dl_defer_running) + * dl_defer_armed = 1; + * dl_throttled = 1; + * if (dl_throttled && start_dl_timer()) + * return; // [B] + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from client-class + * [2] B->B, C->B, E->B + * dl_server_update() + * update_curr_dl_se() // idle = false + * if (dl_defer_idle) + * dl_defer_idle = 0; + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); // stop timer + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * start_dl_timer(); // restart timer + * // [B] + * + * // timer actually fires means we have runtime + * [3] B->D + * dl_server_timer() + * if (dl_defer_armed) + * dl_defer_running = 1; + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * // fwd period + * if (dl_throttled) + * dl_throttled = 0; + * if (dl_defer_armed) + * dl_defer_armed = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // schedule server + * [4] D->A + * pick_task_dl() + * p = server_pick_task(); + * if (!p) + * dl_server_stop() + * dequeue_dl_entity(); + * hrtimer_try_to_cancel(); + * dl_defer_armed = 0; + * dl_throttled = 0; + * dl_server_active = 0; + * // [A] + * return p; + * + * // server running + * [5] D->E + * update_curr_dl_se() + * if (dl_runtime_exceeded()) + * dl_throttled = 1; + * dequeue_dl_entity(); + * start_dl_timer(); + * // [E] + * + * // server replenished + * [6] E->D + * dl_server_timer() + * enqueue_dl_entity(REPLENISH) + * replenish_dl_entity() + * fwd-period + * if (dl_throttled) + * dl_throttled = 0; + * __enqueue_dl_entity(); + * // [D] + * + * // deplete server runtime from idle + * [7] B->C, C->C + * dl_server_update_idle() + * update_curr_dl_se() // idle = true + * if (dl_defer && dl_throttled && dl_runtime_exceeded()) + * if (dl_defer_idle) + * return; + * dl_defer_running = 0; + * hrtimer_try_to_cancel(); + * replenish_dl_new_period() + * // fwd period + * dl_throttled = 1; + * dl_defer_armed = 1; + * dl_defer_idle = 1; + * start_dl_timer(); // restart timer + * // [C] + * + * // stop idle server + * [8] C->A + * dl_server_timer() + * if (dl_defer_idle) + * dl_server_stop(); + * // [A] + * + * + * digraph dl_server { + * "A:init" -> "B:zero_laxity-wait" [label="1:dl_server_start"] + * "A:init" -> "D:running" [label="1:dl_server_start"] + * "B:zero_laxity-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "B:zero_laxity-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "B:zero_laxity-wait" -> "D:running" [label="3:dl_server_timer"] + * "C:idle-wait" -> "A:init" [label="8:dl_server_timer"] + * "C:idle-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "C:idle-wait" -> "C:idle-wait" [label="7:dl_server_update_idle"] + * "D:running" -> "A:init" [label="4:pick_task_dl"] + * "D:running" -> "E:replenish-wait" [label="5:update_curr_dl_se"] + * "E:replenish-wait" -> "B:zero_laxity-wait" [label="2:dl_server_update"] + * "E:replenish-wait" -> "D:running" [label="6:dl_server_timer"] + * } + * + * + * Notes: + * + * - When there are fair tasks running the most likely loop is [2]->[2]. + * the dl_server never actually runs, the timer never fires. + * + * - When there is actual fair starvation; the timer fires and starts the + * dl_server. This will then throttle and replenish like a normal DL + * task. Notably it will not 'defer' again. + * + * - When idle it will push the actication forward once, and then wait + * for the timer to hit or a non-idle update to restart things. + */ void dl_server_start(struct sched_dl_entity *dl_se) { struct rq *rq = dl_se->rq; @@ -1582,6 +1787,11 @@ void dl_server_start(struct sched_dl_entity *dl_se) if (!dl_server(dl_se) || dl_se->dl_server_active) return; + /* + * Update the current task to 'now'. + */ + rq->donor->sched_class->update_curr(rq); + if (WARN_ON_ONCE(!cpu_online(cpu_of(rq)))) return; @@ -1600,6 +1810,7 @@ void dl_server_stop(struct sched_dl_entity *dl_se) hrtimer_try_to_cancel(&dl_se->dl_timer); dl_se->dl_defer_armed = 0; dl_se->dl_throttled = 0; + dl_se->dl_defer_idle = 0; dl_se->dl_server_active = 0; } @@ -1811,7 +2022,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (!dl_rq->dl_nr_running) { dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; - cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, rq->online); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = rb_first_cached(&dl_rq->root); @@ -2048,7 +2259,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se, int flags) * or "inactive") */ if (flags & DEQUEUE_SLEEP) - task_non_contending(dl_se); + task_non_contending(dl_se, true); } static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) @@ -2143,7 +2354,7 @@ static void yield_task_dl(struct rq *rq) * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - rq->curr->dl.dl_yielded = 1; + rq->donor->dl.dl_yielded = 1; update_rq_clock(rq); update_curr_dl(rq); @@ -2173,7 +2384,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) struct rq *rq; if (!(flags & WF_TTWU)) - goto out; + return cpu; rq = cpu_rq(cpu); @@ -2211,7 +2422,6 @@ select_task_rq_dl(struct task_struct *p, int cpu, int flags) } rcu_read_unlock(); -out: return cpu; } @@ -2355,7 +2565,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) * __pick_next_task_dl - Helper to pick the next -deadline task to run. * @rq: The runqueue to pick the next task from. */ -static struct task_struct *__pick_task_dl(struct rq *rq) +static struct task_struct *__pick_task_dl(struct rq *rq, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; @@ -2369,7 +2579,7 @@ again: WARN_ON_ONCE(!dl_se); if (dl_server(dl_se)) { - p = dl_se->server_pick_task(dl_se); + p = dl_se->server_pick_task(dl_se, rf); if (!p) { dl_server_stop(dl_se); goto again; @@ -2382,9 +2592,9 @@ again: return p; } -static struct task_struct *pick_task_dl(struct rq *rq) +static struct task_struct *pick_task_dl(struct rq *rq, struct rq_flags *rf) { - return __pick_task_dl(rq); + return __pick_task_dl(rq, rf); } static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) @@ -2883,9 +3093,10 @@ static void rq_online_dl(struct rq *rq) if (rq->dl.overloaded) dl_set_overload(rq); - cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); if (rq->dl.dl_nr_running > 0) cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr); + else + cpudl_clear(&rq->rd->cpudl, rq->cpu, true); } /* Assumes rq->lock is held */ @@ -2894,8 +3105,7 @@ static void rq_offline_dl(struct rq *rq) if (rq->dl.overloaded) dl_clear_overload(rq); - cpudl_clear(&rq->rd->cpudl, rq->cpu); - cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); + cpudl_clear(&rq->rd->cpudl, rq->cpu, false); } void __init init_sched_dl_class(void) @@ -2973,7 +3183,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) * will reset the task parameters. */ if (task_on_rq_queued(p) && p->dl.dl_runtime) - task_non_contending(&p->dl); + task_non_contending(&p->dl, false); /* * In case a task is setscheduled out from SCHED_DEADLINE we need to @@ -3045,23 +3255,24 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) } } +static u64 get_prio_dl(struct rq *rq, struct task_struct *p) +{ + return p->dl.deadline; +} + /* * If the scheduling parameters of a -deadline task changed, * a push or pull operation might be needed. */ -static void prio_changed_dl(struct rq *rq, struct task_struct *p, - int oldprio) +static void prio_changed_dl(struct rq *rq, struct task_struct *p, u64 old_deadline) { if (!task_on_rq_queued(p)) return; - /* - * This might be too much, but unfortunately - * we don't have the old deadline value, and - * we can't argue if the task is increasing - * or lowering its prio, so... - */ - if (!rq->dl.overloaded) + if (p->dl.deadline == old_deadline) + return; + + if (dl_time_before(old_deadline, p->dl.deadline)) deadline_queue_pull_task(rq); if (task_current_donor(rq, p)) { @@ -3094,6 +3305,8 @@ static int task_is_throttled_dl(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(dl) = { + .queue_mask = 8, + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -3116,6 +3329,7 @@ DEFINE_SCHED_CLASS(dl) = { .task_tick = task_tick_dl, .task_fork = task_fork_dl, + .get_prio = get_prio_dl, .prio_changed = prio_changed_dl, .switched_from = switched_from_dl, .switched_to = switched_to_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 02e16b70a790..41caa22e0680 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -796,7 +796,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) { - s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, left_deadline = -1, spread; + s64 left_vruntime = -1, zero_vruntime, right_vruntime = -1, left_deadline = -1, spread; struct sched_entity *last, *first, *root; struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -819,15 +819,15 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) last = __pick_last_entity(cfs_rq); if (last) right_vruntime = last->vruntime; - min_vruntime = cfs_rq->min_vruntime; + zero_vruntime = cfs_rq->zero_vruntime; raw_spin_rq_unlock_irqrestore(rq, flags); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_deadline", SPLIT_NS(left_deadline)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", SPLIT_NS(left_vruntime)); - SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", - SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "zero_vruntime", + SPLIT_NS(zero_vruntime)); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", SPLIT_NS(avg_vruntime(cfs_rq))); SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index ecb251e883ea..6827689a0966 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -25,7 +25,7 @@ static struct scx_sched __rcu *scx_root; * guarantee system safety. Maintain a dedicated task list which contains every * task between its fork and eventual free. */ -static DEFINE_SPINLOCK(scx_tasks_lock); +static DEFINE_RAW_SPINLOCK(scx_tasks_lock); static LIST_HEAD(scx_tasks); /* ops enable/disable */ @@ -476,7 +476,7 @@ static void scx_task_iter_start(struct scx_task_iter *iter) BUILD_BUG_ON(__SCX_DSQ_ITER_ALL_FLAGS & ((1U << __SCX_DSQ_LNODE_PRIV_SHIFT) - 1)); - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; list_add(&iter->cursor.tasks_node, &scx_tasks); @@ -507,14 +507,14 @@ static void scx_task_iter_unlock(struct scx_task_iter *iter) __scx_task_iter_rq_unlock(iter); if (iter->list_locked) { iter->list_locked = false; - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); } } static void __scx_task_iter_maybe_relock(struct scx_task_iter *iter) { if (!iter->list_locked) { - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); iter->list_locked = true; } } @@ -1474,7 +1474,7 @@ static bool dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags static void yield_task_scx(struct rq *rq) { struct scx_sched *sch = scx_root; - struct task_struct *p = rq->curr; + struct task_struct *p = rq->donor; if (SCX_HAS_OP(sch, yield)) SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, p, NULL); @@ -1485,7 +1485,7 @@ static void yield_task_scx(struct rq *rq) static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) { struct scx_sched *sch = scx_root; - struct task_struct *from = rq->curr; + struct task_struct *from = rq->donor; if (SCX_HAS_OP(sch, yield)) return SCX_CALL_OP_2TASKS_RET(sch, SCX_KF_REST, yield, rq, @@ -2047,7 +2047,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev) lockdep_assert_rq_held(rq); rq->scx.flags |= SCX_RQ_IN_BALANCE; - rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP); + rq->scx.flags &= ~SCX_RQ_BAL_KEEP; if ((sch->ops.flags & SCX_OPS_HAS_CPU_PREEMPT) && unlikely(rq->scx.cpu_released)) { @@ -2153,42 +2153,6 @@ has_tasks: return true; } -static int balance_scx(struct rq *rq, struct task_struct *prev, - struct rq_flags *rf) -{ - int ret; - - rq_unpin_lock(rq, rf); - - ret = balance_one(rq, prev); - -#ifdef CONFIG_SCHED_SMT - /* - * When core-sched is enabled, this ops.balance() call will be followed - * by pick_task_scx() on this CPU and the SMT siblings. Balance the - * siblings too. - */ - if (sched_core_enabled(rq)) { - const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); - int scpu; - - for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { - struct rq *srq = cpu_rq(scpu); - struct task_struct *sprev = srq->curr; - - WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); - update_rq_clock(srq); - balance_one(srq, sprev); - } - } -#endif - rq_repin_lock(rq, rf); - - maybe_queue_balance_callback(rq); - - return ret; -} - static void process_ddsp_deferred_locals(struct rq *rq) { struct task_struct *p; @@ -2368,41 +2332,23 @@ static struct task_struct *first_local_task(struct rq *rq) struct task_struct, scx.dsq_list.node); } -static struct task_struct *pick_task_scx(struct rq *rq) +static struct task_struct *pick_task_scx(struct rq *rq, struct rq_flags *rf) { struct task_struct *prev = rq->curr; + bool keep_prev, kick_idle = false; struct task_struct *p; - bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; - bool kick_idle = false; - /* - * WORKAROUND: - * - * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just - * have gone through balance_scx(). Unfortunately, there currently is a - * bug where fair could say yes on balance() but no on pick_task(), - * which then ends up calling pick_task_scx() without preceding - * balance_scx(). - * - * Keep running @prev if possible and avoid stalling from entering idle - * without balancing. - * - * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE() - * if pick_task_scx() is called without preceding balance_scx(). - */ - if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) { - if (prev->scx.flags & SCX_TASK_QUEUED) { - keep_prev = true; - } else { - keep_prev = false; - kick_idle = true; - } - } else if (unlikely(keep_prev && - prev->sched_class != &ext_sched_class)) { - /* - * Can happen while enabling as SCX_RQ_BAL_PENDING assertion is - * conditional on scx_enabled() and may have been skipped. - */ + rq_modified_clear(rq); + rq_unpin_lock(rq, rf); + balance_one(rq, prev); + rq_repin_lock(rq, rf); + maybe_queue_balance_callback(rq); + if (rq_modified_above(rq, &ext_sched_class)) + return RETRY_TASK; + + keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP; + if (unlikely(keep_prev && + prev->sched_class != &ext_sched_class)) { WARN_ON_ONCE(scx_enable_state() == SCX_ENABLED); keep_prev = false; } @@ -2940,9 +2886,9 @@ void scx_post_fork(struct task_struct *p) } } - spin_lock_irq(&scx_tasks_lock); + raw_spin_lock_irq(&scx_tasks_lock); list_add_tail(&p->scx.tasks_node, &scx_tasks); - spin_unlock_irq(&scx_tasks_lock); + raw_spin_unlock_irq(&scx_tasks_lock); percpu_up_read(&scx_fork_rwsem); } @@ -2966,9 +2912,9 @@ void sched_ext_free(struct task_struct *p) { unsigned long flags; - spin_lock_irqsave(&scx_tasks_lock, flags); + raw_spin_lock_irqsave(&scx_tasks_lock, flags); list_del_init(&p->scx.tasks_node); - spin_unlock_irqrestore(&scx_tasks_lock, flags); + raw_spin_unlock_irqrestore(&scx_tasks_lock, flags); /* * @p is off scx_tasks and wholly ours. scx_enable()'s READY -> ENABLED @@ -2997,7 +2943,7 @@ static void reweight_task_scx(struct rq *rq, struct task_struct *p, p, p->scx.weight); } -static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) +static void prio_changed_scx(struct rq *rq, struct task_struct *p, u64 oldprio) { } @@ -3270,6 +3216,8 @@ static void scx_cgroup_unlock(void) {} * their current sched_class. Call them directly from sched core instead. */ DEFINE_SCHED_CLASS(ext) = { + .queue_mask = 1, + .enqueue_task = enqueue_task_scx, .dequeue_task = dequeue_task_scx, .yield_task = yield_task_scx, @@ -3277,7 +3225,6 @@ DEFINE_SCHED_CLASS(ext) = { .wakeup_preempt = wakeup_preempt_scx, - .balance = balance_scx, .pick_task = pick_task_scx, .put_prev_task = put_prev_task_scx, @@ -3818,11 +3765,10 @@ static void scx_bypass(bool bypass) */ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, scx.runnable_node) { - struct sched_enq_and_set_ctx ctx; - /* cycling deq/enq is enough, see the function comment */ - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, DEQUEUE_SAVE | DEQUEUE_MOVE) { + /* nothing */ ; + } } /* resched to restore ticks and idle state */ @@ -3972,22 +3918,20 @@ static void scx_disable_workfn(struct kthread_work *work) scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); + update_rq_clock(task_rq(p)); - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); - - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); scx_exit_task(p); } scx_task_iter_stop(&sti); @@ -4276,7 +4220,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) size_t avail, used; bool idle; - rq_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); idle = list_empty(&rq->scx.runnable_list) && rq->curr->sched_class == &idle_sched_class; @@ -4345,7 +4289,7 @@ static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) scx_dump_task(&s, &dctx, p, ' '); next: - rq_unlock(rq, &rf); + rq_unlock_irqrestore(rq, &rf); } dump_newline(&s); @@ -4479,8 +4423,11 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) goto err_free_gdsqs; sch->helper = kthread_run_worker(0, "sched_ext_helper"); - if (!sch->helper) + if (IS_ERR(sch->helper)) { + ret = PTR_ERR(sch->helper); goto err_free_pcpu; + } + sched_set_fifo(sch->helper->task); atomic_set(&sch->exit_kind, SCX_EXIT_NONE); @@ -4748,26 +4695,22 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) percpu_down_write(&scx_fork_rwsem); scx_task_iter_start(&sti); while ((p = scx_task_iter_next_locked(&sti))) { + unsigned int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; const struct sched_class *old_class = p->sched_class; const struct sched_class *new_class = __setscheduler_class(p->policy, p->prio); - struct sched_enq_and_set_ctx ctx; if (!tryget_task_struct(p)) continue; - if (old_class != new_class && p->se.sched_delayed) - dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED); - - sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); - - p->scx.slice = SCX_SLICE_DFL; - p->sched_class = new_class; - check_class_changing(task_rq(p), p, old_class); + if (old_class != new_class) + queue_flags |= DEQUEUE_CLASS; - sched_enq_and_set_task(&ctx); + scoped_guard (sched_change, p, queue_flags) { + p->scx.slice = SCX_SLICE_DFL; + p->sched_class = new_class; + } - check_class_changed(task_rq(p), p, old_class, p->prio); put_task_struct(p); } scx_task_iter_stop(&sti); @@ -5321,8 +5264,8 @@ void __init init_sched_ext_class(void) BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_preempt, GFP_KERNEL, n)); BUG_ON(!zalloc_cpumask_var_node(&rq->scx.cpus_to_wait, GFP_KERNEL, n)); - init_irq_work(&rq->scx.deferred_irq_work, deferred_irq_workfn); - init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); + rq->scx.deferred_irq_work = IRQ_WORK_INIT_HARD(deferred_irq_workfn); + rq->scx.kick_cpus_irq_work = IRQ_WORK_INIT_HARD(kick_cpus_irq_workfn); if (cpu_online(cpu)) cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; @@ -6401,7 +6344,7 @@ __bpf_kfunc void scx_bpf_cpuperf_set(s32 cpu, u32 perf) guard(rcu)(); - sch = rcu_dereference(sch); + sch = rcu_dereference(scx_root); if (unlikely(!sch)) return; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25970dbbb279..769d7b7990df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -554,7 +554,7 @@ static inline bool entity_before(const struct sched_entity *a, static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return (s64)(se->vruntime - cfs_rq->min_vruntime); + return (s64)(se->vruntime - cfs_rq->zero_vruntime); } #define __node_2_se(node) \ @@ -606,13 +606,13 @@ static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) * * Which we track using: * - * v0 := cfs_rq->min_vruntime + * v0 := cfs_rq->zero_vruntime * \Sum (v_i - v0) * w_i := cfs_rq->avg_vruntime * \Sum w_i := cfs_rq->avg_load * - * Since min_vruntime is a monotonic increasing variable that closely tracks - * the per-task service, these deltas: (v_i - v), will be in the order of the - * maximal (virtual) lag induced in the system due to quantisation. + * Since zero_vruntime closely tracks the per-task service, these + * deltas: (v_i - v), will be in the order of the maximal (virtual) lag + * induced in the system due to quantisation. * * Also, we use scale_load_down() to reduce the size. * @@ -671,7 +671,7 @@ u64 avg_vruntime(struct cfs_rq *cfs_rq) avg = div_s64(avg, load); } - return cfs_rq->min_vruntime + avg; + return cfs_rq->zero_vruntime + avg; } /* @@ -732,7 +732,7 @@ static int vruntime_eligible(struct cfs_rq *cfs_rq, u64 vruntime) load += weight; } - return avg >= (s64)(vruntime - cfs_rq->min_vruntime) * load; + return avg >= (s64)(vruntime - cfs_rq->zero_vruntime) * load; } int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -740,42 +740,14 @@ int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) return vruntime_eligible(cfs_rq, se->vruntime); } -static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) +static void update_zero_vruntime(struct cfs_rq *cfs_rq) { - u64 min_vruntime = cfs_rq->min_vruntime; - /* - * open coded max_vruntime() to allow updating avg_vruntime - */ - s64 delta = (s64)(vruntime - min_vruntime); - if (delta > 0) { - avg_vruntime_update(cfs_rq, delta); - min_vruntime = vruntime; - } - return min_vruntime; -} + u64 vruntime = avg_vruntime(cfs_rq); + s64 delta = (s64)(vruntime - cfs_rq->zero_vruntime); -static void update_min_vruntime(struct cfs_rq *cfs_rq) -{ - struct sched_entity *se = __pick_root_entity(cfs_rq); - struct sched_entity *curr = cfs_rq->curr; - u64 vruntime = cfs_rq->min_vruntime; - - if (curr) { - if (curr->on_rq) - vruntime = curr->vruntime; - else - curr = NULL; - } - - if (se) { - if (!curr) - vruntime = se->min_vruntime; - else - vruntime = min_vruntime(vruntime, se->min_vruntime); - } + avg_vruntime_update(cfs_rq, delta); - /* ensure we never gain time by being placed backwards. */ - cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); + cfs_rq->zero_vruntime = vruntime; } static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) @@ -848,6 +820,7 @@ RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { avg_vruntime_add(cfs_rq, se); + update_zero_vruntime(cfs_rq); se->min_vruntime = se->vruntime; se->min_slice = se->slice; rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, @@ -859,6 +832,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, &min_vruntime_cb); avg_vruntime_sub(cfs_rq, se); + update_zero_vruntime(cfs_rq); } struct sched_entity *__pick_root_entity(struct cfs_rq *cfs_rq) @@ -955,6 +929,16 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) if (cfs_rq->nr_queued == 1) return curr && curr->on_rq ? curr : se; + /* + * Picking the ->next buddy will affect latency but not fairness. + */ + if (sched_feat(PICK_BUDDY) && + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { + /* ->next will never be delayed */ + WARN_ON_ONCE(cfs_rq->next->sched_delayed); + return cfs_rq->next; + } + if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) curr = NULL; @@ -1193,6 +1177,8 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) return delta_exec; } +static void set_next_buddy(struct sched_entity *se); + /* * Used by other classes to account runtime. */ @@ -1226,7 +1212,6 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->vruntime += calc_delta_fair(delta_exec, curr); resched = update_deadline(cfs_rq, curr); - update_min_vruntime(cfs_rq); if (entity_is_task(curr)) { /* @@ -1239,8 +1224,7 @@ static void update_curr(struct cfs_rq *cfs_rq) * against fair_server such that it can account for this time * and possibly avoid running this period. */ - if (dl_server_active(&rq->fair_server)) - dl_server_update(&rq->fair_server, delta_exec); + dl_server_update(&rq->fair_server, delta_exec); } account_cfs_rq_runtime(cfs_rq, delta_exec); @@ -3808,15 +3792,6 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, if (!curr) __enqueue_entity(cfs_rq, se); cfs_rq->nr_queued++; - - /* - * The entity's vruntime has been adjusted, so let's check - * whether the rq-wide min_vruntime needs updated too. Since - * the calculations above require stable min_vruntime rather - * than up-to-date one, we do the update at the end of the - * reweight process. - */ - update_min_vruntime(cfs_rq); } } @@ -5429,15 +5404,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_cfs_group(se); - /* - * Now advance min_vruntime if @se was the entity holding it back, - * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be - * put back on, and if we advance min_vruntime, we'll be placed back - * further than we started -- i.e. we'll be penalized. - */ - if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) - update_min_vruntime(cfs_rq); - if (flags & DEQUEUE_DELAYED) finish_delayed_dequeue_entity(se); @@ -5512,16 +5478,6 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) { struct sched_entity *se; - /* - * Picking the ->next buddy will affect latency but not fairness. - */ - if (sched_feat(PICK_BUDDY) && - cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { - /* ->next will never be delayed */ - WARN_ON_ONCE(cfs_rq->next->sched_delayed); - return cfs_rq->next; - } - se = pick_eevdf(cfs_rq); if (se->sched_delayed) { dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); @@ -6024,20 +5980,17 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; /* - * It's possible we are called with !runtime_remaining due to things - * like user changed quota setting(see tg_set_cfs_bandwidth()) or async - * unthrottled us with a positive runtime_remaining but other still - * running entities consumed those runtime before we reached here. + * It's possible we are called with runtime_remaining < 0 due to things + * like async unthrottled us with a positive runtime_remaining but other + * still running entities consumed those runtime before we reached here. * - * Anyway, we can't unthrottle this cfs_rq without any runtime remaining - * because any enqueue in tg_unthrottle_up() will immediately trigger a - * throttle, which is not supposed to happen on unthrottle path. + * We can't unthrottle this cfs_rq without any runtime remaining because + * any enqueue in tg_unthrottle_up() will immediately trigger a throttle, + * which is not supposed to happen on unthrottle path. */ if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) return; - se = cfs_rq->tg->se[cpu_of(rq)]; - cfs_rq->throttled = 0; update_rq_clock(rq); @@ -7006,12 +6959,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) h_nr_idle = 1; } - if (!rq_h_nr_queued && rq->cfs.h_nr_queued) { - /* Account for idle runtime */ - if (!rq->nr_running) - dl_server_update_idle_time(rq, rq->curr); + if (!rq_h_nr_queued && rq->cfs.h_nr_queued) dl_server_start(&rq->fair_server); - } /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -7038,8 +6987,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) hrtick_update(rq); } -static void set_next_buddy(struct sched_entity *se); - /* * Basically dequeue_task_fair(), except it can deal with dequeue_entity() * failing half-way through and resume the dequeue later. @@ -8715,15 +8662,6 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context set_task_max_allowed_capacity(p); } -static int -balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) -{ - if (sched_fair_runnable(rq)) - return 1; - - return sched_balance_newidle(rq, rf) != 0; -} - static void set_next_buddy(struct sched_entity *se) { for_each_sched_entity(se) { @@ -8735,16 +8673,81 @@ static void set_next_buddy(struct sched_entity *se) } } +enum preempt_wakeup_action { + PREEMPT_WAKEUP_NONE, /* No preemption. */ + PREEMPT_WAKEUP_SHORT, /* Ignore slice protection. */ + PREEMPT_WAKEUP_PICK, /* Let __pick_eevdf() decide. */ + PREEMPT_WAKEUP_RESCHED, /* Force reschedule. */ +}; + +static inline bool +set_preempt_buddy(struct cfs_rq *cfs_rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + /* + * Keep existing buddy if the deadline is sooner than pse. + * The older buddy may be cache cold and completely unrelated + * to the current wakeup but that is unpredictable where as + * obeying the deadline is more in line with EEVDF objectives. + */ + if (cfs_rq->next && entity_before(cfs_rq->next, pse)) + return false; + + set_next_buddy(pse); + return true; +} + +/* + * WF_SYNC|WF_TTWU indicates the waker expects to sleep but it is not + * strictly enforced because the hint is either misunderstood or + * multiple tasks must be woken up. + */ +static inline enum preempt_wakeup_action +preempt_sync(struct rq *rq, int wake_flags, + struct sched_entity *pse, struct sched_entity *se) +{ + u64 threshold, delta; + + /* + * WF_SYNC without WF_TTWU is not expected so warn if it happens even + * though it is likely harmless. + */ + WARN_ON_ONCE(!(wake_flags & WF_TTWU)); + + threshold = sysctl_sched_migration_cost; + delta = rq_clock_task(rq) - se->exec_start; + if ((s64)delta < 0) + delta = 0; + + /* + * WF_RQ_SELECTED implies the tasks are stacking on a CPU when they + * could run on other CPUs. Reduce the threshold before preemption is + * allowed to an arbitrary lower value as it is more likely (but not + * guaranteed) the waker requires the wakee to finish. + */ + if (wake_flags & WF_RQ_SELECTED) + threshold >>= 2; + + /* + * As WF_SYNC is not strictly obeyed, allow some runtime for batch + * wakeups to be issued. + */ + if (entity_before(pse, se) && delta >= threshold) + return PREEMPT_WAKEUP_RESCHED; + + return PREEMPT_WAKEUP_NONE; +} + /* * Preempt the current task with a newly woken task if needed: */ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { + enum preempt_wakeup_action preempt_action = PREEMPT_WAKEUP_PICK; struct task_struct *donor = rq->donor; struct sched_entity *se = &donor->se, *pse = &p->se; struct cfs_rq *cfs_rq = task_cfs_rq(donor); int cse_is_idle, pse_is_idle; - bool do_preempt_short = false; if (unlikely(se == pse)) return; @@ -8758,10 +8761,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int if (task_is_throttled(p)) return; - if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK) && !pse->sched_delayed) { - set_next_buddy(pse); - } - /* * We can come here with TIF_NEED_RESCHED already set from new task * wake up path. @@ -8793,7 +8792,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * When non-idle entity preempt an idle entity, * don't give idle entity slice protection. */ - do_preempt_short = true; + preempt_action = PREEMPT_WAKEUP_SHORT; goto preempt; } @@ -8812,27 +8811,74 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int * If @p has a shorter slice than current and @p is eligible, override * current's slice protection in order to allow preemption. */ - do_preempt_short = sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice); + if (sched_feat(PREEMPT_SHORT) && (pse->slice < se->slice)) { + preempt_action = PREEMPT_WAKEUP_SHORT; + goto pick; + } /* + * Ignore wakee preemption on WF_FORK as it is less likely that + * there is shared data as exec often follow fork. Do not + * preempt for tasks that are sched_delayed as it would violate + * EEVDF to forcibly queue an ineligible task. + */ + if ((wake_flags & WF_FORK) || pse->sched_delayed) + return; + + /* + * If @p potentially is completing work required by current then + * consider preemption. + * + * Reschedule if waker is no longer eligible. */ + if (in_task() && !entity_eligible(cfs_rq, se)) { + preempt_action = PREEMPT_WAKEUP_RESCHED; + goto preempt; + } + + /* Prefer picking wakee soon if appropriate. */ + if (sched_feat(NEXT_BUDDY) && + set_preempt_buddy(cfs_rq, wake_flags, pse, se)) { + + /* + * Decide whether to obey WF_SYNC hint for a new buddy. Old + * buddies are ignored as they may not be relevant to the + * waker and less likely to be cache hot. + */ + if (wake_flags & WF_SYNC) + preempt_action = preempt_sync(rq, wake_flags, pse, se); + } + + switch (preempt_action) { + case PREEMPT_WAKEUP_NONE: + return; + case PREEMPT_WAKEUP_RESCHED: + goto preempt; + case PREEMPT_WAKEUP_SHORT: + fallthrough; + case PREEMPT_WAKEUP_PICK: + break; + } + +pick: + /* * If @p has become the most eligible task, force preemption. */ - if (__pick_eevdf(cfs_rq, !do_preempt_short) == pse) + if (__pick_eevdf(cfs_rq, preempt_action != PREEMPT_WAKEUP_SHORT) == pse) goto preempt; - if (sched_feat(RUN_TO_PARITY) && do_preempt_short) + if (sched_feat(RUN_TO_PARITY)) update_protect_slice(cfs_rq, se); return; preempt: - if (do_preempt_short) + if (preempt_action == PREEMPT_WAKEUP_SHORT) cancel_protect_slice(se); resched_curr_lazy(rq); } -static struct task_struct *pick_task_fair(struct rq *rq) +static struct task_struct *pick_task_fair(struct rq *rq, struct rq_flags *rf) { struct sched_entity *se; struct cfs_rq *cfs_rq; @@ -8876,7 +8922,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf int new_tasks; again: - p = pick_task_fair(rq); + p = pick_task_fair(rq, rf); if (!p) goto idle; se = &p->se; @@ -8955,14 +9001,10 @@ idle: return NULL; } -static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) +static struct task_struct * +fair_server_pick_task(struct sched_dl_entity *dl_se, struct rq_flags *rf) { - return pick_next_task_fair(rq, prev, NULL); -} - -static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) -{ - return pick_task_fair(dl_se->rq); + return pick_task_fair(dl_se->rq, rf); } void fair_server_init(struct rq *rq) @@ -8993,7 +9035,7 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct t */ static void yield_task_fair(struct rq *rq) { - struct task_struct *curr = rq->curr; + struct task_struct *curr = rq->donor; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -9017,7 +9059,18 @@ static void yield_task_fair(struct rq *rq) */ rq_clock_skip_update(rq); - se->deadline += calc_delta_fair(se->slice, se); + /* + * Forfeit the remaining vruntime, only if the entity is eligible. This + * condition is necessary because in core scheduling we prefer to run + * ineligible tasks rather than force idling. If this happens we may + * end up in a loop where the core scheduler picks the yielding task, + * which yields immediately again; without the condition the vruntime + * ends up quickly running away. + */ + if (entity_eligible(cfs_rq, se)) { + se->vruntime = se->deadline; + se->deadline += calc_delta_fair(se->slice, se); + } } static bool yield_to_task_fair(struct rq *rq, struct task_struct *p) @@ -10681,7 +10734,7 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd, if (sd->flags & SD_ASYM_CPUCAPACITY) sgs->group_misfit_task_load = 1; - for_each_cpu(i, sched_group_span(group)) { + for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { struct rq *rq = cpu_rq(i); unsigned int local; @@ -11733,6 +11786,21 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd } /* + * This flag serializes load-balancing passes over large domains + * (above the NODE topology level) - only one load-balancing instance + * may run at a time, to reduce overhead on very large systems with + * lots of CPUs and large NUMA distances. + * + * - Note that load-balancing passes triggered while another one + * is executing are skipped and not re-tried. + * + * - Also note that this does not serialize rebalance_domains() + * execution, as non-SD_SERIALIZE domains will still be + * load-balanced in parallel. + */ +static atomic_t sched_balance_running = ATOMIC_INIT(0); + +/* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ @@ -11757,6 +11825,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq, .fbq_type = all, .tasks = LIST_HEAD_INIT(env.tasks), }; + bool need_unlock = false; cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); @@ -11768,6 +11837,14 @@ redo: goto out_balanced; } + if (!need_unlock && (sd->flags & SD_SERIALIZE)) { + int zero = 0; + if (!atomic_try_cmpxchg_acquire(&sched_balance_running, &zero, 1)) + goto out_balanced; + + need_unlock = true; + } + group = sched_balance_find_src_group(&env); if (!group) { schedstat_inc(sd->lb_nobusyg[idle]); @@ -12008,6 +12085,9 @@ out_one_pinned: sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; out: + if (need_unlock) + atomic_set_release(&sched_balance_running, 0); + return ld_moved; } @@ -12133,21 +12213,6 @@ out_unlock: } /* - * This flag serializes load-balancing passes over large domains - * (above the NODE topology level) - only one load-balancing instance - * may run at a time, to reduce overhead on very large systems with - * lots of CPUs and large NUMA distances. - * - * - Note that load-balancing passes triggered while another one - * is executing are skipped and not re-tried. - * - * - Also note that this does not serialize rebalance_domains() - * execution, as non-SD_SERIALIZE domains will still be - * load-balanced in parallel. - */ -static atomic_t sched_balance_running = ATOMIC_INIT(0); - -/* * Scale the max sched_balance_rq interval with the number of CPUs in the system. * This trades load-balance latency on larger machines for less cross talk. */ @@ -12156,30 +12221,43 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } -static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost) +static inline void update_newidle_stats(struct sched_domain *sd, unsigned int success) +{ + sd->newidle_call++; + sd->newidle_success += success; + + if (sd->newidle_call >= 1024) { + sd->newidle_ratio = sd->newidle_success; + sd->newidle_call /= 2; + sd->newidle_success /= 2; + } +} + +static inline bool +update_newidle_cost(struct sched_domain *sd, u64 cost, unsigned int success) { + unsigned long next_decay = sd->last_decay_max_lb_cost + HZ; + unsigned long now = jiffies; + + if (cost) + update_newidle_stats(sd, success); + if (cost > sd->max_newidle_lb_cost) { /* * Track max cost of a domain to make sure to not delay the * next wakeup on the CPU. - * - * sched_balance_newidle() bumps the cost whenever newidle - * balance fails, and we don't want things to grow out of - * control. Use the sysctl_sched_migration_cost as the upper - * limit, plus a litle extra to avoid off by ones. */ - sd->max_newidle_lb_cost = - min(cost, sysctl_sched_migration_cost + 200); - sd->last_decay_max_lb_cost = jiffies; - } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) { + sd->max_newidle_lb_cost = cost; + sd->last_decay_max_lb_cost = now; + + } else if (time_after(now, next_decay)) { /* * Decay the newidle max times by ~1% per second to ensure that * it is not outdated and the current max cost is actually * shorter. */ sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256; - sd->last_decay_max_lb_cost = jiffies; - + sd->last_decay_max_lb_cost = now; return true; } @@ -12202,7 +12280,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; int update_next_balance = 0; - int need_serialize, need_decay = 0; + int need_decay = 0; u64 max_cost = 0; rcu_read_lock(); @@ -12211,7 +12289,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) * Decay the newidle max times here because this is a regular * visit to all the domains. */ - need_decay = update_newidle_cost(sd, 0); + need_decay = update_newidle_cost(sd, 0, 0); max_cost += sd->max_newidle_lb_cost; /* @@ -12226,13 +12304,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) } interval = get_sd_balance_interval(sd, busy); - - need_serialize = sd->flags & SD_SERIALIZE; - if (need_serialize) { - if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) - goto out; - } - if (time_after_eq(jiffies, sd->last_balance + interval)) { if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) { /* @@ -12246,9 +12317,6 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle) sd->last_balance = jiffies; interval = get_sd_balance_interval(sd, busy); } - if (need_serialize) - atomic_set_release(&sched_balance_running, 0); -out: if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; update_next_balance = 1; @@ -12827,18 +12895,21 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) rcu_read_lock(); sd = rcu_dereference_check_sched_domain(this_rq->sd); + if (!sd) { + rcu_read_unlock(); + goto out; + } if (!get_rd_overloaded(this_rq->rd) || - (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) { + this_rq->avg_idle < sd->max_newidle_lb_cost) { - if (sd) - update_next_balance(sd, &next_balance); + update_next_balance(sd, &next_balance); rcu_read_unlock(); - goto out; } rcu_read_unlock(); + rq_modified_clear(this_rq); raw_spin_rq_unlock(this_rq); t0 = sched_clock_cpu(this_cpu); @@ -12854,6 +12925,22 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) break; if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned int weight = 1; + + if (sched_feat(NI_RANDOM)) { + /* + * Throw a 1k sided dice; and only run + * newidle_balance according to the success + * rate. + */ + u32 d1k = sched_rng() % 1024; + weight = 1 + sd->newidle_ratio; + if (d1k > weight) { + update_newidle_stats(sd, 0); + continue; + } + weight = (1024 + weight/2) / weight; + } pulled_task = sched_balance_rq(this_cpu, this_rq, sd, CPU_NEWLY_IDLE, @@ -12865,13 +12952,10 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) t0 = t1; /* - * Failing newidle means it is not effective; - * bump the cost so we end up doing less of it. + * Track max cost of a domain to make sure to not delay the + * next wakeup on the CPU. */ - if (!pulled_task) - domain_cost = (3 * sd->max_newidle_lb_cost) / 2; - - update_newidle_cost(sd, domain_cost); + update_newidle_cost(sd, domain_cost, weight * !!pulled_task); } /* @@ -12896,8 +12980,8 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf) if (this_rq->cfs.h_nr_queued && !pulled_task) pulled_task = 1; - /* Is there a task of a high priority class? */ - if (this_rq->nr_running != this_rq->cfs.h_nr_queued) + /* If a higher prio class was modified, restart the pick */ + if (rq_modified_above(this_rq, &fair_sched_class)) pulled_task = -1; out: @@ -13015,7 +13099,170 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) } /* - * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed. + * Consider any infeasible weight scenario. Take for instance two tasks, + * each bound to their respective sibling, one with weight 1 and one with + * weight 2. Then the lower weight task will run ahead of the higher weight + * task without bound. + * + * This utterly destroys the concept of a shared time base. + * + * Remember; all this is about a proportionally fair scheduling, where each + * tasks receives: + * + * w_i + * dt_i = ---------- dt (1) + * \Sum_j w_j + * + * which we do by tracking a virtual time, s_i: + * + * 1 + * s_i = --- d[t]_i (2) + * w_i + * + * Where d[t] is a delta of discrete time, while dt is an infinitesimal. + * The immediate corollary is that the ideal schedule S, where (2) to use + * an infinitesimal delta, is: + * + * 1 + * S = ---------- dt (3) + * \Sum_i w_i + * + * From which we can define the lag, or deviation from the ideal, as: + * + * lag(i) = S - s_i (4) + * + * And since the one and only purpose is to approximate S, we get that: + * + * \Sum_i w_i lag(i) := 0 (5) + * + * If this were not so, we no longer converge to S, and we can no longer + * claim our scheduler has any of the properties we derive from S. This is + * exactly what you did above, you broke it! + * + * + * Let's continue for a while though; to see if there is anything useful to + * be learned. We can combine (1)-(3) or (4)-(5) and express S in s_i: + * + * \Sum_i w_i s_i + * S = -------------- (6) + * \Sum_i w_i + * + * Which gives us a way to compute S, given our s_i. Now, if you've read + * our code, you know that we do not in fact do this, the reason for this + * is two-fold. Firstly, computing S in that way requires a 64bit division + * for every time we'd use it (see 12), and secondly, this only describes + * the steady-state, it doesn't handle dynamics. + * + * Anyway, in (6): s_i -> x + (s_i - x), to get: + * + * \Sum_i w_i (s_i - x) + * S - x = -------------------- (7) + * \Sum_i w_i + * + * Which shows that S and s_i transform alike (which makes perfect sense + * given that S is basically the (weighted) average of s_i). + * + * So the thing to remember is that the above is strictly UP. It is + * possible to generalize to multiple runqueues -- however it gets really + * yuck when you have to add affinity support, as illustrated by our very + * first counter-example. + * + * Luckily I think we can avoid needing a full multi-queue variant for + * core-scheduling (or load-balancing). The crucial observation is that we + * only actually need this comparison in the presence of forced-idle; only + * then do we need to tell if the stalled rq has higher priority over the + * other. + * + * [XXX assumes SMT2; better consider the more general case, I suspect + * it'll work out because our comparison is always between 2 rqs and the + * answer is only interesting if one of them is forced-idle] + * + * And (under assumption of SMT2) when there is forced-idle, there is only + * a single queue, so everything works like normal. + * + * Let, for our runqueue 'k': + * + * T_k = \Sum_i w_i s_i + * W_k = \Sum_i w_i ; for all i of k (8) + * + * Then we can write (6) like: + * + * T_k + * S_k = --- (9) + * W_k + * + * From which immediately follows that: + * + * T_k + T_l + * S_k+l = --------- (10) + * W_k + W_l + * + * On which we can define a combined lag: + * + * lag_k+l(i) := S_k+l - s_i (11) + * + * And that gives us the tools to compare tasks across a combined runqueue. + * + * + * Combined this gives the following: + * + * a) when a runqueue enters force-idle, sync it against it's sibling rq(s) + * using (7); this only requires storing single 'time'-stamps. + * + * b) when comparing tasks between 2 runqueues of which one is forced-idle, + * compare the combined lag, per (11). + * + * Now, of course cgroups (I so hate them) make this more interesting in + * that a) seems to suggest we need to iterate all cgroup on a CPU at such + * boundaries, but I think we can avoid that. The force-idle is for the + * whole CPU, all it's rqs. So we can mark it in the root and lazily + * propagate downward on demand. + */ + +/* + * So this sync is basically a relative reset of S to 0. + * + * So with 2 queues, when one goes idle, we drop them both to 0 and one + * then increases due to not being idle, and the idle one builds up lag to + * get re-elected. So far so simple, right? + * + * When there's 3, we can have the situation where 2 run and one is idle, + * we sync to 0 and let the idle one build up lag to get re-election. Now + * suppose another one also drops idle. At this point dropping all to 0 + * again would destroy the built-up lag from the queue that was already + * idle, not good. + * + * So instead of syncing everything, we can: + * + * less := !((s64)(s_a - s_b) <= 0) + * + * (v_a - S_a) - (v_b - S_b) == v_a - v_b - S_a + S_b + * == v_a - (v_b - S_a + S_b) + * + * IOW, we can recast the (lag) comparison to a one-sided difference. + * So if then, instead of syncing the whole queue, sync the idle queue + * against the active queue with S_a + S_b at the point where we sync. + * + * (XXX consider the implication of living in a cyclic group: N / 2^n N) + * + * This gives us means of syncing single queues against the active queue, + * and for already idle queues to preserve their build-up lag. + * + * Of course, then we get the situation where there's 2 active and one + * going idle, who do we pick to sync against? Theory would have us sync + * against the combined S, but as we've already demonstrated, there is no + * such thing in infeasible weight scenarios. + * + * One thing I've considered; and this is where that core_active rudiment + * came from, is having active queues sync up between themselves after + * every tick. This limits the observed divergence due to the work + * conservancy. + * + * On top of that, we can improve upon things by employing (10) here. + */ + +/* + * se_fi_update - Update the cfs_rq->zero_vruntime_fi in a CFS hierarchy if needed. */ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, bool forceidle) @@ -13029,7 +13276,7 @@ static void se_fi_update(const struct sched_entity *se, unsigned int fi_seq, cfs_rq->forceidle_seq = fi_seq; } - cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime; + cfs_rq->zero_vruntime_fi = cfs_rq->zero_vruntime; } } @@ -13082,11 +13329,11 @@ bool cfs_prio_less(const struct task_struct *a, const struct task_struct *b, /* * Find delta after normalizing se's vruntime with its cfs_rq's - * min_vruntime_fi, which would have been updated in prior calls + * zero_vruntime_fi, which would have been updated in prior calls * to se_fi_update(). */ delta = (s64)(sea->vruntime - seb->vruntime) + - (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi); + (s64)(cfs_rqb->zero_vruntime_fi - cfs_rqa->zero_vruntime_fi); return delta > 0; } @@ -13148,11 +13395,14 @@ static void task_fork_fair(struct task_struct *p) * the current task. */ static void -prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_fair(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (rq->cfs.nr_queued == 1) return; @@ -13164,8 +13414,9 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (task_current_donor(rq, p)) { if (p->prio > oldprio) resched_curr(rq); - } else + } else { wakeup_preempt(rq, p, 0); + } } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -13249,6 +13500,12 @@ static void attach_task_cfs_rq(struct task_struct *p) attach_entity_cfs_rq(se); } +static void switching_from_fair(struct rq *rq, struct task_struct *p) +{ + if (p->se.sched_delayed) + dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); +} + static void switched_from_fair(struct rq *rq, struct task_struct *p) { detach_task_cfs_rq(p); @@ -13322,7 +13579,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT_CACHED; - cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + cfs_rq->zero_vruntime = (u64)(-(1LL << 20)); raw_spin_lock_init(&cfs_rq->removed.lock); } @@ -13623,6 +13880,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task */ DEFINE_SCHED_CLASS(fair) = { + .queue_mask = 2, + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, @@ -13631,11 +13890,10 @@ DEFINE_SCHED_CLASS(fair) = { .wakeup_preempt = check_preempt_wakeup_fair, .pick_task = pick_task_fair, - .pick_next_task = __pick_next_task_fair, + .pick_next_task = pick_next_task_fair, .put_prev_task = put_prev_task_fair, .set_next_task = set_next_task_fair, - .balance = balance_fair, .select_task_rq = select_task_rq_fair, .migrate_task_rq = migrate_task_rq_fair, @@ -13650,6 +13908,7 @@ DEFINE_SCHED_CLASS(fair) = { .reweight_task = reweight_task_fair, .prio_changed = prio_changed_fair, + .switching_from = switching_from_fair, .switched_from = switched_from_fair, .switched_to = switched_to_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 3c12d9f93331..980d92bab8ab 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -29,7 +29,7 @@ SCHED_FEAT(PREEMPT_SHORT, true) * wakeup-preemption), since its likely going to consume data we * touched, increases cache locality. */ -SCHED_FEAT(NEXT_BUDDY, false) +SCHED_FEAT(NEXT_BUDDY, true) /* * Allow completely ignoring cfs_rq->next; which can be set from various @@ -121,3 +121,8 @@ SCHED_FEAT(WA_BIAS, true) SCHED_FEAT(UTIL_EST, true) SCHED_FEAT(LATENCY_WARN, false) + +/* + * Do newidle balancing proportional to its success rate using randomization. + */ +SCHED_FEAT(NI_RANDOM, true) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index c39b089d4f09..1cb7a3d70e65 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -452,9 +452,11 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) resched_curr(rq); } +static void update_curr_idle(struct rq *rq); + static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) { - dl_server_update_idle_time(rq, prev); + update_curr_idle(rq); scx_update_idle(rq, false, true); } @@ -466,7 +468,7 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir next->se.exec_start = rq_clock_task(rq); } -struct task_struct *pick_task_idle(struct rq *rq) +struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf) { scx_update_idle(rq, true, false); return rq->idle; @@ -496,21 +498,36 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) */ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) { + update_curr_idle(rq); } -static void switched_to_idle(struct rq *rq, struct task_struct *p) +static void switching_to_idle(struct rq *rq, struct task_struct *p) { BUG(); } static void -prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_idle(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); } static void update_curr_idle(struct rq *rq) { + struct sched_entity *se = &rq->idle->se; + u64 now = rq_clock_task(rq); + s64 delta_exec; + + delta_exec = now - se->exec_start; + if (unlikely(delta_exec <= 0)) + return; + + se->exec_start = now; + + dl_server_update_idle(&rq->fair_server, delta_exec); } /* @@ -518,6 +535,8 @@ static void update_curr_idle(struct rq *rq) */ DEFINE_SCHED_CLASS(idle) = { + .queue_mask = 0, + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ @@ -536,6 +555,6 @@ DEFINE_SCHED_CLASS(idle) = { .task_tick = task_tick_idle, .prio_changed = prio_changed_idle, - .switched_to = switched_to_idle, + .switching_to = switching_to_idle, .update_curr = update_curr_idle, }; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7936d4333731..f1867fe8e5c5 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1490,7 +1490,7 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) static void yield_task_rt(struct rq *rq) { - requeue_task_rt(rq, rq->curr, 0); + requeue_task_rt(rq, rq->donor, 0); } static int find_lowest_rq(struct task_struct *task); @@ -1695,7 +1695,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return rt_task_of(rt_se); } -static struct task_struct *pick_task_rt(struct rq *rq) +static struct task_struct *pick_task_rt(struct rq *rq, struct rq_flags *rf) { struct task_struct *p; @@ -2437,11 +2437,14 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) * us to initiate a push or pull. */ static void -prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_rt(struct rq *rq, struct task_struct *p, u64 oldprio) { if (!task_on_rq_queued(p)) return; + if (p->prio == oldprio) + return; + if (task_current_donor(rq, p)) { /* * If our priority decreases while running, we @@ -2566,6 +2569,8 @@ static int task_is_throttled_rt(struct task_struct *p, int cpu) DEFINE_SCHED_CLASS(rt) = { + .queue_mask = 4, + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, @@ -2589,8 +2594,8 @@ DEFINE_SCHED_CLASS(rt) = { .get_rr_interval = get_rr_interval_rt, - .prio_changed = prio_changed_rt, .switched_to = switched_to_rt, + .prio_changed = prio_changed_rt, .update_curr = update_curr_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index adfb6e3409d7..b419a4d98461 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -5,6 +5,7 @@ #ifndef _KERNEL_SCHED_SCHED_H #define _KERNEL_SCHED_SCHED_H +#include <linux/prandom.h> #include <linux/sched/affinity.h> #include <linux/sched/autogroup.h> #include <linux/sched/cpufreq.h> @@ -20,7 +21,6 @@ #include <linux/sched/task_flags.h> #include <linux/sched/task.h> #include <linux/sched/topology.h> - #include <linux/atomic.h> #include <linux/bitmap.h> #include <linux/bug.h> @@ -405,6 +405,7 @@ extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s6 * naturally thottled to once per period, avoiding high context switch * workloads from spamming the hrtimer program/cancel paths. */ +extern void dl_server_update_idle(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec); extern void dl_server_start(struct sched_dl_entity *dl_se); extern void dl_server_stop(struct sched_dl_entity *dl_se); @@ -412,8 +413,6 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, dl_server_pick_f pick_task); extern void sched_init_dl_servers(void); -extern void dl_server_update_idle_time(struct rq *rq, - struct task_struct *p); extern void fair_server_init(struct rq *rq); extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); extern int dl_server_apply_params(struct sched_dl_entity *dl_se, @@ -682,10 +681,10 @@ struct cfs_rq { s64 avg_vruntime; u64 avg_load; - u64 min_vruntime; + u64 zero_vruntime; #ifdef CONFIG_SCHED_CORE unsigned int forceidle_seq; - u64 min_vruntime_fi; + u64 zero_vruntime_fi; #endif struct rb_root_cached tasks_timeline; @@ -780,7 +779,6 @@ enum scx_rq_flags { */ SCX_RQ_ONLINE = 1 << 0, SCX_RQ_CAN_STOP_TICK = 1 << 1, - SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */ SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */ SCX_RQ_BYPASSING = 1 << 4, SCX_RQ_CLK_VALID = 1 << 5, /* RQ clock is fresh and valid */ @@ -1120,6 +1118,8 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; + /* Per class runqueue modification mask; bits in class order. */ + unsigned int queue_mask; unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1349,6 +1349,12 @@ static inline bool is_migration_disabled(struct task_struct *p) } DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +DECLARE_PER_CPU(struct rnd_state, sched_rnd_state); + +static inline u32 sched_rng(void) +{ + return prandom_u32_state(this_cpu_ptr(&sched_rnd_state)); +} #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() this_cpu_ptr(&runqueues) @@ -1432,6 +1438,9 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) if (!sched_core_enabled(rq)) return true; + if (rq->core->core_cookie == p->core_cookie) + return true; + for_each_cpu(cpu, cpu_smt_mask(cpu_of(rq))) { if (!available_idle_cpu(cpu)) { idle_core = false; @@ -1443,7 +1452,7 @@ static inline bool sched_core_cookie_match(struct rq *rq, struct task_struct *p) * A CPU in an idle core is always the best choice for tasks with * cookies. */ - return idle_core || rq->core->core_cookie == p->core_cookie; + return idle_core; } static inline bool sched_group_cookie_match(struct rq *rq, @@ -1827,7 +1836,8 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(p->pi_lock) __acquires(rq->lock); -static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) +static inline void +__task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) { rq_unpin_lock(rq, rf); @@ -1839,8 +1849,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - rq_unpin_lock(rq, rf); - raw_spin_rq_unlock(rq); + __task_rq_unlock(rq, p, rf); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } @@ -1849,6 +1858,11 @@ DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, task_rq_unlock(_T->rq, _T->lock, &_T->rf), struct rq *rq; struct rq_flags rf) +DEFINE_LOCK_GUARD_1(__task_rq_lock, struct task_struct, + _T->rq = __task_rq_lock(_T->lock, &_T->rf), + __task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) { @@ -2342,8 +2356,7 @@ extern const u32 sched_prio_to_wmult[40]; /* * {de,en}queue flags: * - * DEQUEUE_SLEEP - task is no longer runnable - * ENQUEUE_WAKEUP - task just became runnable + * SLEEP/WAKEUP - task is no-longer/just-became runnable * * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks * are in a known state which allows modification. Such pairs @@ -2356,34 +2369,46 @@ extern const u32 sched_prio_to_wmult[40]; * * MIGRATION - p->on_rq == TASK_ON_RQ_MIGRATING (used for DEADLINE) * + * DELAYED - de/re-queue a sched_delayed task + * + * CLASS - going to update p->sched_class; makes sched_change call the + * various switch methods. + * * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) * ENQUEUE_MIGRATED - the task was migrated during wakeup * ENQUEUE_RQ_SELECTED - ->select_task_rq() was called * + * XXX SAVE/RESTORE in combination with CLASS doesn't really make sense, but + * SCHED_DEADLINE seems to rely on this for now. */ -#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ -#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ -#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ -#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ -#define DEQUEUE_SPECIAL 0x10 -#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ -#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ -#define DEQUEUE_THROTTLE 0x800 - -#define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_RESTORE 0x02 -#define ENQUEUE_MOVE 0x04 -#define ENQUEUE_NOCLOCK 0x08 - -#define ENQUEUE_HEAD 0x10 -#define ENQUEUE_REPLENISH 0x20 -#define ENQUEUE_MIGRATED 0x40 -#define ENQUEUE_INITIAL 0x80 -#define ENQUEUE_MIGRATING 0x100 -#define ENQUEUE_DELAYED 0x200 -#define ENQUEUE_RQ_SELECTED 0x400 +#define DEQUEUE_SLEEP 0x0001 /* Matches ENQUEUE_WAKEUP */ +#define DEQUEUE_SAVE 0x0002 /* Matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x0004 /* Matches ENQUEUE_MOVE */ +#define DEQUEUE_NOCLOCK 0x0008 /* Matches ENQUEUE_NOCLOCK */ + +#define DEQUEUE_MIGRATING 0x0010 /* Matches ENQUEUE_MIGRATING */ +#define DEQUEUE_DELAYED 0x0020 /* Matches ENQUEUE_DELAYED */ +#define DEQUEUE_CLASS 0x0040 /* Matches ENQUEUE_CLASS */ + +#define DEQUEUE_SPECIAL 0x00010000 +#define DEQUEUE_THROTTLE 0x00020000 + +#define ENQUEUE_WAKEUP 0x0001 +#define ENQUEUE_RESTORE 0x0002 +#define ENQUEUE_MOVE 0x0004 +#define ENQUEUE_NOCLOCK 0x0008 + +#define ENQUEUE_MIGRATING 0x0010 +#define ENQUEUE_DELAYED 0x0020 +#define ENQUEUE_CLASS 0x0040 + +#define ENQUEUE_HEAD 0x00010000 +#define ENQUEUE_REPLENISH 0x00020000 +#define ENQUEUE_MIGRATED 0x00040000 +#define ENQUEUE_INITIAL 0x00080000 +#define ENQUEUE_RQ_SELECTED 0x00100000 #define RETRY_TASK ((void *)-1UL) @@ -2400,16 +2425,61 @@ struct sched_class { #ifdef CONFIG_UCLAMP_TASK int uclamp_enabled; #endif + /* + * idle: 0 + * ext: 1 + * fair: 2 + * rt: 4 + * dl: 8 + * stop: 16 + */ + unsigned int queue_mask; + /* + * move_queued_task/activate_task/enqueue_task: rq->lock + * ttwu_do_activate/activate_task/enqueue_task: rq->lock + * wake_up_new_task/activate_task/enqueue_task: task_rq_lock + * ttwu_runnable/enqueue_task: task_rq_lock + * proxy_task_current: rq->lock + * sched_change_end + */ void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + /* + * move_queued_task/deactivate_task/dequeue_task: rq->lock + * __schedule/block_task/dequeue_task: rq->lock + * proxy_task_current: rq->lock + * wait_task_inactive: task_rq_lock + * sched_change_begin + */ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + + /* + * do_sched_yield: rq->lock + */ void (*yield_task) (struct rq *rq); + /* + * yield_to: rq->lock (double) + */ bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + /* + * move_queued_task: rq->lock + * __migrate_swap_task: rq->lock + * ttwu_do_activate: rq->lock + * ttwu_runnable: task_rq_lock + * wake_up_new_task: task_rq_lock + */ void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + /* + * schedule/pick_next_task/prev_balance: rq->lock + */ int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - struct task_struct *(*pick_task)(struct rq *rq); + + /* + * schedule/pick_next_task: rq->lock + */ + struct task_struct *(*pick_task)(struct rq *rq, struct rq_flags *rf); /* * Optional! When implemented pick_next_task() should be equivalent to: * @@ -2419,55 +2489,123 @@ struct sched_class { * set_next_task_first(next); * } */ - struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); + /* + * sched_change: + * __schedule: rq->lock + */ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + /* + * select_task_rq: p->pi_lock + * sched_exec: p->pi_lock + */ int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + /* + * set_task_cpu: p->pi_lock || rq->lock (ttwu like) + */ void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + /* + * ttwu_do_activate: rq->lock + * wake_up_new_task: task_rq_lock + */ void (*task_woken)(struct rq *this_rq, struct task_struct *task); + /* + * do_set_cpus_allowed: task_rq_lock + sched_change + */ void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); + /* + * sched_set_rq_{on,off}line: rq->lock + */ void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + /* + * push_cpu_stop: p->pi_lock && rq->lock + */ struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); + /* + * hrtick: rq->lock + * sched_tick: rq->lock + * sched_tick_remote: rq->lock + */ void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); + /* + * sched_cgroup_fork: p->pi_lock + */ void (*task_fork)(struct task_struct *p); + /* + * finish_task_switch: no locks + */ void (*task_dead)(struct task_struct *p); /* - * The switched_from() call is allowed to drop rq->lock, therefore we - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. + * sched_change + */ + void (*switching_from)(struct rq *this_rq, struct task_struct *task); + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switching_to) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + u64 (*get_prio) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + u64 oldprio); + + /* + * set_load_weight: task_rq_lock + sched_change + * __setscheduler_parms: task_rq_lock + sched_change */ - void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); void (*reweight_task)(struct rq *this_rq, struct task_struct *task, const struct load_weight *lw); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); + /* + * sched_rr_get_interval: task_rq_lock + */ unsigned int (*get_rr_interval)(struct rq *rq, struct task_struct *task); + /* + * task_sched_runtime: task_rq_lock + */ void (*update_curr)(struct rq *rq); #ifdef CONFIG_FAIR_GROUP_SCHED + /* + * sched_change_group: task_rq_lock + sched_change + */ void (*task_change_group)(struct task_struct *p); #endif #ifdef CONFIG_SCHED_CORE + /* + * pick_next_task: rq->lock + * try_steal_cookie: rq->lock (double) + */ int (*task_is_throttled)(struct task_struct *p, int cpu); #endif }; +/* + * Does not nest; only used around sched_class::pick_task() rq-lock-breaks. + */ +static inline void rq_modified_clear(struct rq *rq) +{ + rq->queue_mask = 0; +} + +static inline bool rq_modified_above(struct rq *rq, const struct sched_class * class) +{ + unsigned int mask = class->queue_mask; + return rq->queue_mask & ~((mask << 1) - 1); +} + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->donor != prev); @@ -2579,8 +2717,9 @@ static inline bool sched_fair_runnable(struct rq *rq) return rq->cfs.nr_queued > 0; } -extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); -extern struct task_struct *pick_task_idle(struct rq *rq); +extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, + struct rq_flags *rf); +extern struct task_struct *pick_task_idle(struct rq *rq, struct rq_flags *rf); #define SCA_CHECK 0x01 #define SCA_MIGRATE_DISABLE 0x02 @@ -2610,7 +2749,7 @@ static inline bool task_allowed_on_cpu(struct task_struct *p, int cpu) static inline cpumask_t *alloc_user_cpus_ptr(int node) { /* - * See do_set_cpus_allowed() above for the rcu_head usage. + * See set_cpus_allowed_force() above for the rcu_head usage. */ int size = max_t(int, cpumask_size(), sizeof(struct rcu_head)); @@ -3875,32 +4014,42 @@ extern void set_load_weight(struct task_struct *p, bool update_load); extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_class_changing(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class); -extern void check_class_changed(struct rq *rq, struct task_struct *p, - const struct sched_class *prev_class, - int oldprio); - extern struct balance_callback *splice_balance_callbacks(struct rq *rq); extern void balance_callbacks(struct rq *rq, struct balance_callback *head); -#ifdef CONFIG_SCHED_CLASS_EXT /* - * Used by SCX in the enable/disable paths to move tasks between sched_classes - * and establish invariants. + * The 'sched_change' pattern is the safe, easy and slow way of changing a + * task's scheduling properties. It dequeues a task, such that the scheduler + * is fully unaware of it; at which point its properties can be modified; + * after which it is enqueued again. + * + * Typically this must be called while holding task_rq_lock, since most/all + * properties are serialized under those locks. There is currently one + * exception to this rule in sched/ext which only holds rq->lock. + */ + +/* + * This structure is a temporary, used to preserve/convey the queueing state + * of the task between sched_change_begin() and sched_change_end(). Ensuring + * the task's queueing state is idempotent across the operation. */ -struct sched_enq_and_set_ctx { +struct sched_change_ctx { + u64 prio; struct task_struct *p; - int queue_flags; + int flags; bool queued; bool running; }; -void sched_deq_and_put_task(struct task_struct *p, int queue_flags, - struct sched_enq_and_set_ctx *ctx); -void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); +struct sched_change_ctx *sched_change_begin(struct task_struct *p, unsigned int flags); +void sched_change_end(struct sched_change_ctx *ctx); -#endif /* CONFIG_SCHED_CLASS_EXT */ +DEFINE_CLASS(sched_change, struct sched_change_ctx *, + sched_change_end(_T), + sched_change_begin(p, flags), + struct task_struct *p, unsigned int flags) + +DEFINE_CLASS_IS_UNCONDITIONAL(sched_change) #include "ext.h" diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 26f3fd4d34ce..cbf7206b3f9d 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -206,7 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) rq = __task_rq_lock(p, &rf); psi_task_change(p, p->psi_flags, 0); - __task_rq_unlock(rq, &rf); + __task_rq_unlock(rq, p, &rf); } } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 2d4e279f05ee..4f9192be4b5b 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -32,7 +32,7 @@ static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool fir stop->se.exec_start = rq_clock_task(rq); } -static struct task_struct *pick_task_stop(struct rq *rq) +static struct task_struct *pick_task_stop(struct rq *rq, struct rq_flags *rf) { if (!sched_stop_runnable(rq)) return NULL; @@ -75,14 +75,17 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) { } -static void switched_to_stop(struct rq *rq, struct task_struct *p) +static void switching_to_stop(struct rq *rq, struct task_struct *p) { BUG(); /* its impossible to change to this class */ } static void -prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +prio_changed_stop(struct rq *rq, struct task_struct *p, u64 oldprio) { + if (p->prio == oldprio) + return; + BUG(); /* how!?, what priority? */ } @@ -95,6 +98,8 @@ static void update_curr_stop(struct rq *rq) */ DEFINE_SCHED_CLASS(stop) = { + .queue_mask = 16, + .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, @@ -112,6 +117,6 @@ DEFINE_SCHED_CLASS(stop) = { .task_tick = task_tick_stop, .prio_changed = prio_changed_stop, - .switched_to = switched_to_stop, + .switching_to = switching_to_stop, .update_curr = update_curr_stop, }; diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c index 77ae87f36e84..807879131add 100644 --- a/kernel/sched/syscalls.c +++ b/kernel/sched/syscalls.c @@ -64,8 +64,6 @@ static int effective_prio(struct task_struct *p) void set_user_nice(struct task_struct *p, long nice) { - bool queued, running; - struct rq *rq; int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) @@ -74,10 +72,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - CLASS(task_rq_lock, rq_guard)(p); - rq = rq_guard.rq; - - update_rq_clock(rq); + guard(task_rq_lock)(p); /* * The RT priorities are set via sched_setscheduler(), but we still @@ -90,28 +85,12 @@ void set_user_nice(struct task_struct *p, long nice) return; } - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); - if (running) - put_prev_task(rq, p); - - p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p, true); - old_prio = p->prio; - p->prio = effective_prio(p); - - if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); - if (running) - set_next_task(rq, p); - - /* - * If the task increased its priority or is running and - * lowered its priority, then reschedule its CPU: - */ - p->sched_class->prio_changed(rq, p, old_prio); + scoped_guard (sched_change, p, DEQUEUE_SAVE) { + p->static_prio = NICE_TO_PRIO(nice); + set_load_weight(p, true); + old_prio = p->prio; + p->prio = effective_prio(p); + } } EXPORT_SYMBOL(set_user_nice); @@ -515,7 +494,7 @@ int __sched_setscheduler(struct task_struct *p, bool user, bool pi) { int oldpolicy = -1, policy = attr->sched_policy; - int retval, oldprio, newprio, queued, running; + int retval, oldprio, newprio; const struct sched_class *prev_class, *next_class; struct balance_callback *head; struct rq_flags rf; @@ -695,38 +674,27 @@ change: prev_class = p->sched_class; next_class = __setscheduler_class(policy, newprio); - if (prev_class != next_class && p->se.sched_delayed) - dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK); - - queued = task_on_rq_queued(p); - running = task_current_donor(rq, p); - if (queued) - dequeue_task(rq, p, queue_flags); - if (running) - put_prev_task(rq, p); + if (prev_class != next_class) + queue_flags |= DEQUEUE_CLASS; - if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { - __setscheduler_params(p, attr); - p->sched_class = next_class; - p->prio = newprio; - } - __setscheduler_uclamp(p, attr); - check_class_changing(rq, p, prev_class); + scoped_guard (sched_change, p, queue_flags) { - if (queued) { - /* - * We enqueue to tail when the priority of a task is - * increased (user space view). - */ - if (oldprio < p->prio) - queue_flags |= ENQUEUE_HEAD; + if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) { + __setscheduler_params(p, attr); + p->sched_class = next_class; + p->prio = newprio; + } + __setscheduler_uclamp(p, attr); - enqueue_task(rq, p, queue_flags); + if (scope->queued) { + /* + * We enqueue to tail when the priority of a task is + * increased (user space view). + */ + if (oldprio < p->prio) + scope->flags |= ENQUEUE_HEAD; + } } - if (running) - set_next_task(rq, p); - - check_class_changed(rq, p, prev_class, oldprio); /* Avoid rq from going away on us: */ preempt_disable(); @@ -1351,7 +1319,7 @@ static void do_sched_yield(void) rq = this_rq_lock_irq(&rf); schedstat_inc(rq->yld_count); - current->sched_class->yield_task(rq); + rq->donor->sched_class->yield_task(rq); preempt_disable(); rq_unlock_irq(rq, &rf); @@ -1420,12 +1388,13 @@ EXPORT_SYMBOL(yield); */ int __sched yield_to(struct task_struct *p, bool preempt) { - struct task_struct *curr = current; + struct task_struct *curr; struct rq *rq, *p_rq; int yielded = 0; scoped_guard (raw_spinlock_irqsave, &p->pi_lock) { rq = this_rq(); + curr = rq->donor; again: p_rq = task_rq(p); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 444bdfdab731..cf643a5ddedd 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1590,10 +1590,17 @@ static void claim_allocations(int cpu, struct sched_domain *sd) #ifdef CONFIG_NUMA enum numa_topology_type sched_numa_topology_type; +/* + * sched_domains_numa_distance is derived from sched_numa_node_distance + * and provides a simplified view of NUMA distances used specifically + * for building NUMA scheduling domains. + */ static int sched_domains_numa_levels; +static int sched_numa_node_levels; int sched_max_numa_distance; static int *sched_domains_numa_distance; +static int *sched_numa_node_distance; static struct cpumask ***sched_domains_numa_masks; #endif /* CONFIG_NUMA */ @@ -1662,6 +1669,12 @@ sd_init(struct sched_domain_topology_level *tl, .last_balance = jiffies, .balance_interval = sd_weight, + + /* 50% success rate */ + .newidle_call = 512, + .newidle_success = 256, + .newidle_ratio = 512, + .max_newidle_lb_cost = 0, .last_decay_max_lb_cost = jiffies, .child = child, @@ -1845,10 +1858,10 @@ bool find_numa_distance(int distance) return true; rcu_read_lock(); - distances = rcu_dereference(sched_domains_numa_distance); + distances = rcu_dereference(sched_numa_node_distance); if (!distances) goto unlock; - for (i = 0; i < sched_domains_numa_levels; i++) { + for (i = 0; i < sched_numa_node_levels; i++) { if (distances[i] == distance) { found = true; break; @@ -1924,14 +1937,34 @@ static void init_numa_topology_type(int offline_node) #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) -void sched_init_numa(int offline_node) +/* + * An architecture could modify its NUMA distance, to change + * grouping of NUMA nodes and number of NUMA levels when creating + * NUMA level sched domains. + * + * A NUMA level is created for each unique + * arch_sched_node_distance. + */ +static int numa_node_dist(int i, int j) { - struct sched_domain_topology_level *tl; - unsigned long *distance_map; + return node_distance(i, j); +} + +int arch_sched_node_distance(int from, int to) + __weak __alias(numa_node_dist); + +static bool modified_sched_node_distance(void) +{ + return numa_node_dist != arch_sched_node_distance; +} + +static int sched_record_numa_dist(int offline_node, int (*n_dist)(int, int), + int **dist, int *levels) +{ + unsigned long *distance_map __free(bitmap) = NULL; int nr_levels = 0; int i, j; int *distances; - struct cpumask ***masks; /* * O(nr_nodes^2) de-duplicating selection sort -- in order to find the @@ -1939,17 +1972,16 @@ void sched_init_numa(int offline_node) */ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); if (!distance_map) - return; + return -ENOMEM; bitmap_zero(distance_map, NR_DISTANCE_VALUES); for_each_cpu_node_but(i, offline_node) { for_each_cpu_node_but(j, offline_node) { - int distance = node_distance(i, j); + int distance = n_dist(i, j); if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { sched_numa_warn("Invalid distance value range"); - bitmap_free(distance_map); - return; + return -EINVAL; } bitmap_set(distance_map, distance, 1); @@ -1962,18 +1994,46 @@ void sched_init_numa(int offline_node) nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); distances = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); - if (!distances) { - bitmap_free(distance_map); - return; - } + if (!distances) + return -ENOMEM; for (i = 0, j = 0; i < nr_levels; i++, j++) { j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); distances[i] = j; } - rcu_assign_pointer(sched_domains_numa_distance, distances); + *dist = distances; + *levels = nr_levels; - bitmap_free(distance_map); + return 0; +} + +void sched_init_numa(int offline_node) +{ + struct sched_domain_topology_level *tl; + int nr_levels, nr_node_levels; + int i, j; + int *distances, *domain_distances; + struct cpumask ***masks; + + /* Record the NUMA distances from SLIT table */ + if (sched_record_numa_dist(offline_node, numa_node_dist, &distances, + &nr_node_levels)) + return; + + /* Record modified NUMA distances for building sched domains */ + if (modified_sched_node_distance()) { + if (sched_record_numa_dist(offline_node, arch_sched_node_distance, + &domain_distances, &nr_levels)) { + kfree(distances); + return; + } + } else { + domain_distances = distances; + nr_levels = nr_node_levels; + } + rcu_assign_pointer(sched_numa_node_distance, distances); + WRITE_ONCE(sched_max_numa_distance, distances[nr_node_levels - 1]); + WRITE_ONCE(sched_numa_node_levels, nr_node_levels); /* * 'nr_levels' contains the number of unique distances @@ -1991,6 +2051,8 @@ void sched_init_numa(int offline_node) * * We reset it to 'nr_levels' at the end of this function. */ + rcu_assign_pointer(sched_domains_numa_distance, domain_distances); + sched_domains_numa_levels = 0; masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); @@ -2016,10 +2078,13 @@ void sched_init_numa(int offline_node) masks[i][j] = mask; for_each_cpu_node_but(k, offline_node) { - if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + if (sched_debug() && + (arch_sched_node_distance(j, k) != + arch_sched_node_distance(k, j))) sched_numa_warn("Node-distance not symmetric"); - if (node_distance(j, k) > sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, k) > + sched_domains_numa_distance[i]) continue; cpumask_or(mask, mask, cpumask_of_node(k)); @@ -2059,7 +2124,6 @@ void sched_init_numa(int offline_node) sched_domain_topology = tl; sched_domains_numa_levels = nr_levels; - WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); init_numa_topology_type(offline_node); } @@ -2067,14 +2131,18 @@ void sched_init_numa(int offline_node) static void sched_reset_numa(void) { - int nr_levels, *distances; + int nr_levels, *distances, *dom_distances = NULL; struct cpumask ***masks; nr_levels = sched_domains_numa_levels; + sched_numa_node_levels = 0; sched_domains_numa_levels = 0; sched_max_numa_distance = 0; sched_numa_topology_type = NUMA_DIRECT; - distances = sched_domains_numa_distance; + distances = sched_numa_node_distance; + if (sched_numa_node_distance != sched_domains_numa_distance) + dom_distances = sched_domains_numa_distance; + rcu_assign_pointer(sched_numa_node_distance, NULL); rcu_assign_pointer(sched_domains_numa_distance, NULL); masks = sched_domains_numa_masks; rcu_assign_pointer(sched_domains_numa_masks, NULL); @@ -2083,6 +2151,7 @@ static void sched_reset_numa(void) synchronize_rcu(); kfree(distances); + kfree(dom_distances); for (i = 0; i < nr_levels && masks; i++) { if (!masks[i]) continue; @@ -2129,7 +2198,8 @@ void sched_domains_numa_masks_set(unsigned int cpu) continue; /* Set ourselves in the remote node's masks */ - if (node_distance(j, node) <= sched_domains_numa_distance[i]) + if (arch_sched_node_distance(j, node) <= + sched_domains_numa_distance[i]) cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]); } } diff --git a/kernel/task_work.c b/kernel/task_work.c index d1efec571a4a..0f7519f8e7c9 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -9,7 +9,12 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ #ifdef CONFIG_IRQ_WORK static void task_work_set_notify_irq(struct irq_work *entry) { - test_and_set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); + /* + * no-op IPI + * + * TWA_NMI_CURRENT will already have set the TIF flag, all + * this interrupt does it tickle the return-to-user path. + */ } static DEFINE_PER_CPU(struct irq_work, irq_work_NMI_resume) = IRQ_WORK_INIT_HARD(task_work_set_notify_irq); @@ -86,6 +91,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, break; #ifdef CONFIG_IRQ_WORK case TWA_NMI_CURRENT: + set_tsk_thread_flag(current, TIF_NOTIFY_RESUME); irq_work_queue(this_cpu_ptr(&irq_work_NMI_resume)); break; #endif diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 5b6997f4dc3d..e76be24b132c 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -478,11 +478,8 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .ns.ns_type = ns_common_type(&init_time_ns), - .ns.__ns_ref = REFCOUNT_INIT(3), + .ns = NS_COMMON_INIT(init_time_ns), .user_ns = &init_user_ns, - .ns.inum = ns_init_inum(&init_time_ns), - .ns.ops = &timens_operations, .frozen_offsets = true, }; diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index aa3120104a51..56e17b625c72 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -475,12 +475,6 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, if (!kc->timer_create) return -EOPNOTSUPP; - new_timer = alloc_posix_timer(); - if (unlikely(!new_timer)) - return -EAGAIN; - - spin_lock_init(&new_timer->it_lock); - /* Special case for CRIU to restore timers with a given timer ID. */ if (unlikely(current->signal->timer_create_restore_ids)) { if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) @@ -490,6 +484,12 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, return -EINVAL; } + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index c527b421c865..466e083c8272 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1152,16 +1152,15 @@ static bool report_idle_softirq(void) return false; } - if (ratelimit >= 10) - return false; - /* On RT, softirq handling may be waiting on some lock */ if (local_bh_blocked()) return false; - pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", - pending); - ratelimit++; + if (ratelimit < 10) { + pr_warn("NOHZ tick-stop error: local softirq work is pending, handler #%02x!!!\n", + pending); + ratelimit++; + } return true; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3a4d3b2e3f74..4790da895203 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3060,29 +3060,34 @@ static const struct attribute_group aux_clock_enable_attr_group = { static int __init tk_aux_sysfs_init(void) { struct kobject *auxo, *tko = kobject_create_and_add("time", kernel_kobj); + int ret = -ENOMEM; if (!tko) - return -ENOMEM; + return ret; auxo = kobject_create_and_add("aux_clocks", tko); - if (!auxo) { - kobject_put(tko); - return -ENOMEM; - } + if (!auxo) + goto err_clean; for (int i = 0; i < MAX_AUX_CLOCKS; i++) { char id[2] = { [0] = '0' + i, }; struct kobject *clk = kobject_create_and_add(id, auxo); - if (!clk) - return -ENOMEM; - - int ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); + if (!clk) { + ret = -ENOMEM; + goto err_clean; + } + ret = sysfs_create_group(clk, &aux_clock_enable_attr_group); if (ret) - return ret; + goto err_clean; } return 0; + +err_clean: + kobject_put(auxo); + kobject_put(tko); + return ret; } late_initcall(tk_aux_sysfs_init); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 553fa469d7cc..d5ebb1d927ea 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1458,10 +1458,11 @@ static int __try_to_del_timer_sync(struct timer_list *timer, bool shutdown) base = lock_timer_base(timer, &flags); - if (base->running_timer != timer) + if (base->running_timer != timer) { ret = detach_if_pending(timer, base, true); - if (shutdown) - timer->function = NULL; + if (shutdown) + timer->function = NULL; + } raw_spin_unlock_irqrestore(&base->lock, flags); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 42bd2ba68a82..59cfacb8a5bb 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1971,7 +1971,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops) */ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, struct ftrace_hash *old_hash, - struct ftrace_hash *new_hash) + struct ftrace_hash *new_hash, + bool update_target) { struct ftrace_page *pg; struct dyn_ftrace *rec, *end = NULL; @@ -2006,10 +2007,13 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (rec->flags & FTRACE_FL_DISABLED) continue; - /* We need to update only differences of filter_hash */ + /* + * Unless we are updating the target of a direct function, + * we only need to update differences of filter_hash + */ in_old = !!ftrace_lookup_ip(old_hash, rec->ip); in_new = !!ftrace_lookup_ip(new_hash, rec->ip); - if (in_old == in_new) + if (!update_target && (in_old == in_new)) continue; if (in_new) { @@ -2020,7 +2024,16 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops, if (is_ipmodify) goto rollback; - FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT); + /* + * If this is called by __modify_ftrace_direct() + * then it is only changing where the direct + * pointer is jumping to, and the record already + * points to a direct trampoline. If it isn't, + * then it is a bug to update ipmodify on a direct + * caller. + */ + FTRACE_WARN_ON(!update_target && + (rec->flags & FTRACE_FL_DIRECT)); /* * Another ops with IPMODIFY is already @@ -2076,7 +2089,7 @@ static int ftrace_hash_ipmodify_enable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash); + return __ftrace_hash_update_ipmodify(ops, EMPTY_HASH, hash, false); } /* Disabling always succeeds */ @@ -2087,7 +2100,7 @@ static void ftrace_hash_ipmodify_disable(struct ftrace_ops *ops) if (ftrace_hash_empty(hash)) hash = NULL; - __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH); + __ftrace_hash_update_ipmodify(ops, hash, EMPTY_HASH, false); } static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, @@ -2101,7 +2114,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, if (ftrace_hash_empty(new_hash)) new_hash = NULL; - return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); + return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash, false); } static void print_ip_ins(const char *fmt, const unsigned char *p) @@ -5953,6 +5966,17 @@ static void register_ftrace_direct_cb(struct rcu_head *rhp) free_ftrace_hash(fhp); } +static void reset_direct(struct ftrace_ops *ops, unsigned long addr) +{ + struct ftrace_hash *hash = ops->func_hash->filter_hash; + + remove_direct_functions_hash(hash, addr); + + /* cleanup for possible another register call */ + ops->func = NULL; + ops->trampoline = 0; +} + /** * register_ftrace_direct - Call a custom trampoline directly * for multiple functions registered in @ops @@ -6048,6 +6072,8 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) ops->direct_call = addr; err = register_ftrace_function_nolock(ops); + if (err) + reset_direct(ops, addr); out_unlock: mutex_unlock(&direct_mutex); @@ -6080,7 +6106,6 @@ EXPORT_SYMBOL_GPL(register_ftrace_direct); int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, bool free_filters) { - struct ftrace_hash *hash = ops->func_hash->filter_hash; int err; if (check_direct_multi(ops)) @@ -6090,13 +6115,9 @@ int unregister_ftrace_direct(struct ftrace_ops *ops, unsigned long addr, mutex_lock(&direct_mutex); err = unregister_ftrace_function(ops); - remove_direct_functions_hash(hash, addr); + reset_direct(ops, addr); mutex_unlock(&direct_mutex); - /* cleanup for possible another register call */ - ops->func = NULL; - ops->trampoline = 0; - if (free_filters) ftrace_free_filter(ops); return err; @@ -6106,7 +6127,7 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_direct); static int __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) { - struct ftrace_hash *hash; + struct ftrace_hash *hash = ops->func_hash->filter_hash; struct ftrace_func_entry *entry, *iter; static struct ftrace_ops tmp_ops = { .func = ftrace_stub, @@ -6127,12 +6148,20 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) return err; /* + * Call __ftrace_hash_update_ipmodify() here, so that we can call + * ops->ops_func for the ops. This is needed because the above + * register_ftrace_function_nolock() worked on tmp_ops. + */ + err = __ftrace_hash_update_ipmodify(ops, hash, hash, true); + if (err) + goto out; + + /* * Now the ftrace_ops_list_func() is called to do the direct callers. * We can safely change the direct functions attached to each entry. */ mutex_lock(&ftrace_lock); - hash = ops->func_hash->filter_hash; size = 1 << hash->size_bits; for (i = 0; i < size; i++) { hlist_for_each_entry(iter, &hash->buckets[i], hlist) { @@ -6147,6 +6176,7 @@ __modify_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) mutex_unlock(&ftrace_lock); +out: /* Removing the tmp_ops will add the updated direct callers to the functions */ unregister_ftrace_function(&tmp_ops); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1244d2c5c384..afcd3747264d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -7344,6 +7344,10 @@ consume: goto out; } + /* Did the reader catch up with the writer? */ + if (cpu_buffer->reader_page == cpu_buffer->commit_page) + goto out; + reader = rb_get_reader_page(cpu_buffer); if (WARN_ON(!reader)) goto out; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d1e527cf2aae..304e93597126 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -8781,8 +8781,18 @@ static void tracing_buffers_mmap_close(struct vm_area_struct *vma) put_snapshot_map(iter->tr); } +static int tracing_buffers_may_split(struct vm_area_struct *vma, unsigned long addr) +{ + /* + * Trace buffer mappings require the complete buffer including + * the meta page. Partial mappings are not supported. + */ + return -EINVAL; +} + static const struct vm_operations_struct tracing_buffers_vmops = { .close = tracing_buffers_mmap_close, + .may_split = tracing_buffers_may_split, }; static int tracing_buffers_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 1d536219b624..6bfaf1210dd2 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3272,14 +3272,16 @@ static struct field_var *create_field_var(struct hist_trigger_data *hist_data, var = create_var(hist_data, file, field_name, val->size, val->type); if (IS_ERR(var)) { hist_err(tr, HIST_ERR_VAR_CREATE_FIND_FAIL, errpos(field_name)); - kfree(val); + destroy_hist_field(val, 0); ret = PTR_ERR(var); goto err; } field_var = kzalloc(sizeof(struct field_var), GFP_KERNEL); if (!field_var) { - kfree(val); + destroy_hist_field(val, 0); + kfree_const(var->type); + kfree(var->var.name); kfree(var); ret = -ENOMEM; goto err; diff --git a/kernel/trace/trace_events_user.c b/kernel/trace/trace_events_user.c index c428dafe7496..b15854c75d4f 100644 --- a/kernel/trace/trace_events_user.c +++ b/kernel/trace/trace_events_user.c @@ -1449,12 +1449,7 @@ static struct trace_event_functions user_event_funcs = { static int user_event_set_call_visible(struct user_event *user, bool visible) { - int ret; - const struct cred *old_cred; - struct cred *cred; - - cred = prepare_creds(); - + CLASS(prepare_creds, cred)(); if (!cred) return -ENOMEM; @@ -1469,17 +1464,12 @@ static int user_event_set_call_visible(struct user_event *user, bool visible) */ cred->fsuid = GLOBAL_ROOT_UID; - old_cred = override_creds(cred); - - if (visible) - ret = trace_add_event_call(&user->call); - else - ret = trace_remove_event_call(&user->call); - - revert_creds(old_cred); - put_cred(cred); + scoped_with_creds(cred) { + if (visible) + return trace_add_event_call(&user->call); - return ret; + return trace_remove_event_call(&user->call); + } } static int destroy_user_event(struct user_event *user) diff --git a/kernel/trace/trace_fprobe.c b/kernel/trace/trace_fprobe.c index ad9d6347b5fa..8001dbf16891 100644 --- a/kernel/trace/trace_fprobe.c +++ b/kernel/trace/trace_fprobe.c @@ -106,13 +106,14 @@ static struct tracepoint_user *__tracepoint_user_init(const char *name, struct t if (!tuser->name) return NULL; + /* Register tracepoint if it is loaded. */ if (tpoint) { + tuser->tpoint = tpoint; ret = tracepoint_user_register(tuser); if (ret) return ERR_PTR(ret); } - tuser->tpoint = tpoint; tuser->refcount = 1; INIT_LIST_HEAD(&tuser->list); list_add(&tuser->list, &tracepoint_user_list); @@ -1513,6 +1514,10 @@ static int disable_trace_fprobe(struct trace_event_call *call, if (!trace_probe_is_enabled(tp)) { list_for_each_entry(tf, trace_probe_probe_list(tp), tp.list) { unregister_fprobe(&tf->fp); + if (tf->tuser) { + tracepoint_user_put(tf->tuser); + tf->tuser = NULL; + } } } diff --git a/kernel/unwind/deferred.c b/kernel/unwind/deferred.c index dc6040aae3ee..a88fb481c4a3 100644 --- a/kernel/unwind/deferred.c +++ b/kernel/unwind/deferred.c @@ -53,7 +53,7 @@ DEFINE_STATIC_SRCU(unwind_srcu); static inline bool unwind_pending(struct unwind_task_info *info) { - return test_bit(UNWIND_PENDING_BIT, &info->unwind_mask); + return atomic_long_read(&info->unwind_mask) & UNWIND_PENDING; } /* @@ -79,6 +79,8 @@ static u64 get_cookie(struct unwind_task_info *info) { u32 cnt = 1; + lockdep_assert_irqs_disabled(); + if (info->id.cpu) return info->id.id; @@ -126,23 +128,20 @@ int unwind_user_faultable(struct unwind_stacktrace *trace) cache = info->cache; trace->entries = cache->entries; - - if (cache->nr_entries) { - /* - * The user stack has already been previously unwound in this - * entry context. Skip the unwind and use the cache. - */ - trace->nr = cache->nr_entries; + trace->nr = cache->nr_entries; + /* + * The user stack has already been previously unwound in this + * entry context. Skip the unwind and use the cache. + */ + if (trace->nr) return 0; - } - trace->nr = 0; unwind_user(trace, UNWIND_MAX_ENTRIES); cache->nr_entries = trace->nr; /* Clear nr_entries on way back to user space */ - set_bit(UNWIND_USED_BIT, &info->unwind_mask); + atomic_long_or(UNWIND_USED, &info->unwind_mask); return 0; } @@ -160,7 +159,7 @@ static void process_unwind_deferred(struct task_struct *task) /* Clear pending bit but make sure to have the current bits */ bits = atomic_long_fetch_andnot(UNWIND_PENDING, - (atomic_long_t *)&info->unwind_mask); + &info->unwind_mask); /* * From here on out, the callback must always be called, even if it's * just an empty trace. @@ -231,6 +230,7 @@ void unwind_deferred_task_exit(struct task_struct *task) int unwind_deferred_request(struct unwind_work *work, u64 *cookie) { struct unwind_task_info *info = ¤t->unwind_info; + int twa_mode = TWA_RESUME; unsigned long old, bits; unsigned long bit; int ret; @@ -246,8 +246,11 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * Trigger a warning to make it obvious that an architecture * is using this in NMI when it should not be. */ - if (WARN_ON_ONCE(!CAN_USE_IN_NMI && in_nmi())) - return -EINVAL; + if (in_nmi()) { + if (WARN_ON_ONCE(!CAN_USE_IN_NMI)) + return -EINVAL; + twa_mode = TWA_NMI_CURRENT; + } /* Do not allow cancelled works to request again */ bit = READ_ONCE(work->bit); @@ -261,7 +264,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) *cookie = get_cookie(info); - old = READ_ONCE(info->unwind_mask); + old = atomic_long_read(&info->unwind_mask); /* Is this already queued or executed */ if (old & bit) @@ -274,7 +277,7 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) * to have a callback. */ bits = UNWIND_PENDING | bit; - old = atomic_long_fetch_or(bits, (atomic_long_t *)&info->unwind_mask); + old = atomic_long_fetch_or(bits, &info->unwind_mask); if (old & bits) { /* * If the work's bit was set, whatever set it had better @@ -285,10 +288,10 @@ int unwind_deferred_request(struct unwind_work *work, u64 *cookie) } /* The work has been claimed, now schedule it. */ - ret = task_work_add(current, &info->work, TWA_RESUME); + ret = task_work_add(current, &info->work, twa_mode); if (WARN_ON_ONCE(ret)) - WRITE_ONCE(info->unwind_mask, 0); + atomic_long_set(&info->unwind_mask, 0); return ret; } @@ -320,7 +323,8 @@ void unwind_deferred_cancel(struct unwind_work *work) guard(rcu)(); /* Clear this bit from all threads */ for_each_process_thread(g, t) { - clear_bit(bit, &t->unwind_info.unwind_mask); + atomic_long_andnot(BIT(bit), + &t->unwind_info.unwind_mask); if (t->unwind_info.cache) clear_bit(bit, &t->unwind_info.cache->unwind_completed); } @@ -350,7 +354,7 @@ void unwind_task_init(struct task_struct *task) memset(info, 0, sizeof(*info)); init_task_work(&info->work, unwind_deferred_task_work); - info->unwind_mask = 0; + atomic_long_set(&info->unwind_mask, 0); } void unwind_task_free(struct task_struct *task) diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 97a8415e3216..39e270789444 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -8,18 +8,28 @@ #include <linux/unwind_user.h> #include <linux/uaccess.h> -static const struct unwind_user_frame fp_frame = { - ARCH_INIT_USER_FP_FRAME -}; - #define for_each_user_frame(state) \ for (unwind_user_start(state); !(state)->done; unwind_user_next(state)) -static int unwind_user_next_fp(struct unwind_user_state *state) +static inline int +get_user_word(unsigned long *word, unsigned long base, int off, unsigned int ws) +{ + unsigned long __user *addr = (void __user *)base + off; +#ifdef CONFIG_COMPAT + if (ws == sizeof(int)) { + unsigned int data; + int ret = get_user(data, (unsigned int __user *)addr); + *word = data; + return ret; + } +#endif + return get_user(*word, addr); +} + +static int unwind_user_next_common(struct unwind_user_state *state, + const struct unwind_user_frame *frame) { - const struct unwind_user_frame *frame = &fp_frame; unsigned long cfa, fp, ra; - unsigned int shift; if (frame->use_fp) { if (state->fp < state->sp) @@ -37,24 +47,45 @@ static int unwind_user_next_fp(struct unwind_user_state *state) return -EINVAL; /* Make sure that the address is word aligned */ - shift = sizeof(long) == 4 ? 2 : 3; - if (cfa & ((1 << shift) - 1)) + if (cfa & (state->ws - 1)) return -EINVAL; /* Find the Return Address (RA) */ - if (get_user(ra, (unsigned long *)(cfa + frame->ra_off))) + if (get_user_word(&ra, cfa, frame->ra_off, state->ws)) return -EINVAL; - if (frame->fp_off && get_user(fp, (unsigned long __user *)(cfa + frame->fp_off))) + if (frame->fp_off && get_user_word(&fp, cfa, frame->fp_off, state->ws)) return -EINVAL; state->ip = ra; state->sp = cfa; if (frame->fp_off) state->fp = fp; + state->topmost = false; return 0; } +static int unwind_user_next_fp(struct unwind_user_state *state) +{ +#ifdef CONFIG_HAVE_UNWIND_USER_FP + struct pt_regs *regs = task_pt_regs(current); + + if (state->topmost && unwind_user_at_function_start(regs)) { + const struct unwind_user_frame fp_entry_frame = { + ARCH_INIT_USER_FP_ENTRY_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_entry_frame); + } + + const struct unwind_user_frame fp_frame = { + ARCH_INIT_USER_FP_FRAME(state->ws) + }; + return unwind_user_next_common(state, &fp_frame); +#else + return -EINVAL; +#endif +} + static int unwind_user_next(struct unwind_user_state *state) { unsigned long iter_mask = state->available_types; @@ -102,6 +133,12 @@ static int unwind_user_start(struct unwind_user_state *state) state->ip = instruction_pointer(regs); state->sp = user_stack_pointer(regs); state->fp = frame_pointer(regs); + state->ws = unwind_user_word_size(regs); + if (!state->ws) { + state->done = true; + return -EINVAL; + } + state->topmost = true; return 0; } diff --git a/kernel/user.c b/kernel/user.c index 0163665914c9..7aef4e679a6a 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -35,6 +35,7 @@ EXPORT_SYMBOL_GPL(init_binfmt_misc); * and 1 for... ? */ struct user_namespace init_user_ns = { + .ns = NS_COMMON_INIT(init_user_ns), .uid_map = { { .extent[0] = { @@ -65,14 +66,8 @@ struct user_namespace init_user_ns = { .nr_extents = 1, }, }, - .ns.ns_type = ns_common_type(&init_user_ns), - .ns.__ns_ref = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, - .ns.inum = ns_init_inum(&init_user_ns), -#ifdef CONFIG_USER_NS - .ns.ops = &userns_operations, -#endif .flags = USERNS_INIT_FLAGS, #ifdef CONFIG_KEYS .keyring_name_list = LIST_HEAD_INIT(init_user_ns.keyring_name_list), diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 7e45559521af..52f89f1137da 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -119,9 +119,9 @@ static bool post_one_notification(struct watch_queue *wqueue, offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE; get_page(page); len = n->info & WATCH_INFO_LENGTH; - p = kmap_atomic(page); + p = kmap_local_page(page); memcpy(p + offset, n, len); - kunmap_atomic(p); + kunmap_local(p); buf = pipe_buf(pipe, head); buf->page = page; |