summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpu.c19
-rw-r--r--kernel/entry/common.c39
-rw-r--r--kernel/entry/syscall-common.c8
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/ptrace.c6
-rw-r--r--kernel/rseq.c655
-rw-r--r--kernel/sched/core.c830
-rw-r--r--kernel/sched/membarrier.c8
-rw-r--r--kernel/sched/sched.h392
-rw-r--r--kernel/signal.c2
11 files changed, 893 insertions, 1074 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index db9f6c539b28..b674fdf96208 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -3085,10 +3085,13 @@ EXPORT_SYMBOL(cpu_all_bits);
#ifdef CONFIG_INIT_ALL_POSSIBLE
struct cpumask __cpu_possible_mask __ro_after_init
= {CPU_BITS_ALL};
+unsigned int __num_possible_cpus __ro_after_init = NR_CPUS;
#else
struct cpumask __cpu_possible_mask __ro_after_init;
+unsigned int __num_possible_cpus __ro_after_init;
#endif
EXPORT_SYMBOL(__cpu_possible_mask);
+EXPORT_SYMBOL(__num_possible_cpus);
struct cpumask __cpu_online_mask __read_mostly;
EXPORT_SYMBOL(__cpu_online_mask);
@@ -3116,6 +3119,7 @@ void init_cpu_present(const struct cpumask *src)
void init_cpu_possible(const struct cpumask *src)
{
cpumask_copy(&__cpu_possible_mask, src);
+ __num_possible_cpus = cpumask_weight(&__cpu_possible_mask);
}
void set_cpu_online(unsigned int cpu, bool online)
@@ -3140,6 +3144,21 @@ void set_cpu_online(unsigned int cpu, bool online)
}
/*
+ * This should be marked __init, but there is a boatload of call sites
+ * which need to be fixed up to do so. Sigh...
+ */
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+ if (possible) {
+ if (!cpumask_test_and_set_cpu(cpu, &__cpu_possible_mask))
+ __num_possible_cpus++;
+ } else {
+ if (cpumask_test_and_clear_cpu(cpu, &__cpu_possible_mask))
+ __num_possible_cpus--;
+ }
+}
+
+/*
* Activate the first processor.
*/
void __init boot_cpu_init(void)
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index f62e1d1b2063..5c792b30c58a 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -11,19 +11,20 @@
/* Workaround to allow gradual conversion of architecture code */
void __weak arch_do_signal_or_restart(struct pt_regs *regs) { }
-/**
- * exit_to_user_mode_loop - do any pending work before leaving to user space
- * @regs: Pointer to pt_regs on entry stack
- * @ti_work: TIF work flags as read by the caller
- */
-__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
- unsigned long ti_work)
+#ifdef CONFIG_HAVE_GENERIC_TIF_BITS
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK & ~_TIF_RSEQ)
+#else
+#define EXIT_TO_USER_MODE_WORK_LOOP (EXIT_TO_USER_MODE_WORK)
+#endif
+
+static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
/*
* Before returning to user space ensure that all pending work
* items have been completed.
*/
- while (ti_work & EXIT_TO_USER_MODE_WORK) {
+ while (ti_work & EXIT_TO_USER_MODE_WORK_LOOP) {
local_irq_enable_exit_to_user(ti_work);
@@ -62,17 +63,21 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
return ti_work;
}
-noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
+/**
+ * exit_to_user_mode_loop - do any pending work before leaving to user space
+ * @regs: Pointer to pt_regs on entry stack
+ * @ti_work: TIF work flags as read by the caller
+ */
+__always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
+ unsigned long ti_work)
{
- enter_from_user_mode(regs);
-}
+ for (;;) {
+ ti_work = __exit_to_user_mode_loop(regs, ti_work);
-noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
-{
- instrumentation_begin();
- exit_to_user_mode_prepare(regs);
- instrumentation_end();
- exit_to_user_mode();
+ if (likely(!rseq_exit_to_user_mode_restart(regs, ti_work)))
+ return ti_work;
+ ti_work = read_thread_flags();
+ }
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
diff --git a/kernel/entry/syscall-common.c b/kernel/entry/syscall-common.c
index 66e6ba7fa80c..940a597ded40 100644
--- a/kernel/entry/syscall-common.c
+++ b/kernel/entry/syscall-common.c
@@ -63,14 +63,6 @@ long syscall_trace_enter(struct pt_regs *regs, long syscall,
return ret ? : syscall;
}
-noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
-{
- enter_from_user_mode(regs);
- instrumentation_begin();
- local_irq_enable();
- instrumentation_end();
-}
-
/*
* If SYSCALL_EMU is set, then the only reason to report is when
* SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
diff --git a/kernel/exit.c b/kernel/exit.c
index fdfd05d1826c..b9667ffcf7b3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -911,6 +911,7 @@ void __noreturn do_exit(long code)
user_events_exit(tsk);
io_uring_files_cancel();
+ sched_mm_cid_exit(tsk);
exit_signals(tsk); /* sets PF_EXITING */
seccomp_filter_release(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index f1857672426e..83e05d6f2307 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -955,10 +955,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
#ifdef CONFIG_SCHED_MM_CID
- tsk->mm_cid = -1;
- tsk->last_mm_cid = -1;
- tsk->mm_cid_active = 0;
- tsk->migrate_from_cpu = -1;
+ tsk->mm_cid.cid = MM_CID_UNSET;
+ tsk->mm_cid.active = 0;
#endif
return tsk;
@@ -2456,6 +2454,7 @@ bad_fork_cleanup_namespaces:
exit_nsproxy_namespaces(p);
bad_fork_cleanup_mm:
if (p->mm) {
+ sched_mm_cid_exit(p);
mm_clear_owner(p->mm, p);
mmput(p->mm);
}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 75a84efad40f..392ec2f75f01 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -793,9 +793,9 @@ static long ptrace_get_rseq_configuration(struct task_struct *task,
unsigned long size, void __user *data)
{
struct ptrace_rseq_configuration conf = {
- .rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
- .rseq_abi_size = task->rseq_len,
- .signature = task->rseq_sig,
+ .rseq_abi_pointer = (u64)(uintptr_t)task->rseq.usrptr,
+ .rseq_abi_size = task->rseq.len,
+ .signature = task->rseq.sig,
.flags = 0,
};
diff --git a/kernel/rseq.c b/kernel/rseq.c
index 2452b7366b00..395d8b002350 100644
--- a/kernel/rseq.c
+++ b/kernel/rseq.c
@@ -8,98 +8,7 @@
* Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
*/
-#include <linux/sched.h>
-#include <linux/uaccess.h>
-#include <linux/syscalls.h>
-#include <linux/rseq.h>
-#include <linux/types.h>
-#include <linux/ratelimit.h>
-#include <asm/ptrace.h>
-
-#define CREATE_TRACE_POINTS
-#include <trace/events/rseq.h>
-
-/* The original rseq structure size (including padding) is 32 bytes. */
-#define ORIG_RSEQ_SIZE 32
-
-#define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \
- RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \
- RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE)
-
-#ifdef CONFIG_DEBUG_RSEQ
-static struct rseq *rseq_kernel_fields(struct task_struct *t)
-{
- return (struct rseq *) t->rseq_fields;
-}
-
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
- static DEFINE_RATELIMIT_STATE(_rs,
- DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
- u32 cpu_id_start, cpu_id, node_id, mm_cid;
- struct rseq __user *rseq = t->rseq;
-
- /*
- * Validate fields which are required to be read-only by
- * user-space.
- */
- if (!user_read_access_begin(rseq, t->rseq_len))
- goto efault;
- unsafe_get_user(cpu_id_start, &rseq->cpu_id_start, efault_end);
- unsafe_get_user(cpu_id, &rseq->cpu_id, efault_end);
- unsafe_get_user(node_id, &rseq->node_id, efault_end);
- unsafe_get_user(mm_cid, &rseq->mm_cid, efault_end);
- user_read_access_end();
-
- if ((cpu_id_start != rseq_kernel_fields(t)->cpu_id_start ||
- cpu_id != rseq_kernel_fields(t)->cpu_id ||
- node_id != rseq_kernel_fields(t)->node_id ||
- mm_cid != rseq_kernel_fields(t)->mm_cid) && __ratelimit(&_rs)) {
-
- pr_warn("Detected rseq corruption for pid: %d, name: %s\n"
- "\tcpu_id_start: %u ?= %u\n"
- "\tcpu_id: %u ?= %u\n"
- "\tnode_id: %u ?= %u\n"
- "\tmm_cid: %u ?= %u\n",
- t->pid, t->comm,
- cpu_id_start, rseq_kernel_fields(t)->cpu_id_start,
- cpu_id, rseq_kernel_fields(t)->cpu_id,
- node_id, rseq_kernel_fields(t)->node_id,
- mm_cid, rseq_kernel_fields(t)->mm_cid);
- }
-
- /* For now, only print a console warning on mismatch. */
- return 0;
-
-efault_end:
- user_read_access_end();
-efault:
- return -EFAULT;
-}
-
-/*
- * Update an rseq field and its in-kernel copy in lock-step to keep a coherent
- * state.
- */
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- do { \
- unsafe_put_user(value, &t->rseq->field, error_label); \
- rseq_kernel_fields(t)->field = value; \
- } while (0)
-
-#else
-static int rseq_validate_ro_fields(struct task_struct *t)
-{
- return 0;
-}
-
-#define rseq_unsafe_put_user(t, value, field, error_label) \
- unsafe_put_user(value, &t->rseq->field, error_label)
-#endif
-
/*
- *
* Restartable sequences are a lightweight interface that allows
* user-level code to be executed atomically relative to scheduler
* preemption and signal delivery. Typically used for implementing
@@ -158,356 +67,356 @@ static int rseq_validate_ro_fields(struct task_struct *t)
* F1. <failure>
*/
-static int rseq_update_cpu_node_id(struct task_struct *t)
-{
- struct rseq __user *rseq = t->rseq;
- u32 cpu_id = raw_smp_processor_id();
- u32 node_id = cpu_to_node(cpu_id);
- u32 mm_cid = task_mm_cid(t);
+/* Required to select the proper per_cpu ops for rseq_stats_inc() */
+#define RSEQ_BUILD_SLOW_PATH
- /*
- * Validate read-only rseq fields.
- */
- if (rseq_validate_ro_fields(t))
- goto efault;
- WARN_ON_ONCE((int) mm_cid < 0);
- if (!user_write_access_begin(rseq, t->rseq_len))
- goto efault;
+#include <linux/debugfs.h>
+#include <linux/ratelimit.h>
+#include <linux/rseq_entry.h>
+#include <linux/sched.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+#include <linux/types.h>
+#include <asm/ptrace.h>
- rseq_unsafe_put_user(t, cpu_id, cpu_id_start, efault_end);
- rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
- rseq_unsafe_put_user(t, node_id, node_id, efault_end);
- rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
+#define CREATE_TRACE_POINTS
+#include <trace/events/rseq.h>
- /*
- * Additional feature fields added after ORIG_RSEQ_SIZE
- * need to be conditionally updated only if
- * t->rseq_len != ORIG_RSEQ_SIZE.
- */
- user_write_access_end();
- trace_rseq_update(t);
- return 0;
+DEFINE_STATIC_KEY_MAYBE(CONFIG_RSEQ_DEBUG_DEFAULT_ENABLE, rseq_debug_enabled);
-efault_end:
- user_write_access_end();
-efault:
- return -EFAULT;
+static inline void rseq_control_debug(bool on)
+{
+ if (on)
+ static_branch_enable(&rseq_debug_enabled);
+ else
+ static_branch_disable(&rseq_debug_enabled);
}
-static int rseq_reset_rseq_cpu_node_id(struct task_struct *t)
+static int __init rseq_setup_debug(char *str)
{
- struct rseq __user *rseq = t->rseq;
- u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0,
- mm_cid = 0;
-
- /*
- * Validate read-only rseq fields.
- */
- if (rseq_validate_ro_fields(t))
- goto efault;
+ bool on;
- if (!user_write_access_begin(rseq, t->rseq_len))
- goto efault;
-
- /*
- * Reset all fields to their initial state.
- *
- * All fields have an initial state of 0 except cpu_id which is set to
- * RSEQ_CPU_ID_UNINITIALIZED, so that any user coming in after
- * unregistration can figure out that rseq needs to be registered
- * again.
- */
- rseq_unsafe_put_user(t, cpu_id_start, cpu_id_start, efault_end);
- rseq_unsafe_put_user(t, cpu_id, cpu_id, efault_end);
- rseq_unsafe_put_user(t, node_id, node_id, efault_end);
- rseq_unsafe_put_user(t, mm_cid, mm_cid, efault_end);
-
- /*
- * Additional feature fields added after ORIG_RSEQ_SIZE
- * need to be conditionally reset only if
- * t->rseq_len != ORIG_RSEQ_SIZE.
- */
- user_write_access_end();
- return 0;
-
-efault_end:
- user_write_access_end();
-efault:
- return -EFAULT;
+ if (kstrtobool(str, &on))
+ return -EINVAL;
+ rseq_control_debug(on);
+ return 1;
}
+__setup("rseq_debug=", rseq_setup_debug);
+#ifdef CONFIG_TRACEPOINTS
/*
- * Get the user-space pointer value stored in the 'rseq_cs' field.
+ * Out of line, so the actual update functions can be in a header to be
+ * inlined into the exit to user code.
*/
-static int rseq_get_rseq_cs_ptr_val(struct rseq __user *rseq, u64 *rseq_cs)
+void __rseq_trace_update(struct task_struct *t)
{
- if (!rseq_cs)
- return -EFAULT;
-
-#ifdef CONFIG_64BIT
- if (get_user(*rseq_cs, &rseq->rseq_cs))
- return -EFAULT;
-#else
- if (copy_from_user(rseq_cs, &rseq->rseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
-#endif
+ trace_rseq_update(t);
+}
- return 0;
+void __rseq_trace_ip_fixup(unsigned long ip, unsigned long start_ip,
+ unsigned long offset, unsigned long abort_ip)
+{
+ trace_rseq_ip_fixup(ip, start_ip, offset, abort_ip);
}
+#endif /* CONFIG_TRACEPOINTS */
-/*
- * If the rseq_cs field of 'struct rseq' contains a valid pointer to
- * user-space, copy 'struct rseq_cs' from user-space and validate its fields.
- */
-static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
+#ifdef CONFIG_DEBUG_FS
+#ifdef CONFIG_RSEQ_STATS
+DEFINE_PER_CPU(struct rseq_stats, rseq_stats);
+
+static int rseq_stats_show(struct seq_file *m, void *p)
{
- struct rseq_cs __user *urseq_cs;
- u64 ptr;
- u32 __user *usig;
- u32 sig;
- int ret;
-
- ret = rseq_get_rseq_cs_ptr_val(t->rseq, &ptr);
- if (ret)
- return ret;
-
- /* If the rseq_cs pointer is NULL, return a cleared struct rseq_cs. */
- if (!ptr) {
- memset(rseq_cs, 0, sizeof(*rseq_cs));
- return 0;
+ struct rseq_stats stats = { };
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ stats.exit += data_race(per_cpu(rseq_stats.exit, cpu));
+ stats.signal += data_race(per_cpu(rseq_stats.signal, cpu));
+ stats.slowpath += data_race(per_cpu(rseq_stats.slowpath, cpu));
+ stats.fastpath += data_race(per_cpu(rseq_stats.fastpath, cpu));
+ stats.ids += data_race(per_cpu(rseq_stats.ids, cpu));
+ stats.cs += data_race(per_cpu(rseq_stats.cs, cpu));
+ stats.clear += data_race(per_cpu(rseq_stats.clear, cpu));
+ stats.fixup += data_race(per_cpu(rseq_stats.fixup, cpu));
}
- /* Check that the pointer value fits in the user-space process space. */
- if (ptr >= TASK_SIZE)
- return -EINVAL;
- urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr;
- if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs)))
- return -EFAULT;
- if (rseq_cs->start_ip >= TASK_SIZE ||
- rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE ||
- rseq_cs->abort_ip >= TASK_SIZE ||
- rseq_cs->version > 0)
- return -EINVAL;
- /* Check for overflow. */
- if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip)
- return -EINVAL;
- /* Ensure that abort_ip is not in the critical section. */
- if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset)
- return -EINVAL;
+ seq_printf(m, "exit: %16lu\n", stats.exit);
+ seq_printf(m, "signal: %16lu\n", stats.signal);
+ seq_printf(m, "slowp: %16lu\n", stats.slowpath);
+ seq_printf(m, "fastp: %16lu\n", stats.fastpath);
+ seq_printf(m, "ids: %16lu\n", stats.ids);
+ seq_printf(m, "cs: %16lu\n", stats.cs);
+ seq_printf(m, "clear: %16lu\n", stats.clear);
+ seq_printf(m, "fixup: %16lu\n", stats.fixup);
+ return 0;
+}
- usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32));
- ret = get_user(sig, usig);
- if (ret)
- return ret;
+static int rseq_stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_stats_show, inode->i_private);
+}
- if (current->rseq_sig != sig) {
- printk_ratelimited(KERN_WARNING
- "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n",
- sig, current->rseq_sig, current->pid, usig);
- return -EINVAL;
- }
+static const struct file_operations stat_ops = {
+ .open = rseq_stats_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_stats_init(struct dentry *root_dir)
+{
+ debugfs_create_file("stats", 0444, root_dir, NULL, &stat_ops);
return 0;
}
+#else
+static inline void rseq_stats_init(struct dentry *root_dir) { }
+#endif /* CONFIG_RSEQ_STATS */
-static bool rseq_warn_flags(const char *str, u32 flags)
+static int rseq_debug_show(struct seq_file *m, void *p)
{
- u32 test_flags;
-
- if (!flags)
- return false;
- test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS;
- if (test_flags)
- pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str);
- test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS;
- if (test_flags)
- pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str);
- return true;
+ bool on = static_branch_unlikely(&rseq_debug_enabled);
+
+ seq_printf(m, "%d\n", on);
+ return 0;
}
-static int rseq_need_restart(struct task_struct *t, u32 cs_flags)
+static ssize_t rseq_debug_write(struct file *file, const char __user *ubuf,
+ size_t count, loff_t *ppos)
{
- u32 flags, event_mask;
- int ret;
+ bool on;
- if (rseq_warn_flags("rseq_cs", cs_flags))
+ if (kstrtobool_from_user(ubuf, count, &on))
return -EINVAL;
- /* Get thread flags. */
- ret = get_user(flags, &t->rseq->flags);
- if (ret)
- return ret;
+ rseq_control_debug(on);
+ return count;
+}
- if (rseq_warn_flags("rseq", flags))
- return -EINVAL;
+static int rseq_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, rseq_debug_show, inode->i_private);
+}
- /*
- * Load and clear event mask atomically with respect to
- * scheduler preemption and membarrier IPIs.
- */
- scoped_guard(RSEQ_EVENT_GUARD) {
- event_mask = t->rseq_event_mask;
- t->rseq_event_mask = 0;
- }
+static const struct file_operations debug_ops = {
+ .open = rseq_debug_open,
+ .read = seq_read,
+ .write = rseq_debug_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init rseq_debugfs_init(void)
+{
+ struct dentry *root_dir = debugfs_create_dir("rseq", NULL);
- return !!event_mask;
+ debugfs_create_file("debug", 0644, root_dir, NULL, &debug_ops);
+ rseq_stats_init(root_dir);
+ return 0;
}
+__initcall(rseq_debugfs_init);
+#endif /* CONFIG_DEBUG_FS */
-static int clear_rseq_cs(struct rseq __user *rseq)
+static bool rseq_set_ids(struct task_struct *t, struct rseq_ids *ids, u32 node_id)
{
- /*
- * The rseq_cs field is set to NULL on preemption or signal
- * delivery on top of rseq assembly block, as well as on top
- * of code outside of the rseq assembly block. This performs
- * a lazy clear of the rseq_cs field.
- *
- * Set rseq_cs to NULL.
- */
-#ifdef CONFIG_64BIT
- return put_user(0UL, &rseq->rseq_cs);
-#else
- if (clear_user(&rseq->rseq_cs, sizeof(rseq->rseq_cs)))
- return -EFAULT;
- return 0;
-#endif
+ return rseq_set_ids_get_csaddr(t, ids, node_id, NULL);
}
-/*
- * Unsigned comparison will be true when ip >= start_ip, and when
- * ip < start_ip + post_commit_offset.
- */
-static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs)
+static bool rseq_handle_cs(struct task_struct *t, struct pt_regs *regs)
{
- return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset;
+ struct rseq __user *urseq = t->rseq.usrptr;
+ u64 csaddr;
+
+ scoped_user_read_access(urseq, efault)
+ unsafe_get_user(csaddr, &urseq->rseq_cs, efault);
+ if (likely(!csaddr))
+ return true;
+ return rseq_update_user_cs(t, regs, csaddr);
+efault:
+ return false;
}
-static int rseq_ip_fixup(struct pt_regs *regs)
+static void rseq_slowpath_update_usr(struct pt_regs *regs)
{
- unsigned long ip = instruction_pointer(regs);
+ /*
+ * Preserve rseq state and user_irq state. The generic entry code
+ * clears user_irq on the way out, the non-generic entry
+ * architectures are not having user_irq.
+ */
+ const struct rseq_event evt_mask = { .has_rseq = true, .user_irq = true, };
struct task_struct *t = current;
- struct rseq_cs rseq_cs;
- int ret;
+ struct rseq_ids ids;
+ u32 node_id;
+ bool event;
+
+ if (unlikely(t->flags & PF_EXITING))
+ return;
- ret = rseq_get_rseq_cs(t, &rseq_cs);
- if (ret)
- return ret;
+ rseq_stat_inc(rseq_stats.slowpath);
/*
- * Handle potentially not being within a critical section.
- * If not nested over a rseq critical section, restart is useless.
- * Clear the rseq_cs pointer and return.
+ * Read and clear the event pending bit first. If the task
+ * was not preempted or migrated or a signal is on the way,
+ * there is no point in doing any of the heavy lifting here
+ * on production kernels. In that case TIF_NOTIFY_RESUME
+ * was raised by some other functionality.
+ *
+ * This is correct because the read/clear operation is
+ * guarded against scheduler preemption, which makes it CPU
+ * local atomic. If the task is preempted right after
+ * re-enabling preemption then TIF_NOTIFY_RESUME is set
+ * again and this function is invoked another time _before_
+ * the task is able to return to user mode.
+ *
+ * On a debug kernel, invoke the fixup code unconditionally
+ * with the result handed in to allow the detection of
+ * inconsistencies.
*/
- if (!in_rseq_cs(ip, &rseq_cs))
- return clear_rseq_cs(t->rseq);
- ret = rseq_need_restart(t, rseq_cs.flags);
- if (ret <= 0)
- return ret;
- ret = clear_rseq_cs(t->rseq);
- if (ret)
- return ret;
- trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset,
- rseq_cs.abort_ip);
- instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip);
- return 0;
+ scoped_guard(irq) {
+ event = t->rseq.event.sched_switch;
+ t->rseq.event.all &= evt_mask.all;
+ ids.cpu_id = task_cpu(t);
+ ids.mm_cid = task_mm_cid(t);
+ }
+
+ if (!event)
+ return;
+
+ node_id = cpu_to_node(ids.cpu_id);
+
+ if (unlikely(!rseq_update_usr(t, regs, &ids, node_id))) {
+ /*
+ * Clear the errors just in case this might survive magically, but
+ * leave the rest intact.
+ */
+ t->rseq.event.error = 0;
+ force_sig(SIGSEGV);
+ }
}
-/*
- * This resume handler must always be executed between any of:
- * - preemption,
- * - signal delivery,
- * and return to user-space.
- *
- * This is how we can ensure that the entire rseq critical section
- * will issue the commit instruction only if executed atomically with
- * respect to other threads scheduled on the same CPU, and with respect
- * to signal handlers.
- */
-void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
+void __rseq_handle_slowpath(struct pt_regs *regs)
{
- struct task_struct *t = current;
- int ret, sig;
-
- if (unlikely(t->flags & PF_EXITING))
+ /*
+ * If invoked from hypervisors before entering the guest via
+ * resume_user_mode_work(), then @regs is a NULL pointer.
+ *
+ * resume_user_mode_work() clears TIF_NOTIFY_RESUME and re-raises
+ * it before returning from the ioctl() to user space when
+ * rseq_event.sched_switch is set.
+ *
+ * So it's safe to ignore here instead of pointlessly updating it
+ * in the vcpu_run() loop.
+ */
+ if (!regs)
return;
+ rseq_slowpath_update_usr(regs);
+}
+
+void __rseq_signal_deliver(int sig, struct pt_regs *regs)
+{
+ rseq_stat_inc(rseq_stats.signal);
/*
- * regs is NULL if and only if the caller is in a syscall path. Skip
- * fixup and leave rseq_cs as is so that rseq_sycall() will detect and
- * kill a misbehaving userspace on debug kernels.
+ * Don't update IDs, they are handled on exit to user if
+ * necessary. The important thing is to abort a critical section of
+ * the interrupted context as after this point the instruction
+ * pointer in @regs points to the signal handler.
*/
- if (regs) {
- ret = rseq_ip_fixup(regs);
- if (unlikely(ret < 0))
- goto error;
+ if (unlikely(!rseq_handle_cs(current, regs))) {
+ /*
+ * Clear the errors just in case this might survive
+ * magically, but leave the rest intact.
+ */
+ current->rseq.event.error = 0;
+ force_sigsegv(sig);
}
- if (unlikely(rseq_update_cpu_node_id(t)))
- goto error;
- return;
-
-error:
- sig = ksig ? ksig->sig : 0;
- force_sigsegv(sig);
}
-#ifdef CONFIG_DEBUG_RSEQ
-
/*
* Terminate the process if a syscall is issued within a restartable
* sequence.
*/
-void rseq_syscall(struct pt_regs *regs)
+void __rseq_debug_syscall_return(struct pt_regs *regs)
{
- unsigned long ip = instruction_pointer(regs);
struct task_struct *t = current;
- struct rseq_cs rseq_cs;
+ u64 csaddr;
- if (!t->rseq)
+ if (!t->rseq.event.has_rseq)
return;
- if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
- force_sig(SIGSEGV);
+ if (get_user(csaddr, &t->rseq.usrptr->rseq_cs))
+ goto fail;
+ if (likely(!csaddr))
+ return;
+ if (unlikely(csaddr >= TASK_SIZE))
+ goto fail;
+ if (rseq_debug_update_user_cs(t, regs, csaddr))
+ return;
+fail:
+ force_sig(SIGSEGV);
}
+#ifdef CONFIG_DEBUG_RSEQ
+/* Kept around to keep GENERIC_ENTRY=n architectures supported. */
+void rseq_syscall(struct pt_regs *regs)
+{
+ __rseq_debug_syscall_return(regs);
+}
#endif
+static bool rseq_reset_ids(void)
+{
+ struct rseq_ids ids = {
+ .cpu_id = RSEQ_CPU_ID_UNINITIALIZED,
+ .mm_cid = 0,
+ };
+
+ /*
+ * If this fails, terminate it because this leaves the kernel in
+ * stupid state as exit to user space will try to fixup the ids
+ * again.
+ */
+ if (rseq_set_ids(current, &ids, 0))
+ return true;
+
+ force_sig(SIGSEGV);
+ return false;
+}
+
+/* The original rseq structure size (including padding) is 32 bytes. */
+#define ORIG_RSEQ_SIZE 32
+
/*
* sys_rseq - setup restartable sequences for caller thread.
*/
-SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
- int, flags, u32, sig)
+SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig)
{
- int ret;
- u64 rseq_cs;
-
if (flags & RSEQ_FLAG_UNREGISTER) {
if (flags & ~RSEQ_FLAG_UNREGISTER)
return -EINVAL;
/* Unregister rseq for current thread. */
- if (current->rseq != rseq || !current->rseq)
+ if (current->rseq.usrptr != rseq || !current->rseq.usrptr)
return -EINVAL;
- if (rseq_len != current->rseq_len)
+ if (rseq_len != current->rseq.len)
return -EINVAL;
- if (current->rseq_sig != sig)
+ if (current->rseq.sig != sig)
return -EPERM;
- ret = rseq_reset_rseq_cpu_node_id(current);
- if (ret)
- return ret;
- current->rseq = NULL;
- current->rseq_sig = 0;
- current->rseq_len = 0;
+ if (!rseq_reset_ids())
+ return -EFAULT;
+ rseq_reset(current);
return 0;
}
if (unlikely(flags))
return -EINVAL;
- if (current->rseq) {
+ if (current->rseq.usrptr) {
/*
* If rseq is already registered, check whether
* the provided address differs from the prior
* one.
*/
- if (current->rseq != rseq || rseq_len != current->rseq_len)
+ if (current->rseq.usrptr != rseq || rseq_len != current->rseq.len)
return -EINVAL;
- if (current->rseq_sig != sig)
+ if (current->rseq.sig != sig)
return -EPERM;
/* Already registered. */
return -EBUSY;
@@ -531,43 +440,39 @@ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len,
if (!access_ok(rseq, rseq_len))
return -EFAULT;
- /*
- * If the rseq_cs pointer is non-NULL on registration, clear it to
- * avoid a potential segfault on return to user-space. The proper thing
- * to do would have been to fail the registration but this would break
- * older libcs that reuse the rseq area for new threads without
- * clearing the fields.
- */
- if (rseq_get_rseq_cs_ptr_val(rseq, &rseq_cs))
- return -EFAULT;
- if (rseq_cs && clear_rseq_cs(rseq))
- return -EFAULT;
+ scoped_user_write_access(rseq, efault) {
+ /*
+ * If the rseq_cs pointer is non-NULL on registration, clear it to
+ * avoid a potential segfault on return to user-space. The proper thing
+ * to do would have been to fail the registration but this would break
+ * older libcs that reuse the rseq area for new threads without
+ * clearing the fields. Don't bother reading it, just reset it.
+ */
+ unsafe_put_user(0UL, &rseq->rseq_cs, efault);
+ /* Initialize IDs in user space */
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id_start, efault);
+ unsafe_put_user(RSEQ_CPU_ID_UNINITIALIZED, &rseq->cpu_id, efault);
+ unsafe_put_user(0U, &rseq->node_id, efault);
+ unsafe_put_user(0U, &rseq->mm_cid, efault);
+ }
-#ifdef CONFIG_DEBUG_RSEQ
- /*
- * Initialize the in-kernel rseq fields copy for validation of
- * read-only fields.
- */
- if (get_user(rseq_kernel_fields(current)->cpu_id_start, &rseq->cpu_id_start) ||
- get_user(rseq_kernel_fields(current)->cpu_id, &rseq->cpu_id) ||
- get_user(rseq_kernel_fields(current)->node_id, &rseq->node_id) ||
- get_user(rseq_kernel_fields(current)->mm_cid, &rseq->mm_cid))
- return -EFAULT;
-#endif
/*
* Activate the registration by setting the rseq area address, length
* and signature in the task struct.
*/
- current->rseq = rseq;
- current->rseq_len = rseq_len;
- current->rseq_sig = sig;
+ current->rseq.usrptr = rseq;
+ current->rseq.len = rseq_len;
+ current->rseq.sig = sig;
/*
* If rseq was previously inactive, and has just been
* registered, ensure the cpu_id_start and cpu_id fields
* are updated before returning to user-space.
*/
- rseq_set_notify_resume(current);
-
+ current->rseq.event.has_rseq = true;
+ rseq_force_update();
return 0;
+
+efault:
+ return -EFAULT;
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0c4ff93eeb78..fc358c1b6ca9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2131,8 +2131,6 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
- if (flags & ENQUEUE_MIGRATED)
- sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
@@ -2643,6 +2641,8 @@ out_unlock:
return 0;
}
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const cpumask_t *affmask);
+
/*
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
@@ -2656,6 +2656,7 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
cpumask_copy(&p->cpus_mask, ctx->new_mask);
p->nr_cpus_allowed = cpumask_weight(ctx->new_mask);
+ mm_update_cpus_allowed(p->mm, ctx->new_mask);
/*
* Swap in a new user_cpus_ptr if SCA_USER flag set
@@ -2667,10 +2668,8 @@ void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx
static void
do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx)
{
- scoped_guard (sched_change, p, DEQUEUE_SAVE) {
+ scoped_guard (sched_change, p, DEQUEUE_SAVE)
p->sched_class->set_cpus_allowed(p, ctx);
- mm_set_cpus_allowed(p->mm, ctx->new_mask);
- }
}
/*
@@ -3263,8 +3262,6 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- rseq_migrate(p);
- sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -4415,7 +4412,6 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
init_numa_balancing(clone_flags, p);
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
- init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4691,7 +4687,6 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
p->sched_task_group = tg;
}
#endif
- rseq_migrate(p);
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -4755,7 +4750,6 @@ void wake_up_new_task(struct task_struct *p)
* as we're not fully set-up yet.
*/
p->recent_used_cpu = task_cpu(p);
- rseq_migrate(p);
__set_task_cpu(p, select_task_rq(p, task_cpu(p), &wake_flags));
rq = __task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -5049,7 +5043,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
kcov_prepare_switch(prev);
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
- rseq_preempt(prev);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5212,19 +5205,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
*
* kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
- *
- * switch_mm_cid() needs to be updated if the barriers provided
- * by context_switch() are modified.
*/
- if (!next->mm) { // to kernel
+ if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
next->active_mm = prev->active_mm;
- if (prev->mm) // from user
+ if (prev->mm) // from user
mmgrab_lazy_tlb(prev->active_mm);
else
prev->active_mm = NULL;
- } else { // to user
+ } else { // to user
membarrier_switch_mm(rq, prev->active_mm, next->mm);
/*
* sys_membarrier() requires an smp_mb() between setting
@@ -5237,15 +5227,20 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_mm_irqs_off(prev->active_mm, next->mm, next);
lru_gen_use_mm(next->mm);
- if (!prev->mm) { // from kernel
+ if (!prev->mm) { // from kernel
/* will mmdrop_lazy_tlb() in finish_task_switch(). */
rq->prev_mm = prev->active_mm;
prev->active_mm = NULL;
}
}
- /* switch_mm_cid() requires the memory barriers above. */
- switch_mm_cid(rq, prev, next);
+ mm_cid_switch_to(prev, next);
+
+ /*
+ * Tell rseq that the task was scheduled in. Must be after
+ * switch_mm_cid() to get the TIF flag set.
+ */
+ rseq_sched_switch_event(next);
prepare_lock_switch(rq, next, rf);
@@ -5530,7 +5525,6 @@ void sched_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
- task_tick_mm_cid(rq, donor);
scx_tick(rq);
rq_unlock(rq, &rf);
@@ -10260,525 +10254,501 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
}
#ifdef CONFIG_SCHED_MM_CID
-
-/*
- * @cid_lock: Guarantee forward-progress of cid allocation.
- *
- * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
- * is only used when contention is detected by the lock-free allocation so
- * forward progress can be guaranteed.
- */
-DEFINE_RAW_SPINLOCK(cid_lock);
-
/*
- * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
+ * Concurrency IDentifier management
*
- * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
- * detected, it is set to 1 to ensure that all newly coming allocations are
- * serialized by @cid_lock until the allocation which detected contention
- * completes and sets @use_cid_lock back to 0. This guarantees forward progress
- * of a cid allocation.
- */
-int use_cid_lock;
-
-/*
- * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
- * concurrently with respect to the execution of the source runqueue context
- * switch.
- *
- * There is one basic properties we want to guarantee here:
- *
- * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
- * used by a task. That would lead to concurrent allocation of the cid and
- * userspace corruption.
- *
- * Provide this guarantee by introducing a Dekker memory ordering to guarantee
- * that a pair of loads observe at least one of a pair of stores, which can be
- * shown as:
+ * Serialization rules:
*
- * X = Y = 0
+ * mm::mm_cid::mutex: Serializes fork() and exit() and therefore
+ * protects mm::mm_cid::users.
*
- * w[X]=1 w[Y]=1
- * MB MB
- * r[Y]=y r[X]=x
+ * mm::mm_cid::lock: Serializes mm_update_max_cids() and
+ * mm_update_cpus_allowed(). Nests in mm_cid::mutex
+ * and runqueue lock.
*
- * Which guarantees that x==0 && y==0 is impossible. But rather than using
- * values 0 and 1, this algorithm cares about specific state transitions of the
- * runqueue current task (as updated by the scheduler context switch), and the
- * per-mm/cpu cid value.
+ * The mm_cidmask bitmap is not protected by any of the mm::mm_cid locks
+ * and can only be modified with atomic operations.
*
- * Let's introduce task (Y) which has task->mm == mm and task (N) which has
- * task->mm != mm for the rest of the discussion. There are two scheduler state
- * transitions on context switch we care about:
+ * The mm::mm_cid:pcpu per CPU storage is protected by the CPUs runqueue
+ * lock.
*
- * (TSA) Store to rq->curr with transition from (N) to (Y)
+ * CID ownership:
*
- * (TSB) Store to rq->curr with transition from (Y) to (N)
+ * A CID is either owned by a task (stored in task_struct::mm_cid.cid) or
+ * by a CPU (stored in mm::mm_cid.pcpu::cid). CIDs owned by CPUs have the
+ * MM_CID_ONCPU bit set. During transition from CPU to task ownership mode,
+ * MM_CID_TRANSIT is set on the per task CIDs. When this bit is set the
+ * task needs to drop the CID into the pool when scheduling out. Both bits
+ * (ONCPU and TRANSIT) are filtered out by task_cid() when the CID is
+ * actually handed over to user space in the RSEQ memory.
*
- * On the remote-clear side, there is one transition we care about:
+ * Mode switching:
*
- * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ * Switching to per CPU mode happens when the user count becomes greater
+ * than the maximum number of CIDs, which is calculated by:
*
- * There is also a transition to UNSET state which can be performed from all
- * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
- * guarantees that only a single thread will succeed:
+ * opt_cids = min(mm_cid::nr_cpus_allowed, mm_cid::users);
+ * max_cids = min(1.25 * opt_cids, num_possible_cpus());
*
- * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ * The +25% allowance is useful for tight CPU masks in scenarios where only
+ * a few threads are created and destroyed to avoid frequent mode
+ * switches. Though this allowance shrinks, the closer opt_cids becomes to
+ * num_possible_cpus(), which is the (unfortunate) hard ABI limit.
*
- * Just to be clear, what we do _not_ want to happen is a transition to UNSET
- * when a thread is actively using the cid (property (1)).
+ * At the point of switching to per CPU mode the new user is not yet
+ * visible in the system, so the task which initiated the fork() runs the
+ * fixup function: mm_cid_fixup_tasks_to_cpu() walks the thread list and
+ * either transfers each tasks owned CID to the CPU the task runs on or
+ * drops it into the CID pool if a task is not on a CPU at that point in
+ * time. Tasks which schedule in before the task walk reaches them do the
+ * handover in mm_cid_schedin(). When mm_cid_fixup_tasks_to_cpus() completes
+ * it's guaranteed that no task related to that MM owns a CID anymore.
*
- * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ * Switching back to task mode happens when the user count goes below the
+ * threshold which was recorded on the per CPU mode switch:
*
- * Scenario A) (TSA)+(TMA) (from next task perspective)
+ * pcpu_thrs = min(opt_cids - (opt_cids / 4), num_possible_cpus() / 2);
*
- * CPU0 CPU1
+ * This threshold is updated when a affinity change increases the number of
+ * allowed CPUs for the MM, which might cause a switch back to per task
+ * mode.
*
- * Context switch CS-1 Remote-clear
- * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
- * (implied barrier after cmpxchg)
- * - switch_mm_cid()
- * - memory barrier (see switch_mm_cid()
- * comment explaining how this barrier
- * is combined with other scheduler
- * barriers)
- * - mm_cid_get (next)
- * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ * If the switch back was initiated by a exiting task, then that task runs
+ * the fixup function. If it was initiated by a affinity change, then it's
+ * run either in the deferred update function in context of a workqueue or
+ * by a task which forks a new one or by a task which exits. Whatever
+ * happens first. mm_cid_fixup_cpus_to_task() walks through the possible
+ * CPUs and either transfers the CPU owned CIDs to a related task which
+ * runs on the CPU or drops it into the pool. Tasks which schedule in on a
+ * CPU which the walk did not cover yet do the handover themself.
*
- * This Dekker ensures that either task (Y) is observed by the
- * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
- * observed.
+ * This transition from CPU to per task ownership happens in two phases:
*
- * If task (Y) store is observed by rcu_dereference(), it means that there is
- * still an active task on the cpu. Remote-clear will therefore not transition
- * to UNSET, which fulfills property (1).
+ * 1) mm:mm_cid.transit contains MM_CID_TRANSIT This is OR'ed on the task
+ * CID and denotes that the CID is only temporarily owned by the
+ * task. When it schedules out the task drops the CID back into the
+ * pool if this bit is set.
*
- * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
- * it will move its state to UNSET, which clears the percpu cid perhaps
- * uselessly (which is not an issue for correctness). Because task (Y) is not
- * observed, CPU1 can move ahead to set the state to UNSET. Because moving
- * state to UNSET is done with a cmpxchg expecting that the old state has the
- * LAZY flag set, only one thread will successfully UNSET.
+ * 2) The initiating context walks the per CPU space and after completion
+ * clears mm:mm_cid.transit. So after that point the CIDs are strictly
+ * task owned again.
*
- * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
- * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
- * CPU1 will observe task (Y) and do nothing more, which is fine.
+ * This two phase transition is required to prevent CID space exhaustion
+ * during the transition as a direct transfer of ownership would fail if
+ * two tasks are scheduled in on the same CPU before the fixup freed per
+ * CPU CIDs.
*
- * What we are effectively preventing with this Dekker is a scenario where
- * neither LAZY flag nor store (Y) are observed, which would fail property (1)
- * because this would UNSET a cid which is actively used.
+ * When mm_cid_fixup_cpus_to_tasks() completes it's guaranteed that no CID
+ * related to that MM is owned by a CPU anymore.
*/
-void sched_mm_cid_migrate_from(struct task_struct *t)
+/*
+ * Update the CID range properties when the constraints change. Invoked via
+ * fork(), exit() and affinity changes
+ */
+static void __mm_update_max_cids(struct mm_mm_cid *mc)
{
- t->migrate_from_cpu = task_cpu(t);
-}
+ unsigned int opt_cids, max_cids;
-static
-int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid)
-{
- struct mm_struct *mm = t->mm;
- struct task_struct *src_task;
- int src_cid, last_mm_cid;
+ /* Calculate the new optimal constraint */
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
- if (!mm)
- return -1;
-
- last_mm_cid = t->last_mm_cid;
- /*
- * If the migrated task has no last cid, or if the current
- * task on src rq uses the cid, it means the source cid does not need
- * to be moved to the destination cpu.
- */
- if (last_mm_cid == -1)
- return -1;
- src_cid = READ_ONCE(src_pcpu_cid->cid);
- if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
- return -1;
+ /* Adjust the maximum CIDs to +25% limited by the number of possible CPUs */
+ max_cids = min(opt_cids + (opt_cids / 4), num_possible_cpus());
+ WRITE_ONCE(mc->max_cids, max_cids);
+}
- /*
- * If we observe an active task using the mm on this rq, it means we
- * are not the last task to be migrated from this cpu for this mm, so
- * there is no need to move src_cid to the destination cpu.
- */
- guard(rcu)();
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- t->last_mm_cid = -1;
- return -1;
- }
+static inline unsigned int mm_cid_calc_pcpu_thrs(struct mm_mm_cid *mc)
+{
+ unsigned int opt_cids;
- return src_cid;
+ opt_cids = min(mc->nr_cpus_allowed, mc->users);
+ /* Has to be at least 1 because 0 indicates PCPU mode off */
+ return max(min(opt_cids - opt_cids / 4, num_possible_cpus() / 2), 1);
}
-static
-int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
- struct task_struct *t,
- struct mm_cid *src_pcpu_cid,
- int src_cid)
+static bool mm_update_max_cids(struct mm_struct *mm)
{
- struct task_struct *src_task;
- struct mm_struct *mm = t->mm;
- int lazy_cid;
-
- if (src_cid == -1)
- return -1;
+ struct mm_mm_cid *mc = &mm->mm_cid;
- /*
- * Attempt to clear the source cpu cid to move it to the destination
- * cpu.
- */
- lazy_cid = mm_cid_set_lazy_put(src_cid);
- if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
- return -1;
+ lockdep_assert_held(&mm->mm_cid.lock);
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
+ /* Clear deferred mode switch flag. A change is handled by the caller */
+ mc->update_deferred = false;
+ __mm_update_max_cids(mc);
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, this task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- src_task = rcu_dereference(src_rq->curr);
- if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
- /*
- * We observed an active task for this mm, there is therefore
- * no point in moving this cid to the destination cpu.
- */
- t->last_mm_cid = -1;
- return -1;
- }
+ /* Check whether owner mode must be changed */
+ if (!mc->percpu) {
+ /* Enable per CPU mode when the number of users is above max_cids */
+ if (mc->users > mc->max_cids)
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ } else {
+ /* Switch back to per task if user count under threshold */
+ if (mc->users < mc->pcpu_thrs)
+ mc->pcpu_thrs = 0;
}
- /*
- * The src_cid is unused, so it can be unset.
- */
- if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- return -1;
- WRITE_ONCE(src_pcpu_cid->recent_cid, MM_CID_UNSET);
- return src_cid;
+ /* Mode change required? */
+ if (!!mc->percpu == !!mc->pcpu_thrs)
+ return false;
+ /* When switching back to per TASK mode, set the transition flag */
+ if (!mc->pcpu_thrs)
+ WRITE_ONCE(mc->transit, MM_CID_TRANSIT);
+ WRITE_ONCE(mc->percpu, !!mc->pcpu_thrs);
+ return true;
}
-/*
- * Migration to dst cpu. Called with dst_rq lock held.
- * Interrupts are disabled, which keeps the window of cid ownership without the
- * source rq lock held small.
- */
-void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk)
{
- struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
- struct mm_struct *mm = t->mm;
- int src_cid, src_cpu;
- bool dst_cid_is_set;
- struct rq *src_rq;
+ struct cpumask *mm_allowed;
+ struct mm_mm_cid *mc;
+ unsigned int weight;
- lockdep_assert_rq_held(dst_rq);
-
- if (!mm)
- return;
- src_cpu = t->migrate_from_cpu;
- if (src_cpu == -1) {
- t->last_mm_cid = -1;
+ if (!mm || !READ_ONCE(mm->mm_cid.users))
return;
- }
/*
- * Move the src cid if the dst cid is unset. This keeps id
- * allocation closest to 0 in cases where few threads migrate around
- * many CPUs.
- *
- * If destination cid or recent cid is already set, we may have
- * to just clear the src cid to ensure compactness in frequent
- * migrations scenarios.
- *
- * It is not useful to clear the src cid when the number of threads is
- * greater or equal to the number of allowed CPUs, because user-space
- * can expect that the number of allowed cids can reach the number of
- * allowed CPUs.
- */
- dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
- dst_cid_is_set = !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->cid)) ||
- !mm_cid_is_unset(READ_ONCE(dst_pcpu_cid->recent_cid));
- if (dst_cid_is_set && atomic_read(&mm->mm_users) >= READ_ONCE(mm->nr_cpus_allowed))
+ * mm::mm_cid::mm_cpus_allowed is the superset of each threads
+ * allowed CPUs mask which means it can only grow.
+ */
+ mc = &mm->mm_cid;
+ guard(raw_spinlock)(&mc->lock);
+ mm_allowed = mm_cpus_allowed(mm);
+ weight = cpumask_weighted_or(mm_allowed, mm_allowed, affmsk);
+ if (weight == mc->nr_cpus_allowed)
return;
- src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
- src_rq = cpu_rq(src_cpu);
- src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
- if (src_cid == -1)
+
+ WRITE_ONCE(mc->nr_cpus_allowed, weight);
+ __mm_update_max_cids(mc);
+ if (!mc->percpu)
return;
- src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
- src_cid);
- if (src_cid == -1)
+
+ /* Adjust the threshold to the wider set */
+ mc->pcpu_thrs = mm_cid_calc_pcpu_thrs(mc);
+ /* Switch back to per task mode? */
+ if (mc->users >= mc->pcpu_thrs)
return;
- if (dst_cid_is_set) {
- __mm_cid_put(mm, src_cid);
+
+ /* Don't queue twice */
+ if (mc->update_deferred)
return;
- }
- /* Move src_cid to dst cpu. */
- mm_cid_snapshot_time(dst_rq, mm);
- WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
- WRITE_ONCE(dst_pcpu_cid->recent_cid, src_cid);
+
+ /* Queue the irq work, which schedules the real work */
+ mc->update_deferred = true;
+ irq_work_queue(&mc->irq_work);
}
-static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
- int cpu)
+static inline void mm_cid_transit_to_task(struct task_struct *t, struct mm_cid_pcpu *pcp)
{
- struct rq *rq = cpu_rq(cpu);
- struct task_struct *t;
- int cid, lazy_cid;
+ if (cid_on_cpu(t->mm_cid.cid)) {
+ unsigned int cid = cpu_cid_to_cid(t->mm_cid.cid);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid))
- return;
+ t->mm_cid.cid = cid_to_transit_cid(cid);
+ pcp->cid = t->mm_cid.cid;
+ }
+}
- /*
- * Clear the cpu cid if it is set to keep cid allocation compact. If
- * there happens to be other tasks left on the source cpu using this
- * mm, the next task using this mm will reallocate its cid on context
- * switch.
- */
- lazy_cid = mm_cid_set_lazy_put(cid);
- if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
- return;
+static void mm_cid_fixup_cpus_to_tasks(struct mm_struct *mm)
+{
+ unsigned int cpu;
- /*
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm matches the scheduler barrier in context_switch()
- * between store to rq->curr and load of prev and next task's
- * per-mm/cpu cid.
- *
- * The implicit barrier after cmpxchg per-mm/cpu cid before loading
- * rq->curr->mm_cid_active matches the barrier in
- * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
- * sched_mm_cid_after_execve() between store to t->mm_cid_active and
- * load of per-mm/cpu cid.
- */
+ /* Walk the CPUs and fixup all stale CIDs */
+ for_each_possible_cpu(cpu) {
+ struct mm_cid_pcpu *pcp = per_cpu_ptr(mm->mm_cid.pcpu, cpu);
+ struct rq *rq = cpu_rq(cpu);
- /*
- * If we observe an active task using the mm on this rq after setting
- * the lazy-put flag, that task will be responsible for transitioning
- * from lazy-put flag set to MM_CID_UNSET.
- */
- scoped_guard (rcu) {
- t = rcu_dereference(rq->curr);
- if (READ_ONCE(t->mm_cid_active) && t->mm == mm)
- return;
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(rq_lock_irq)(rq);
+ /* Is the CID still owned by the CPU? */
+ if (cid_on_cpu(pcp->cid)) {
+ /*
+ * If rq->curr has @mm, transfer it with the
+ * transition bit set. Otherwise drop it.
+ */
+ if (rq->curr->mm == mm && rq->curr->mm_cid.active)
+ mm_cid_transit_to_task(rq->curr, pcp);
+ else
+ mm_drop_cid_on_cpu(mm, pcp);
+
+ } else if (rq->curr->mm == mm && rq->curr->mm_cid.active) {
+ unsigned int cid = rq->curr->mm_cid.cid;
+
+ /* Ensure it has the transition bit set */
+ if (!cid_in_transit(cid)) {
+ cid = cid_to_transit_cid(cid);
+ rq->curr->mm_cid.cid = cid;
+ pcp->cid = cid;
+ }
+ }
}
+ /* Clear the transition bit */
+ WRITE_ONCE(mm->mm_cid.transit, 0);
+}
- /*
- * The cid is unused, so it can be unset.
- * Disable interrupts to keep the window of cid ownership without rq
- * lock small.
- */
- scoped_guard (irqsave) {
- if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
- __mm_cid_put(mm, cid);
+static inline void mm_cid_transfer_to_cpu(struct task_struct *t, struct mm_cid_pcpu *pcp)
+{
+ if (cid_on_task(t->mm_cid.cid)) {
+ t->mm_cid.cid = cid_to_cpu_cid(t->mm_cid.cid);
+ pcp->cid = t->mm_cid.cid;
}
}
-static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+static bool mm_cid_fixup_task_to_cpu(struct task_struct *t, struct mm_struct *mm)
{
- struct rq *rq = cpu_rq(cpu);
- struct mm_cid *pcpu_cid;
- struct task_struct *curr;
- u64 rq_clock;
+ /* Remote access to mm::mm_cid::pcpu requires rq_lock */
+ guard(task_rq_lock)(t);
+ /* If the task is not active it is not in the users count */
+ if (!t->mm_cid.active)
+ return false;
+ if (cid_on_task(t->mm_cid.cid)) {
+ /* If running on the CPU, transfer the CID, otherwise drop it */
+ if (task_rq(t)->curr == t)
+ mm_cid_transfer_to_cpu(t, per_cpu_ptr(mm->mm_cid.pcpu, task_cpu(t)));
+ else
+ mm_unset_cid_on_task(t);
+ }
+ return true;
+}
- /*
- * rq->clock load is racy on 32-bit but one spurious clear once in a
- * while is irrelevant.
- */
- rq_clock = READ_ONCE(rq->clock);
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+static void mm_cid_fixup_tasks_to_cpus(void)
+{
+ struct mm_struct *mm = current->mm;
+ struct task_struct *p, *t;
+ unsigned int users;
/*
- * In order to take care of infrequently scheduled tasks, bump the time
- * snapshot associated with this cid if an active task using the mm is
- * observed on this rq.
+ * This can obviously race with a concurrent affinity change, which
+ * increases the number of allowed CPUs for this mm, but that does
+ * not affect the mode and only changes the CID constraints. A
+ * possible switch back to per task mode happens either in the
+ * deferred handler function or in the next fork()/exit().
+ *
+ * The caller has already transferred. The newly incoming task is
+ * already accounted for, but not yet visible.
*/
- scoped_guard (rcu) {
- curr = rcu_dereference(rq->curr);
- if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) {
- WRITE_ONCE(pcpu_cid->time, rq_clock);
- return;
- }
+ users = mm->mm_cid.users - 2;
+ if (!users)
+ return;
+
+ guard(rcu)();
+ for_other_threads(current, t) {
+ if (mm_cid_fixup_task_to_cpu(t, mm))
+ users--;
}
- if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS)
+ if (!users)
return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+
+ /* Happens only for VM_CLONE processes. */
+ for_each_process_thread(p, t) {
+ if (t == current || t->mm != mm)
+ continue;
+ if (mm_cid_fixup_task_to_cpu(t, mm)) {
+ if (--users == 0)
+ return;
+ }
+ }
}
-static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
- int weight)
+static bool sched_mm_cid_add_user(struct task_struct *t, struct mm_struct *mm)
{
- struct mm_cid *pcpu_cid;
- int cid;
-
- pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
- cid = READ_ONCE(pcpu_cid->cid);
- if (!mm_cid_is_valid(cid) || cid < weight)
- return;
- sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
+ t->mm_cid.active = 1;
+ mm->mm_cid.users++;
+ return mm_update_max_cids(mm);
}
-static void task_mm_cid_work(struct callback_head *work)
+void sched_mm_cid_fork(struct task_struct *t)
{
- unsigned long now = jiffies, old_scan, next_scan;
- struct task_struct *t = current;
- struct cpumask *cidmask;
- struct mm_struct *mm;
- int weight, cpu;
+ struct mm_struct *mm = t->mm;
+ bool percpu;
- WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
+ WARN_ON_ONCE(!mm || t->mm_cid.cid != MM_CID_UNSET);
- work->next = work; /* Prevent double-add */
- if (t->flags & PF_EXITING)
- return;
- mm = t->mm;
- if (!mm)
- return;
- old_scan = READ_ONCE(mm->mm_cid_next_scan);
- next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
- if (!old_scan) {
- unsigned long res;
-
- res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
- if (res != old_scan)
- old_scan = res;
+ guard(mutex)(&mm->mm_cid.mutex);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ struct mm_cid_pcpu *pcp = this_cpu_ptr(mm->mm_cid.pcpu);
+
+ /* First user ? */
+ if (!mm->mm_cid.users) {
+ sched_mm_cid_add_user(t, mm);
+ t->mm_cid.cid = mm_get_cid(mm);
+ /* Required for execve() */
+ pcp->cid = t->mm_cid.cid;
+ return;
+ }
+
+ if (!sched_mm_cid_add_user(t, mm)) {
+ if (!mm->mm_cid.percpu)
+ t->mm_cid.cid = mm_get_cid(mm);
+ return;
+ }
+
+ /* Handle the mode change and transfer current's CID */
+ percpu = !!mm->mm_cid.percpu;
+ if (!percpu)
+ mm_cid_transit_to_task(current, pcp);
else
- old_scan = next_scan;
+ mm_cid_transfer_to_cpu(current, pcp);
}
- if (time_before(now, old_scan))
- return;
- if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
- return;
- cidmask = mm_cidmask(mm);
- /* Clear cids that were not recently used. */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_old(mm, cpu);
- weight = cpumask_weight(cidmask);
- /*
- * Clear cids that are greater or equal to the cidmask weight to
- * recompact it.
- */
- for_each_possible_cpu(cpu)
- sched_mm_cid_remote_clear_weight(mm, cpu, weight);
-}
-
-void init_sched_mm_cid(struct task_struct *t)
-{
- struct mm_struct *mm = t->mm;
- int mm_users = 0;
- if (mm) {
- mm_users = atomic_read(&mm->mm_users);
- if (mm_users == 1)
- mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
+ if (percpu) {
+ mm_cid_fixup_tasks_to_cpus();
+ } else {
+ mm_cid_fixup_cpus_to_tasks(mm);
+ t->mm_cid.cid = mm_get_cid(mm);
}
- t->cid_work.next = &t->cid_work; /* Protect against double add */
- init_task_work(&t->cid_work, task_mm_cid_work);
}
-void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
+static bool sched_mm_cid_remove_user(struct task_struct *t)
{
- struct callback_head *work = &curr->cid_work;
- unsigned long now = jiffies;
-
- if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
- work->next != work)
- return;
- if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
- return;
-
- /* No page allocation under rq lock */
- task_work_add(curr, work, TWA_RESUME);
+ t->mm_cid.active = 0;
+ scoped_guard(preempt) {
+ /* Clear the transition bit */
+ t->mm_cid.cid = cid_from_transit_cid(t->mm_cid.cid);
+ mm_unset_cid_on_task(t);
+ }
+ t->mm->mm_cid.users--;
+ return mm_update_max_cids(t->mm);
}
-void sched_mm_cid_exit_signals(struct task_struct *t)
+static bool __sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
- return;
-
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
+ if (!sched_mm_cid_remove_user(t))
+ return false;
+ /*
+ * Contrary to fork() this only deals with a switch back to per
+ * task mode either because the above decreased users or an
+ * affinity change increased the number of allowed CPUs and the
+ * deferred fixup did not run yet.
+ */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return false;
/*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
+ * A failed fork(2) cleanup never gets here, so @current must have
+ * the same MM as @t. That's true for exit() and the failed
+ * pthread_create() cleanup case.
*/
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ if (WARN_ON_ONCE(current->mm != mm))
+ return false;
+ return true;
}
-void sched_mm_cid_before_execve(struct task_struct *t)
+/*
+ * When a task exits, the MM CID held by the task is not longer required as
+ * the task cannot return to user space.
+ */
+void sched_mm_cid_exit(struct task_struct *t)
{
struct mm_struct *mm = t->mm;
- struct rq *rq;
- if (!mm)
+ if (!mm || !t->mm_cid.active)
return;
+ /*
+ * Ensure that only one instance is doing MM CID operations within
+ * a MM. The common case is uncontended. The rare fixup case adds
+ * some overhead.
+ */
+ scoped_guard(mutex, &mm->mm_cid.mutex) {
+ /* mm_cid::mutex is sufficient to protect mm_cid::users */
+ if (likely(mm->mm_cid.users > 1)) {
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ if (!__sched_mm_cid_exit(t))
+ return;
+ /* Mode change required. Transfer currents CID */
+ mm_cid_transit_to_task(current, this_cpu_ptr(mm->mm_cid.pcpu));
+ }
+ mm_cid_fixup_cpus_to_tasks(mm);
+ return;
+ }
+ /* Last user */
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Required across execve() */
+ if (t == current)
+ mm_cid_transit_to_task(t, this_cpu_ptr(mm->mm_cid.pcpu));
+ /* Ignore mode change. There is nothing to do. */
+ sched_mm_cid_remove_user(t);
+ }
+ }
- preempt_disable();
- rq = this_rq();
- guard(rq_lock_irqsave)(rq);
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 0);
/*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
+ * As this is the last user (execve(), process exit or failed
+ * fork(2)) there is no concurrency anymore.
+ *
+ * Synchronize eventually pending work to ensure that there are no
+ * dangling references left. @t->mm_cid.users is zero so nothing
+ * can queue this work anymore.
*/
- smp_mb();
- mm_cid_put(mm);
- t->last_mm_cid = t->mm_cid = -1;
+ irq_work_sync(&mm->mm_cid.irq_work);
+ cancel_work_sync(&mm->mm_cid.work);
+}
+
+/* Deactivate MM CID allocation across execve() */
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ sched_mm_cid_exit(t);
}
+/* Reactivate MM CID after successful execve() */
void sched_mm_cid_after_execve(struct task_struct *t)
{
- struct mm_struct *mm = t->mm;
- struct rq *rq;
+ sched_mm_cid_fork(t);
+}
+
+static void mm_cid_work_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.work);
- if (!mm)
+ guard(mutex)(&mm->mm_cid.mutex);
+ /* Did the last user task exit already? */
+ if (!mm->mm_cid.users)
return;
- preempt_disable();
- rq = this_rq();
- scoped_guard (rq_lock_irqsave, rq) {
- preempt_enable_no_resched(); /* holding spinlock */
- WRITE_ONCE(t->mm_cid_active, 1);
- /*
- * Store t->mm_cid_active before loading per-mm/cpu cid.
- * Matches barrier in sched_mm_cid_remote_clear_old().
- */
- smp_mb();
- t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, mm);
+ scoped_guard(raw_spinlock_irq, &mm->mm_cid.lock) {
+ /* Have fork() or exit() handled it already? */
+ if (!mm->mm_cid.update_deferred)
+ return;
+ /* This clears mm_cid::update_deferred */
+ if (!mm_update_max_cids(mm))
+ return;
+ /* Affinity changes can only switch back to task mode */
+ if (WARN_ON_ONCE(mm->mm_cid.percpu))
+ return;
}
+ mm_cid_fixup_cpus_to_tasks(mm);
}
-void sched_mm_cid_fork(struct task_struct *t)
+static void mm_cid_irq_work(struct irq_work *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, mm_cid.irq_work);
+
+ /*
+ * Needs to be unconditional because mm_cid::lock cannot be held
+ * when scheduling work as mm_update_cpus_allowed() nests inside
+ * rq::lock and schedule_work() might end up in wakeup...
+ */
+ schedule_work(&mm->mm_cid.work);
+}
+
+void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
{
- WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
- t->mm_cid_active = 1;
+ mm->mm_cid.max_cids = 0;
+ mm->mm_cid.percpu = 0;
+ mm->mm_cid.transit = 0;
+ mm->mm_cid.nr_cpus_allowed = p->nr_cpus_allowed;
+ mm->mm_cid.users = 0;
+ mm->mm_cid.pcpu_thrs = 0;
+ mm->mm_cid.update_deferred = 0;
+ raw_spin_lock_init(&mm->mm_cid.lock);
+ mutex_init(&mm->mm_cid.mutex);
+ mm->mm_cid.irq_work = IRQ_WORK_INIT_HARD(mm_cid_irq_work);
+ INIT_WORK(&mm->mm_cid.work, mm_cid_work_fn);
+ cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
+ bitmap_zero(mm_cidmask(mm), num_possible_cpus());
}
-#endif /* CONFIG_SCHED_MM_CID */
+#else /* CONFIG_SCHED_MM_CID */
+static inline void mm_update_cpus_allowed(struct mm_struct *mm, const struct cpumask *affmsk) { }
+#endif /* !CONFIG_SCHED_MM_CID */
static DEFINE_PER_CPU(struct sched_change_ctx, sched_change_ctx);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 62fba83b7bb1..623445603725 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -199,7 +199,7 @@ static void ipi_rseq(void *info)
* is negligible.
*/
smp_mb();
- rseq_preempt(current);
+ rseq_sched_switch_event(current);
}
static void ipi_sync_rq_state(void *info)
@@ -407,9 +407,9 @@ static int membarrier_private_expedited(int flags, int cpu_id)
* membarrier, we will end up with some thread in the mm
* running without a core sync.
*
- * For RSEQ, don't rseq_preempt() the caller. User code
- * is not supposed to issue syscalls at all from inside an
- * rseq critical section.
+ * For RSEQ, don't invoke rseq_sched_switch_event() on the
+ * caller. User code is not supposed to issue syscalls at
+ * all from inside an rseq critical section.
*/
if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
preempt_disable();
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b419a4d98461..8590113e4a60 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2223,6 +2223,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
smp_wmb();
WRITE_ONCE(task_thread_info(p)->cpu, cpu);
p->wake_cpu = cpu;
+ rseq_sched_set_ids_changed(p);
#endif /* CONFIG_SMP */
}
@@ -3679,283 +3680,212 @@ extern const char *preempt_modes[];
#ifdef CONFIG_SCHED_MM_CID
-#define SCHED_MM_CID_PERIOD_NS (100ULL * 1000000) /* 100ms */
-#define MM_CID_SCAN_DELAY 100 /* 100ms */
+static __always_inline bool cid_on_cpu(unsigned int cid)
+{
+ return cid & MM_CID_ONCPU;
+}
-extern raw_spinlock_t cid_lock;
-extern int use_cid_lock;
+static __always_inline bool cid_in_transit(unsigned int cid)
+{
+ return cid & MM_CID_TRANSIT;
+}
-extern void sched_mm_cid_migrate_from(struct task_struct *t);
-extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
-extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
-extern void init_sched_mm_cid(struct task_struct *t);
+static __always_inline unsigned int cpu_cid_to_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_ONCPU;
+}
-static inline void __mm_cid_put(struct mm_struct *mm, int cid)
+static __always_inline unsigned int cid_to_cpu_cid(unsigned int cid)
{
- if (cid < 0)
- return;
- cpumask_clear_cpu(cid, mm_cidmask(mm));
+ return cid | MM_CID_ONCPU;
}
-/*
- * The per-mm/cpu cid can have the MM_CID_LAZY_PUT flag set or transition to
- * the MM_CID_UNSET state without holding the rq lock, but the rq lock needs to
- * be held to transition to other states.
- *
- * State transitions synchronized with cmpxchg or try_cmpxchg need to be
- * consistent across CPUs, which prevents use of this_cpu_cmpxchg.
- */
-static inline void mm_cid_put_lazy(struct task_struct *t)
+static __always_inline unsigned int cid_to_transit_cid(unsigned int cid)
{
- struct mm_struct *mm = t->mm;
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid;
+ return cid | MM_CID_TRANSIT;
+}
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- if (!mm_cid_is_lazy_put(cid) ||
- !try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+static __always_inline unsigned int cid_from_transit_cid(unsigned int cid)
+{
+ return cid & ~MM_CID_TRANSIT;
}
-static inline int mm_cid_pcpu_unset(struct mm_struct *mm)
+static __always_inline bool cid_on_task(unsigned int cid)
{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, res;
+ /* True if none of the MM_CID_ONCPU, MM_CID_TRANSIT, MM_CID_UNSET bits is set */
+ return cid < MM_CID_TRANSIT;
+}
- lockdep_assert_irqs_disabled();
- cid = __this_cpu_read(pcpu_cid->cid);
- for (;;) {
- if (mm_cid_is_unset(cid))
- return MM_CID_UNSET;
- /*
- * Attempt transition from valid or lazy-put to unset.
- */
- res = cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, cid, MM_CID_UNSET);
- if (res == cid)
- break;
- cid = res;
- }
- return cid;
+static __always_inline void mm_drop_cid(struct mm_struct *mm, unsigned int cid)
+{
+ clear_bit(cid, mm_cidmask(mm));
}
-static inline void mm_cid_put(struct mm_struct *mm)
+static __always_inline void mm_unset_cid_on_task(struct task_struct *t)
{
- int cid;
+ unsigned int cid = t->mm_cid.cid;
- lockdep_assert_irqs_disabled();
- cid = mm_cid_pcpu_unset(mm);
- if (cid == MM_CID_UNSET)
- return;
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ t->mm_cid.cid = MM_CID_UNSET;
+ if (cid_on_task(cid))
+ mm_drop_cid(t->mm, cid);
}
-static inline int __mm_cid_try_get(struct task_struct *t, struct mm_struct *mm)
+static __always_inline void mm_drop_cid_on_cpu(struct mm_struct *mm, struct mm_cid_pcpu *pcp)
{
- struct cpumask *cidmask = mm_cidmask(mm);
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid, max_nr_cid, allowed_max_nr_cid;
+ /* Clear the ONCPU bit, but do not set UNSET in the per CPU storage */
+ pcp->cid = cpu_cid_to_cid(pcp->cid);
+ mm_drop_cid(mm, pcp->cid);
+}
- /*
- * After shrinking the number of threads or reducing the number
- * of allowed cpus, reduce the value of max_nr_cid so expansion
- * of cid allocation will preserve cache locality if the number
- * of threads or allowed cpus increase again.
- */
- max_nr_cid = atomic_read(&mm->max_nr_cid);
- while ((allowed_max_nr_cid = min_t(int, READ_ONCE(mm->nr_cpus_allowed),
- atomic_read(&mm->mm_users))),
- max_nr_cid > allowed_max_nr_cid) {
- /* atomic_try_cmpxchg loads previous mm->max_nr_cid into max_nr_cid. */
- if (atomic_try_cmpxchg(&mm->max_nr_cid, &max_nr_cid, allowed_max_nr_cid)) {
- max_nr_cid = allowed_max_nr_cid;
- break;
- }
- }
- /* Try to re-use recent cid. This improves cache locality. */
- cid = __this_cpu_read(pcpu_cid->recent_cid);
- if (!mm_cid_is_unset(cid) && cid < max_nr_cid &&
- !cpumask_test_and_set_cpu(cid, cidmask))
- return cid;
- /*
- * Expand cid allocation if the maximum number of concurrency
- * IDs allocated (max_nr_cid) is below the number cpus allowed
- * and number of threads. Expanding cid allocation as much as
- * possible improves cache locality.
- */
- cid = max_nr_cid;
- while (cid < READ_ONCE(mm->nr_cpus_allowed) && cid < atomic_read(&mm->mm_users)) {
- /* atomic_try_cmpxchg loads previous mm->max_nr_cid into cid. */
- if (!atomic_try_cmpxchg(&mm->max_nr_cid, &cid, cid + 1))
- continue;
- if (!cpumask_test_and_set_cpu(cid, cidmask))
- return cid;
- }
- /*
- * Find the first available concurrency id.
- * Retry finding first zero bit if the mask is temporarily
- * filled. This only happens during concurrent remote-clear
- * which owns a cid without holding a rq lock.
- */
- for (;;) {
- cid = cpumask_first_zero(cidmask);
- if (cid < READ_ONCE(mm->nr_cpus_allowed))
- break;
- cpu_relax();
- }
- if (cpumask_test_and_set_cpu(cid, cidmask))
- return -1;
+static inline unsigned int __mm_get_cid(struct mm_struct *mm, unsigned int max_cids)
+{
+ unsigned int cid = find_first_zero_bit(mm_cidmask(mm), max_cids);
+ if (cid >= max_cids)
+ return MM_CID_UNSET;
+ if (test_and_set_bit(cid, mm_cidmask(mm)))
+ return MM_CID_UNSET;
return cid;
}
-/*
- * Save a snapshot of the current runqueue time of this cpu
- * with the per-cpu cid value, allowing to estimate how recently it was used.
- */
-static inline void mm_cid_snapshot_time(struct rq *rq, struct mm_struct *mm)
+static inline unsigned int mm_get_cid(struct mm_struct *mm)
{
- struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(rq));
+ unsigned int cid = __mm_get_cid(mm, READ_ONCE(mm->mm_cid.max_cids));
- lockdep_assert_rq_held(rq);
- WRITE_ONCE(pcpu_cid->time, rq->clock);
+ while (cid == MM_CID_UNSET) {
+ cpu_relax();
+ cid = __mm_get_cid(mm, num_possible_cpus());
+ }
+ return cid;
}
-static inline int __mm_cid_get(struct rq *rq, struct task_struct *t,
- struct mm_struct *mm)
+static inline unsigned int mm_cid_converge(struct mm_struct *mm, unsigned int orig_cid,
+ unsigned int max_cids)
{
- int cid;
+ unsigned int new_cid, cid = cpu_cid_to_cid(orig_cid);
- /*
- * All allocations (even those using the cid_lock) are lock-free. If
- * use_cid_lock is set, hold the cid_lock to perform cid allocation to
- * guarantee forward progress.
- */
- if (!READ_ONCE(use_cid_lock)) {
- cid = __mm_cid_try_get(t, mm);
- if (cid >= 0)
- goto end;
- raw_spin_lock(&cid_lock);
- } else {
- raw_spin_lock(&cid_lock);
- cid = __mm_cid_try_get(t, mm);
- if (cid >= 0)
- goto unlock;
+ /* Is it in the optimal CID space? */
+ if (likely(cid < max_cids))
+ return orig_cid;
+
+ /* Try to find one in the optimal space. Otherwise keep the provided. */
+ new_cid = __mm_get_cid(mm, max_cids);
+ if (new_cid != MM_CID_UNSET) {
+ mm_drop_cid(mm, cid);
+ /* Preserve the ONCPU mode of the original CID */
+ return new_cid | (orig_cid & MM_CID_ONCPU);
}
+ return orig_cid;
+}
- /*
- * cid concurrently allocated. Retry while forcing following
- * allocations to use the cid_lock to ensure forward progress.
- */
- WRITE_ONCE(use_cid_lock, 1);
- /*
- * Set use_cid_lock before allocation. Only care about program order
- * because this is only required for forward progress.
- */
- barrier();
- /*
- * Retry until it succeeds. It is guaranteed to eventually succeed once
- * all newcoming allocations observe the use_cid_lock flag set.
- */
- do {
- cid = __mm_cid_try_get(t, mm);
- cpu_relax();
- } while (cid < 0);
- /*
- * Allocate before clearing use_cid_lock. Only care about
- * program order because this is for forward progress.
- */
- barrier();
- WRITE_ONCE(use_cid_lock, 0);
-unlock:
- raw_spin_unlock(&cid_lock);
-end:
- mm_cid_snapshot_time(rq, mm);
+static __always_inline void mm_cid_update_task_cid(struct task_struct *t, unsigned int cid)
+{
+ if (t->mm_cid.cid != cid) {
+ t->mm_cid.cid = cid;
+ rseq_sched_set_ids_changed(t);
+ }
+}
- return cid;
+static __always_inline void mm_cid_update_pcpu_cid(struct mm_struct *mm, unsigned int cid)
+{
+ __this_cpu_write(mm->mm_cid.pcpu->cid, cid);
}
-static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
- struct mm_struct *mm)
+static __always_inline void mm_cid_from_cpu(struct task_struct *t, unsigned int cpu_cid)
{
- struct mm_cid __percpu *pcpu_cid = mm->pcpu_cid;
- int cid;
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
- lockdep_assert_rq_held(rq);
- cid = __this_cpu_read(pcpu_cid->cid);
- if (mm_cid_is_valid(cid)) {
- mm_cid_snapshot_time(rq, mm);
- return cid;
- }
- if (mm_cid_is_lazy_put(cid)) {
- if (try_cmpxchg(&this_cpu_ptr(pcpu_cid)->cid, &cid, MM_CID_UNSET))
- __mm_cid_put(mm, mm_cid_clear_lazy_put(cid));
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case where both have the ONCPU bit set */
+ if (likely(cid_on_cpu(cpu_cid & tcid))) {
+ if (likely(cpu_cid_to_cid(cpu_cid) < max_cids)) {
+ mm_cid_update_task_cid(t, cpu_cid);
+ return;
+ }
+ /* Try to converge into the optimal CID space */
+ cpu_cid = mm_cid_converge(mm, cpu_cid, max_cids);
+ } else {
+ /* Hand over or drop the task owned CID */
+ if (cid_on_task(tcid)) {
+ if (cid_on_cpu(cpu_cid))
+ mm_unset_cid_on_task(t);
+ else
+ cpu_cid = cid_to_cpu_cid(tcid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_cpu(cpu_cid))
+ cpu_cid = cid_to_cpu_cid(mm_get_cid(mm));
}
- cid = __mm_cid_get(rq, t, mm);
- __this_cpu_write(pcpu_cid->cid, cid);
- __this_cpu_write(pcpu_cid->recent_cid, cid);
-
- return cid;
+ mm_cid_update_pcpu_cid(mm, cpu_cid);
+ mm_cid_update_task_cid(t, cpu_cid);
}
-static inline void switch_mm_cid(struct rq *rq,
- struct task_struct *prev,
- struct task_struct *next)
+static __always_inline void mm_cid_from_task(struct task_struct *t, unsigned int cpu_cid)
{
- /*
- * Provide a memory barrier between rq->curr store and load of
- * {prev,next}->mm->pcpu_cid[cpu] on rq->curr->mm transition.
- *
- * Should be adapted if context_switch() is modified.
- */
- if (!next->mm) { // to kernel
- /*
- * user -> kernel transition does not guarantee a barrier, but
- * we can use the fact that it performs an atomic operation in
- * mmgrab().
- */
- if (prev->mm) // from user
- smp_mb__after_mmgrab();
- /*
- * kernel -> kernel transition does not change rq->curr->mm
- * state. It stays NULL.
- */
- } else { // to user
- /*
- * kernel -> user transition does not provide a barrier
- * between rq->curr store and load of {prev,next}->mm->pcpu_cid[cpu].
- * Provide it here.
- */
- if (!prev->mm) { // from kernel
- smp_mb();
- } else { // from user
- /*
- * user->user transition relies on an implicit
- * memory barrier in switch_mm() when
- * current->mm changes. If the architecture
- * switch_mm() does not have an implicit memory
- * barrier, it is emitted here. If current->mm
- * is unchanged, no barrier is needed.
- */
- smp_mb__after_switch_mm();
+ unsigned int max_cids, tcid = t->mm_cid.cid;
+ struct mm_struct *mm = t->mm;
+
+ max_cids = READ_ONCE(mm->mm_cid.max_cids);
+ /* Optimize for the common case, where both have the ONCPU bit clear */
+ if (likely(cid_on_task(tcid | cpu_cid))) {
+ if (likely(tcid < max_cids)) {
+ mm_cid_update_pcpu_cid(mm, tcid);
+ return;
}
+ /* Try to converge into the optimal CID space */
+ tcid = mm_cid_converge(mm, tcid, max_cids);
+ } else {
+ /* Hand over or drop the CPU owned CID */
+ if (cid_on_cpu(cpu_cid)) {
+ if (cid_on_task(tcid))
+ mm_drop_cid_on_cpu(mm, this_cpu_ptr(mm->mm_cid.pcpu));
+ else
+ tcid = cpu_cid_to_cid(cpu_cid);
+ }
+ /* Still nothing, allocate a new one */
+ if (!cid_on_task(tcid))
+ tcid = mm_get_cid(mm);
+ /* Set the transition mode flag if required */
+ tcid |= READ_ONCE(mm->mm_cid.transit);
}
- if (prev->mm_cid_active) {
- mm_cid_snapshot_time(rq, prev->mm);
- mm_cid_put_lazy(prev);
- prev->mm_cid = -1;
- }
- if (next->mm_cid_active)
- next->last_mm_cid = next->mm_cid = mm_cid_get(rq, next, next->mm);
+ mm_cid_update_pcpu_cid(mm, tcid);
+ mm_cid_update_task_cid(t, tcid);
+}
+
+static __always_inline void mm_cid_schedin(struct task_struct *next)
+{
+ struct mm_struct *mm = next->mm;
+ unsigned int cpu_cid;
+
+ if (!next->mm_cid.active)
+ return;
+
+ cpu_cid = __this_cpu_read(mm->mm_cid.pcpu->cid);
+ if (likely(!READ_ONCE(mm->mm_cid.percpu)))
+ mm_cid_from_task(next, cpu_cid);
+ else
+ mm_cid_from_cpu(next, cpu_cid);
+}
+
+static __always_inline void mm_cid_schedout(struct task_struct *prev)
+{
+ /* During mode transitions CIDs are temporary and need to be dropped */
+ if (likely(!cid_in_transit(prev->mm_cid.cid)))
+ return;
+
+ mm_drop_cid(prev->mm, cid_from_transit_cid(prev->mm_cid.cid));
+ prev->mm_cid.cid = MM_CID_UNSET;
+}
+
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next)
+{
+ mm_cid_schedout(prev);
+ mm_cid_schedin(next);
}
#else /* !CONFIG_SCHED_MM_CID: */
-static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
-static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
-static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
-static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
-static inline void init_sched_mm_cid(struct task_struct *t) { }
+static inline void mm_cid_switch_to(struct task_struct *prev, struct task_struct *next) { }
#endif /* !CONFIG_SCHED_MM_CID */
extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
diff --git a/kernel/signal.c b/kernel/signal.c
index fe9190d84f28..e42b8bd6922f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3125,7 +3125,6 @@ void exit_signals(struct task_struct *tsk)
cgroup_threadgroup_change_begin(tsk);
if (thread_group_empty(tsk) || (tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
- sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);
return;
@@ -3136,7 +3135,6 @@ void exit_signals(struct task_struct *tsk)
* From now this task is not visible for group-wide signals,
* see wants_signal(), do_signal_stop().
*/
- sched_mm_cid_exit_signals(tsk);
tsk->flags |= PF_EXITING;
cgroup_threadgroup_change_end(tsk);