diff options
| author | Thomas Gleixner <tglx@linutronix.de> | 2025-03-11 23:07:44 +0100 |
|---|---|---|
| committer | Thomas Gleixner <tglx@linutronix.de> | 2025-03-13 12:07:18 +0100 |
| commit | ec2d0c04624b3c8a7eb1682e006717fa20cfbe24 (patch) | |
| tree | 8e9ca7af0327fc93d76e5437296984e5b6f200f7 /kernel/time/posix-timers.c | |
| parent | 2dc4dbf89cf186639c25c1b04a07c11496f060ad (diff) | |
posix-timers: Provide a mechanism to allocate a given timer ID
Checkpoint/Restore in Userspace (CRIU) requires to reconstruct posix timers
with the same timer ID on restore. It uses sys_timer_create() and relies on
the monotonic increasing timer ID provided by this syscall. It creates and
deletes timers until the desired ID is reached. This is can loop for a long
time, when the checkpointed process had a very sparse timer ID range.
It has been debated to implement a new syscall to allow the creation of
timers with a given timer ID, but that's tideous due to the 32/64bit compat
issues of sigevent_t and of dubious value.
The restore mechanism of CRIU creates the timers in a state where all
threads of the restored process are held on a barrier and cannot issue
syscalls. That means the restorer task has exclusive control.
This allows to address this issue with a prctl() so that the restorer
thread can do:
if (prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_ON))
goto linear_mode;
create_timers_with_explicit_ids();
prctl(PR_TIMER_CREATE_RESTORE_IDS, PR_TIMER_CREATE_RESTORE_IDS_OFF);
This is backwards compatible because the prctl() fails on older kernels and
CRIU can fall back to the linear timer ID mechanism. CRIU versions which do
not know about the prctl() just work as before.
Implement the prctl() and modify timer_create() so that it copies the
requested timer ID from userspace by utilizing the existing timer_t
pointer, which is used to copy out the allocated timer ID on success.
If the prctl() is disabled, which it is by default, timer_create() works as
before and does not try to read from the userspace pointer.
There is no problem when a broken or rogue user space application enables
the prctl(). If the user space pointer does not contain a valid ID, then
timer_create() fails. If the data is not initialized, but constains a
random valid ID, timer_create() will create that random timer ID or fail if
the ID is already given out.
As CRIU must use the raw syscall to avoid manipulating the internal state
of the restored process, this has no library dependencies and can be
adopted by CRIU right away.
Recreating two timers with IDs 1000000 and 2000000 takes 1.5 seconds with
the create/delete method. With the prctl() it takes 3 microseconds.
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <frederic@kernel.org>
Reviewed-by: Cyrill Gorcunov <gorcunov@gmail.com>
Tested-by: Cyrill Gorcunov <gorcunov@gmail.com>
Link: https://lore.kernel.org/all/87jz8vz0en.ffs@tglx
Diffstat (limited to 'kernel/time/posix-timers.c')
| -rw-r--r-- | kernel/time/posix-timers.c | 105 |
1 files changed, 79 insertions, 26 deletions
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index b917a162d84a..2ca1c55d38a2 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -19,6 +19,7 @@ #include <linux/nospec.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> +#include <linux/prctl.h> #include <linux/sched/task.h> #include <linux/slab.h> #include <linux/syscalls.h> @@ -57,6 +58,8 @@ static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; +#define TIMER_ANY_ID INT_MIN + /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) @@ -128,38 +131,60 @@ static bool posix_timer_hashed(struct timer_hash_bucket *bucket, struct signal_s return false; } -static int posix_timer_add(struct k_itimer *timer) +static bool posix_timer_add_at(struct k_itimer *timer, struct signal_struct *sig, unsigned int id) +{ + struct timer_hash_bucket *bucket = hash_bucket(sig, id); + + scoped_guard (spinlock, &bucket->lock) { + /* + * Validate under the lock as this could have raced against + * another thread ending up with the same ID, which is + * highly unlikely, but possible. + */ + if (!posix_timer_hashed(bucket, sig, id)) { + /* + * Set the timer ID and the signal pointer to make + * it identifiable in the hash table. The signal + * pointer has bit 0 set to indicate that it is not + * yet fully initialized. posix_timer_hashed() + * masks this bit out, but the syscall lookup fails + * to match due to it being set. This guarantees + * that there can't be duplicate timer IDs handed + * out. + */ + timer->it_id = (timer_t)id; + timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); + hlist_add_head_rcu(&timer->t_hash, &bucket->head); + return true; + } + } + return false; +} + +static int posix_timer_add(struct k_itimer *timer, int req_id) { struct signal_struct *sig = current->signal; + if (unlikely(req_id != TIMER_ANY_ID)) { + if (!posix_timer_add_at(timer, sig, req_id)) + return -EBUSY; + + /* + * Move the ID counter past the requested ID, so that after + * switching back to normal mode the IDs are outside of the + * exact allocated region. That avoids ID collisions on the + * next regular timer_create() invocations. + */ + atomic_set(&sig->next_posix_timer_id, req_id + 1); + return req_id; + } + for (unsigned int cnt = 0; cnt <= INT_MAX; cnt++) { /* Get the next timer ID and clamp it to positive space */ unsigned int id = atomic_fetch_inc(&sig->next_posix_timer_id) & INT_MAX; - struct timer_hash_bucket *bucket = hash_bucket(sig, id); - scoped_guard (spinlock, &bucket->lock) { - /* - * Validate under the lock as this could have raced - * against another thread ending up with the same - * ID, which is highly unlikely, but possible. - */ - if (!posix_timer_hashed(bucket, sig, id)) { - /* - * Set the timer ID and the signal pointer to make - * it identifiable in the hash table. The signal - * pointer has bit 0 set to indicate that it is not - * yet fully initialized. posix_timer_hashed() - * masks this bit out, but the syscall lookup fails - * to match due to it being set. This guarantees - * that there can't be duplicate timer IDs handed - * out. - */ - timer->it_id = (timer_t)id; - timer->it_signal = (struct signal_struct *)((unsigned long)sig | 1UL); - hlist_add_head_rcu(&timer->t_hash, &bucket->head); - return id; - } - } + if (posix_timer_add_at(timer, sig, id)) + return id; cond_resched(); } /* POSIX return code when no timer ID could be allocated */ @@ -364,6 +389,21 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } +long posixtimer_create_prctl(unsigned long ctrl) +{ + switch (ctrl) { + case PR_TIMER_CREATE_RESTORE_IDS_OFF: + current->signal->timer_create_restore_ids = 0; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_ON: + current->signal->timer_create_restore_ids = 1; + return 0; + case PR_TIMER_CREATE_RESTORE_IDS_GET: + return current->signal->timer_create_restore_ids; + } + return -EINVAL; +} + static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); @@ -435,6 +475,7 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); + timer_t req_id = TIMER_ANY_ID; struct k_itimer *new_timer; int error, new_timer_id; @@ -449,11 +490,20 @@ static int do_timer_create(clockid_t which_clock, struct sigevent *event, spin_lock_init(&new_timer->it_lock); + /* Special case for CRIU to restore timers with a given timer ID. */ + if (unlikely(current->signal->timer_create_restore_ids)) { + if (copy_from_user(&req_id, created_timer_id, sizeof(req_id))) + return -EFAULT; + /* Valid IDs are 0..INT_MAX */ + if ((unsigned int)req_id > INT_MAX) + return -EINVAL; + } + /* * Add the timer to the hash table. The timer is not yet valid * after insertion, but has a unique ID allocated. */ - new_timer_id = posix_timer_add(new_timer); + new_timer_id = posix_timer_add(new_timer, req_id); if (new_timer_id < 0) { posixtimer_free_timer(new_timer); return new_timer_id; @@ -1041,6 +1091,9 @@ void exit_itimers(struct task_struct *tsk) struct hlist_node *next; struct k_itimer *timer; + /* Clear restore mode for exec() */ + tsk->signal->timer_create_restore_ids = 0; + if (hlist_empty(&tsk->signal->posix_timers)) return; |