| author | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:04:07 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2025-12-03 13:04:07 -0800 |
| commit | 8449d3252c2603a51ffc7c36cb5bd94874378b7d (patch) | |
| tree | e834b0c0569532e33e622a6966ae67632d2cab66 /kernel/cgroup/cgroup.c | |
| parent | 2b60145734a0e5a4b73952a540928d2c4f4fed64 (diff) | |
| parent | b1bcaed1e39a9e0dfbe324a15d2ca4253deda316 (diff) | |
Merge tag 'cgroup-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:
- Defer task cgroup unlink until after the dying task's final context
switch so that controllers see the cgroup properly populated until
the task is truly gone (the deferral mechanism is sketched right after
this list)
- cpuset cleanups and simplifications.
Enforce that domain-isolated CPUs stay in root or isolated partitions
and fail if isolated + nohz_full would leave no housekeeping CPU. Fix
sched/deadline root domain handling during CPU hot-unplug and a race
for tasks in attaching cpusets
- Misc fixes including memory reclaim protection documentation and
selftest KTAP conformance
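
The deferral in the first bullet leans on a small, reusable kernel pattern: a context that must not take sleeping locks (here, the dying task's final context switch, which is especially restrictive on PREEMPT_RT) pushes the task onto a per-CPU lock-free llist and queues a lazy irq_work, and the irq_work callback later drains the list and performs the real cleanup under the normal spinlock. The sketch below restates that pattern in simplified form; `defer_list`, `defer_work`, `defer_work_fn`, `defer_task_cleanup` and `do_deferred_cleanup` are illustrative names rather than the series' identifiers, while `cg_dead_lnode` is the llist node the series adds to task_struct. The actual implementation (using `cgrp_dead_tasks` and `cgroup_task_dead()`) is in the diff at the bottom of this page.

```c
#include <linux/irq_work.h>
#include <linux/llist.h>
#include <linux/percpu.h>
#include <linux/sched/task.h>

/* Per-CPU stash of tasks whose cgroup cleanup has been deferred. */
static DEFINE_PER_CPU(struct llist_head, defer_list);
static DEFINE_PER_CPU(struct irq_work, defer_work);

/* The real work: runs later, where taking css_set_lock is allowed. */
static void do_deferred_cleanup(struct task_struct *task)
{
	/* grab css_set_lock, unlink the task's css_set, drop refs, ... */
}

static void defer_work_fn(struct irq_work *work)
{
	struct llist_node *lnode = llist_del_all(this_cpu_ptr(&defer_list));
	struct task_struct *task, *next;

	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
		do_deferred_cleanup(task);
		put_task_struct(task);	/* pairs with the get below */
	}
}

static void __init defer_init(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		init_llist_head(per_cpu_ptr(&defer_list, cpu));
		per_cpu(defer_work, cpu) = IRQ_WORK_INIT_LAZY(defer_work_fn);
	}
}

/* Called from the final context switch: no sleeping locks allowed here. */
static void defer_task_cleanup(struct task_struct *task)
{
	get_task_struct(task);	/* keep the task pinned until the irq_work runs */
	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&defer_list));
	irq_work_queue(this_cpu_ptr(&defer_work));
}
```

IRQ_WORK_INIT_LAZY lets consecutive task deaths on a CPU batch into one callback while still guaranteeing the work runs soon, which is the trade-off the in-tree comment describes as allowing batching while ensuring timely completion.
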
* tag 'cgroup-for-6.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (21 commits)
cpuset: Treat cpusets in attaching as populated
sched/deadline: Walk up cpuset hierarchy to decide root domain when hot-unplug
cgroup/cpuset: Introduce cpuset_cpus_allowed_locked()
docs: cgroup: No special handling of unpopulated memcgs
docs: cgroup: Note about sibling relative reclaim protection
docs: cgroup: Explain reclaim protection target
selftests/cgroup: conform test to KTAP format output
cpuset: remove need_rebuild_sched_domains
cpuset: remove global remote_children list
cpuset: simplify node setting on error
cgroup: include missing header for struct irq_work
cgroup: Fix sleeping from invalid context warning on PREEMPT_RT
cgroup/cpuset: Globally track isolated_cpus update
cgroup/cpuset: Ensure domain isolated CPUs stay in root or isolated partition
cgroup/cpuset: Move up prstate_housekeeping_conflict() helper
cgroup/cpuset: Fail if isolated and nohz_full don't leave any housekeeping
cgroup/cpuset: Rename update_unbound_workqueue_cpumask() to update_isolation_cpumasks()
cgroup: Defer task cgroup unlink until after the task is done switching out
cgroup: Move dying_tasks cleanup from cgroup_task_release() to cgroup_task_free()
cgroup: Rename cgroup lifecycle hooks to cgroup_task_*()
...
Diffstat (limited to 'kernel/cgroup/cgroup.c')
| -rw-r--r-- | kernel/cgroup/cgroup.c | 91 |
1 file changed, 76 insertions, 15 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ae1eb7a85eb4..fa08ea288737 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -60,6 +60,7 @@
 #include <linux/sched/deadline.h>
 #include <linux/psi.h>
 #include <linux/nstree.h>
+#include <linux/irq_work.h>
 #include <net/sock.h>
 
 #define CREATE_TRACE_POINTS
@@ -287,6 +288,7 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup_subsys_state *css,
 			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
+static void cgroup_rt_init(void);
 
 #ifdef CONFIG_DEBUG_CGROUP_REF
 #define CGROUP_REF_FN_ATTRS	noinline
@@ -941,7 +943,8 @@ static void css_set_move_task(struct task_struct *task,
 		/*
 		 * We are synchronized through cgroup_threadgroup_rwsem
 		 * against PF_EXITING setting such that we can't race
-		 * against cgroup_exit()/cgroup_free() dropping the css_set.
+		 * against cgroup_task_dead()/cgroup_task_free() dropping
+		 * the css_set.
 		 */
 		WARN_ON_ONCE(task->flags & PF_EXITING);
 
@@ -6354,6 +6357,7 @@ int __init cgroup_init(void)
 	BUG_ON(ss_rstat_init(NULL));
 
 	get_user_ns(init_cgroup_ns.user_ns);
+	cgroup_rt_init();
 
 	cgroup_lock();
 
@@ -6967,19 +6971,29 @@ void cgroup_post_fork(struct task_struct *child,
 }
 
 /**
- * cgroup_exit - detach cgroup from exiting task
+ * cgroup_task_exit - detach cgroup from exiting task
  * @tsk: pointer to task_struct of exiting process
  *
  * Description: Detach cgroup from @tsk.
  *
  */
-void cgroup_exit(struct task_struct *tsk)
+void cgroup_task_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
-	struct css_set *cset;
 	int i;
 
-	spin_lock_irq(&css_set_lock);
+	/* see cgroup_post_fork() for details */
+	do_each_subsys_mask(ss, i, have_exit_callback) {
+		ss->exit(tsk);
+	} while_each_subsys_mask();
+}
+
+static void do_cgroup_task_dead(struct task_struct *tsk)
+{
+	struct css_set *cset;
+	unsigned long flags;
+
+	spin_lock_irqsave(&css_set_lock, flags);
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
@@ -6997,15 +7011,61 @@ void cgroup_exit(struct task_struct *tsk)
 		     test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
 		cgroup_update_frozen(task_dfl_cgroup(tsk));
 
-	spin_unlock_irq(&css_set_lock);
+	spin_unlock_irqrestore(&css_set_lock, flags);
+}
 
-	/* see cgroup_post_fork() for details */
-	do_each_subsys_mask(ss, i, have_exit_callback) {
-		ss->exit(tsk);
-	} while_each_subsys_mask();
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even in RT. As the task_dead path requires grabbing css_set_lock,
+ * this lead to sleeping in the invalid context warning bug. css_set_lock is too
+ * big to become a raw_spinlock. The task_dead path doesn't need to run
+ * synchronously but can't be delayed indefinitely either as the dead task pins
+ * the cgroup and task_struct can be pinned indefinitely. Bounce through lazy
+ * irq_work to allow batching while ensuring timely completion.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+	struct llist_node *lnode;
+	struct task_struct *task, *next;
+
+	lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+	llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+		do_cgroup_task_dead(task);
+		put_task_struct(task);
+	}
+}
+
+static void __init cgroup_rt_init(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+		per_cpu(cgrp_dead_tasks_iwork, cpu) =
+			IRQ_WORK_INIT_LAZY(cgrp_dead_tasks_iwork_fn);
+	}
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+	get_task_struct(task);
+	llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+	irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
 }
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
 
-void cgroup_release(struct task_struct *task)
+void cgroup_task_dead(struct task_struct *task)
+{
+	do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
+void cgroup_task_release(struct task_struct *task)
 {
 	struct cgroup_subsys *ss;
 	int ssid;
@@ -7013,6 +7073,11 @@ void cgroup_release(struct task_struct *task)
 	do_each_subsys_mask(ss, ssid, have_release_callback) {
 		ss->release(task);
 	} while_each_subsys_mask();
+}
+
+void cgroup_task_free(struct task_struct *task)
+{
+	struct css_set *cset = task_css_set(task);
 
 	if (!list_empty(&task->cg_list)) {
 		spin_lock_irq(&css_set_lock);
 		css_set_skip_task_iters(task_css_set(task), task);
 		list_del_init(&task->cg_list);
 		spin_unlock_irq(&css_set_lock);
 	}
-}
 
-void cgroup_free(struct task_struct *task)
-{
-	struct css_set *cset = task_css_set(task);
 	put_css_set(cset);
 }
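
For readers tracing where the `ss->exit(tsk)` and `ss->release(task)` dispatches above end up, controllers supply those callbacks through their `struct cgroup_subsys`. The fragment below is a hypothetical minimal controller, not part of this series (the `demo_*` names are illustrative), showing the shape of the two per-task callbacks that `cgroup_task_exit()` and `cgroup_task_release()` invoke.

```c
#include <linux/cgroup-defs.h>
#include <linux/sched.h>

/* Hypothetical controller: only the per-task lifecycle callbacks are shown. */

static void demo_exit(struct task_struct *task)
{
	/* Dispatched from cgroup_task_exit() via have_exit_callback. */
}

static void demo_release(struct task_struct *task)
{
	/* Dispatched from cgroup_task_release() via have_release_callback. */
}

struct cgroup_subsys demo_cgrp_subsys = {
	.exit		= demo_exit,
	.release	= demo_release,
};
```

Real controllers are declared through include/linux/cgroup_subsys.h rather than defined ad hoc like this; the point here is only which hooks the renamed entry points still reach, with the heavier css_set unlink now handled separately by cgroup_task_dead() and cgroup_task_free().
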