Message-ID: <aQufmyZ7X7NdfiCL@slm.duckdns.org>
Date: Wed, 5 Nov 2025 09:03:55 -1000
From: Tejun Heo <tj@...nel.org>
To: Calvin Owens <calvin@...nvd.org>
Cc: linux-kernel@...r.kernel.org, Dan Schatzberg <dschatzberg@...a.com>,
Peter Zijlstra <peterz@...radead.org>,
Sebastian Andrzej Siewior <bigeasy@...utronix.de>
Subject: [PATCH cgroup/for-6.19] cgroup: Fix sleeping from invalid context
warning on PREEMPT_RT
cgroup_task_dead() is called from finish_task_switch(), which runs with
preemption disabled and doesn't allow scheduling even on PREEMPT_RT. The
function needs to acquire css_set_lock, a regular spinlock that becomes a
sleeping lock on RT kernels, leading to "sleeping function called from
invalid context" warnings.
css_set_lock is too large in scope to convert to a raw_spinlock. However,
the unlinking operations don't need to run synchronously - they just need
to complete after the task is done running.
On PREEMPT_RT, defer the work through irq_work.
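
In sketch form, the dead task is pushed onto a per-CPU lockless list from
the atomic context and the actual unlinking happens later from irq_work
context (the full patch is below; this is just the distilled shape):

  cgroup_task_dead(task)           /* atomic, can't take css_set_lock */
      get_task_struct(task);       /* keep the task alive for the deferred work */
      llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
      irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));

  cgrp_dead_tasks_iwork_fn(iwork)  /* IIUC, on RT, irq_work not marked
                                      IRQ_WORK_HARD_IRQ runs from the per-CPU
                                      irq_work kthread, where sleeping locks
                                      are fine */
      llist_del_all(), then do_cgroup_task_dead() + put_task_struct()
      on each entry
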
Fixes: d245698d727a ("cgroup: Defer task cgroup unlink until after the task is done switching out")
Reported-by: Calvin Owens <calvin@...nvd.org>
Link: https://lore.kernel.org/r/20251104181114.489391-1-calvin@wbinvd.org
Signed-off-by: Tejun Heo <tj@...nel.org>
---
Hello,
Calvin, this seems to work fine here but can you please try it out?
Sebastian, Peter, does this look okay to you guys?
Thanks.
include/linux/sched.h | 5 +++-
kernel/cgroup/cgroup.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 56 insertions(+), 2 deletions(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1324,7 +1324,10 @@ struct task_struct {
struct css_set __rcu *cgroups;
/* cg_list protected by css_set_lock and tsk->alloc_lock: */
struct list_head cg_list;
-#endif
+#ifdef CONFIG_PREEMPT_RT
+ struct llist_node cg_dead_lnode;
+#endif /* CONFIG_PREEMPT_RT */
+#endif /* CONFIG_CGROUPS */
#ifdef CONFIG_X86_CPU_RESCTRL
u32 closid;
u32 rmid;
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -290,6 +290,7 @@ static void kill_css(struct cgroup_subsy
static int cgroup_addrm_files(struct cgroup_subsys_state *css,
struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
+static void cgroup_rt_init(void);
#ifdef CONFIG_DEBUG_CGROUP_REF
#define CGROUP_REF_FN_ATTRS noinline
@@ -6360,6 +6361,7 @@ int __init cgroup_init(void)
BUG_ON(ss_rstat_init(NULL));
get_user_ns(init_cgroup_ns.user_ns);
+ cgroup_rt_init();
cgroup_lock();
@@ -6990,7 +6992,7 @@ void cgroup_task_exit(struct task_struct
} while_each_subsys_mask();
}
-void cgroup_task_dead(struct task_struct *tsk)
+static void do_cgroup_task_dead(struct task_struct *tsk)
{
struct css_set *cset;
unsigned long flags;
@@ -7016,6 +7018,55 @@ void cgroup_task_dead(struct task_struct
spin_unlock_irqrestore(&css_set_lock, flags);
}
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * cgroup_task_dead() is called from finish_task_switch() which doesn't allow
+ * scheduling even on RT. As the task_dead path requires grabbing css_set_lock,
+ * this leads to "sleeping function called from invalid context" warnings.
+ * css_set_lock is too big in scope to become a raw_spinlock. The task_dead
+ * path doesn't need to run synchronously. Bounce through irq_work instead.
+ */
+static DEFINE_PER_CPU(struct llist_head, cgrp_dead_tasks);
+static DEFINE_PER_CPU(struct irq_work, cgrp_dead_tasks_iwork);
+
+static void cgrp_dead_tasks_iwork_fn(struct irq_work *iwork)
+{
+ struct llist_node *lnode;
+ struct task_struct *task, *next;
+
+ lnode = llist_del_all(this_cpu_ptr(&cgrp_dead_tasks));
+ llist_for_each_entry_safe(task, next, lnode, cg_dead_lnode) {
+ do_cgroup_task_dead(task);
+ put_task_struct(task);
+ }
+}
+
+static void __init cgroup_rt_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ init_llist_head(per_cpu_ptr(&cgrp_dead_tasks, cpu));
+ init_irq_work(per_cpu_ptr(&cgrp_dead_tasks_iwork, cpu),
+ cgrp_dead_tasks_iwork_fn);
+ }
+}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+ get_task_struct(task);
+ llist_add(&task->cg_dead_lnode, this_cpu_ptr(&cgrp_dead_tasks));
+ irq_work_queue(this_cpu_ptr(&cgrp_dead_tasks_iwork));
+}
+#else /* CONFIG_PREEMPT_RT */
+static void __init cgroup_rt_init(void) {}
+
+void cgroup_task_dead(struct task_struct *task)
+{
+ do_cgroup_task_dead(task);
+}
+#endif /* CONFIG_PREEMPT_RT */
+
void cgroup_task_release(struct task_struct *task)
{
struct cgroup_subsys *ss;