[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251029061918.4179554-4-tj@kernel.org>
Date: Tue, 28 Oct 2025 20:19:17 -1000
From: Tejun Heo <tj@...nel.org>
To: David Vernet <void@...ifault.com>,
Andrea Righi <arighi@...dia.com>,
Changwoo Min <changwoo@...lia.com>
Cc: Dan Schatzberg <dschatzberg@...a.com>,
Peter Zijlstra <peterz@...radead.org>,
linux-kernel@...r.kernel.org,
cgroups@...r.kernel.org,
sched-ext@...ts.linux.dev,
Tejun Heo <tj@...nel.org>
Subject: [PATCH 3/4] cgroup: Defer task cgroup unlink until after the task is done switching out
When a task exits, css_set_move_task(tsk, cset, NULL, false) unlinks the task
from its cgroup. From the cgroup's perspective, the task is now gone. If this
makes the cgroup empty, it can be removed, triggering ->css_offline() callbacks
that notify controllers the cgroup is going offline resource-wise.
However, the exiting task can still run, perform memory operations, and schedule
until the final context switch in finish_task_switch(). This creates a confusing
situation where controllers are told a cgroup is offline while resource
activities are still happening in it. While this hasn't broken existing
controllers, it has caused direct confusion for sched_ext schedulers.
Split cgroup_task_exit() into two functions. cgroup_task_exit() now only calls
the subsystem exit callbacks and continues to be called from do_exit(). The
css_set cleanup is moved to the new cgroup_task_dead() which is called from
finish_task_switch() after the final context switch, so that the cgroup only
appears empty after the task is truly done running.
This also reorders operations so that subsys->exit() is now called before
unlinking from the cgroup, which shouldn't break anything.
Cc: Dan Schatzberg <dschatzberg@...a.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Signed-off-by: Tejun Heo <tj@...nel.org>
---
include/linux/cgroup.h | 2 ++
kernel/cgroup/cgroup.c | 23 ++++++++++++++---------
kernel/sched/core.c | 2 ++
3 files changed, 18 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 4068035176c4..bc892e3b37ee 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -138,6 +138,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
extern void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs);
void cgroup_task_exit(struct task_struct *p);
+void cgroup_task_dead(struct task_struct *p);
void cgroup_task_release(struct task_struct *p);
void cgroup_task_free(struct task_struct *p);
@@ -681,6 +682,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
static inline void cgroup_post_fork(struct task_struct *p,
struct kernel_clone_args *kargs) {}
static inline void cgroup_task_exit(struct task_struct *p) {}
+static inline void cgroup_task_dead(struct task_struct *p) {}
static inline void cgroup_task_release(struct task_struct *p) {}
static inline void cgroup_task_free(struct task_struct *p) {}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b3c27900c5d2..aae180d56c8c 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -944,7 +944,7 @@ static void css_set_move_task(struct task_struct *task,
/*
* We are synchronized through cgroup_threadgroup_rwsem
* against PF_EXITING setting such that we can't race
- * against cgroup_task_exit()/cgroup_task_free() dropping
+ * against cgroup_task_dead()/cgroup_task_free() dropping
* the css_set.
*/
WARN_ON_ONCE(task->flags & PF_EXITING);
@@ -6982,10 +6982,20 @@ void cgroup_post_fork(struct task_struct *child,
void cgroup_task_exit(struct task_struct *tsk)
{
struct cgroup_subsys *ss;
- struct css_set *cset;
int i;
- spin_lock_irq(&css_set_lock);
+ /* see cgroup_post_fork() for details */
+ do_each_subsys_mask(ss, i, have_exit_callback) {
+ ss->exit(tsk);
+ } while_each_subsys_mask();
+}
+
+void cgroup_task_dead(struct task_struct *tsk)
+{
+ struct css_set *cset;
+ unsigned long flags;
+
+ spin_lock_irqsave(&css_set_lock, flags);
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
@@ -7003,12 +7013,7 @@ void cgroup_task_exit(struct task_struct *tsk)
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
- spin_unlock_irq(&css_set_lock);
-
- /* see cgroup_post_fork() for details */
- do_each_subsys_mask(ss, i, have_exit_callback) {
- ss->exit(tsk);
- } while_each_subsys_mask();
+ spin_unlock_irqrestore(&css_set_lock, flags);
}
void cgroup_task_release(struct task_struct *task)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f1ebf67b48e2..40f12e37f60f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5222,6 +5222,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
+ cgroup_task_dead(prev);
+
/* Task is done with its stack. */
put_task_stack(prev);
--
2.51.1
Powered by blists - more mailing lists