Message-ID: <roisfgpkd7tapp7cfjavmih2e2riwh2nczv4nqk25gik7of4pa@3ohyptw6nvb3>
Date: Thu, 29 Jan 2026 14:22:32 +0100
From: Michal Koutný <mkoutny@...e.com>
To: Mateusz Guzik <mjguzik@...il.com>
Cc: tj@...nel.org, hannes@...xchg.org, brauner@...nel.org,
linux-kernel@...r.kernel.org, cgroups@...r.kernel.org
Subject: Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()
On Tue, Jan 27, 2026 at 07:18:44PM +0100, Mateusz Guzik <mjguzik@...il.com> wrote:
> Not sure what you mean here.
Let me add a patch for illustration.
Does that clarify? (And how would that change the metric you're watching?)
...
> It may need some tidy ups but for the purpose of this discussion we
> can pretend it landed.
That I follow.
> clone + exit codepaths are globally serialized as follows:
> - pidmap -- once per side
> - tasklist -- once on clone, twice on exit
> - cgroups -- twice on clone, once on exit
> - some lock for random harvesting, can probably just go
...
> So ye, css very much can be considered a problem here.
Acknowledged.
>
> This comment:
> > - effect of css_set_lock in cgroup_post_fork().
>
> ... I don't get whatsoever.
I meant that css_set_lock is taken a second time in cgroup_post_fork()
(and it still remains there after this rework).
And I'm wondering whether removing it only from cgroup_css_set_fork()
improves parallelism: before the patch, tasks queue on the first
css_set_lock and are serialized through the first critical section, so
by the time they arrive at the second critical section in
cgroup_post_fork() their arrival rate is already reduced. Hence the
second pass through the critical section should be less contended
(i.e. without waiting).
I understand it's good to reduce the overall hold time of (every
mentioned) lock, but I'm unsure how much it helps to go from two
css_set_lock acquisitions to one on the clone path.
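To make that concrete, here is a throw-away user-space model (not kernel
code; the lock names, iteration counts and the amount of "work" per
section are all invented for illustration) of N "clone" iterations that
pass through two global critical sections vs. only the second one.
Comparing the two runs is roughly the experiment I have in mind:

/*
 * Toy model of the clone path: two global critical sections, loosely
 * standing in for cgroup_css_set_fork() and cgroup_post_fork().
 *
 *   gcc -O2 -pthread toy_clone_locks.c -o toy_clone_locks
 *   ./toy_clone_locks 8 1   # 8 threads, both critical sections
 *   ./toy_clone_locks 8 0   # 8 threads, first critical section removed
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ITERS 200000

static pthread_mutex_t fork_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ 1st css_set_lock pass */
static pthread_mutex_t post_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ 2nd css_set_lock pass */
static int use_fork_lock;
static long snapshots, links;

static void spin_work(int n)
{
	for (volatile int i = 0; i < n; i++)
		;
}

static void *worker(void *arg)
{
	(void)arg;
	for (int i = 0; i < ITERS; i++) {
		spin_work(100);			/* lockless part of clone */

		if (use_fork_lock) {
			pthread_mutex_lock(&fork_lock);
			snapshots++;		/* ~ cset + kill_seq snapshot */
			pthread_mutex_unlock(&fork_lock);
		}

		spin_work(100);			/* more lockless work */

		pthread_mutex_lock(&post_lock);
		links++;			/* ~ linking into cset->tasks */
		pthread_mutex_unlock(&post_lock);
	}
	return NULL;
}

int main(int argc, char **argv)
{
	int nthreads = argc > 1 ? atoi(argv[1]) : 4;
	struct timespec t0, t1;

	use_fork_lock = argc > 2 ? atoi(argv[2]) : 1;

	pthread_t *tids = calloc(nthreads, sizeof(*tids));

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < nthreads; i++)
		pthread_create(&tids[i], NULL, worker, NULL);
	for (int i = 0; i < nthreads; i++)
		pthread_join(tids[i], NULL);
	clock_gettime(CLOCK_MONOTONIC, &t1);

	double secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	printf("%d threads, fork-time lock %s: %.0f clones/s (%ld snapshots, %ld links)\n",
	       nthreads, use_fork_lock ? "on" : "off",
	       nthreads * (double)ITERS / secs, snapshots, links);
	free(tids);
	return 0;
}

The real fork path obviously does much more work outside the locks, so
this only illustrates the shape of the question, not the magnitude.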
>
> Stability of cgroup placement aside, to my reading the lock is needed
> in part to serialize addition of the task to the cgroup list. No
> matter what this will have to be serialized both ways with the same
> thing.
Yes, modifying the css_set->tasks list still needs css_set_lock in
cgroup_post_fork().
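In user-space terms, the split the patch below attempts looks roughly
like this (a sketch only, not kernel code; the structures and names are
simplified stand-ins, and refcounting, cgrp vs. dfl_cgrp and error
handling are all omitted):

/*
 * The fork-time snapshot only reads task->cgroups and kill_seq, so it
 * can take a read lock; linking the child into cset->tasks modifies a
 * shared list and keeps the exclusive lock.
 *
 *   gcc -O2 -pthread toy_clone_split.c -o toy_clone_split
 */
#include <pthread.h>
#include <stdio.h>

struct toy_cset {
	struct toy_task *tasks;		/* ~ css_set->tasks */
	unsigned long kill_seq;		/* ~ cgrp->kill_seq */
};

struct toy_task {
	struct toy_cset *cgroups;	/* ~ task->cgroups */
	struct toy_task *cg_next;	/* ~ task->cg_list */
};

static pthread_rwlock_t clone_lock = PTHREAD_RWLOCK_INITIALIZER; /* ~ css_set_clone_lock */
static pthread_mutex_t set_lock = PTHREAD_MUTEX_INITIALIZER;	  /* ~ css_set_lock */

/* ~ cgroup_css_set_fork(): read-only snapshot, many forks run in parallel */
static void toy_css_set_fork(struct toy_task *parent,
			     struct toy_cset **cset, unsigned long *kill_seq)
{
	pthread_rwlock_rdlock(&clone_lock);
	*cset = parent->cgroups;
	*kill_seq = (*cset)->kill_seq;
	pthread_rwlock_unlock(&clone_lock);
}

/* ~ cgroup_post_fork(): the list add still needs the exclusive lock */
static void toy_post_fork(struct toy_task *child, struct toy_cset *cset)
{
	pthread_mutex_lock(&set_lock);
	/* the child isn't running yet, so no clone_lock for its ->cgroups */
	child->cgroups = cset;
	child->cg_next = cset->tasks;	/* ~ css_set_move_task() */
	cset->tasks = child;
	pthread_mutex_unlock(&set_lock);
}

/* ~ migration / __cgroup_kill(): the write side excludes fork-time readers */
static void toy_move_or_kill(struct toy_task *task, struct toy_cset *to)
{
	pthread_mutex_lock(&set_lock);
	pthread_rwlock_wrlock(&clone_lock);
	task->cgroups = to;		/* (list manipulation omitted) */
	to->kill_seq++;
	pthread_rwlock_unlock(&clone_lock);
	pthread_mutex_unlock(&set_lock);
}

int main(void)
{
	struct toy_cset cset = { 0 };
	struct toy_task parent = { .cgroups = &cset }, child = { 0 };
	struct toy_cset *snap;
	unsigned long seq;

	toy_css_set_fork(&parent, &snap, &seq);
	toy_post_fork(&child, snap);
	toy_move_or_kill(&child, &cset);
	printf("kill_seq=%lu, first task=%p\n", cset.kill_seq, (void *)cset.tasks);
	return 0;
}

The point being that the read side (fork) can run concurrently, while
anything touching cset->tasks or another task's ->cgroups still
serializes on the exclusive locks.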
>
> Perhaps said stability can be assured in other ways and the list can
> be decomposed, but that's some complexity which is not warranted.
The decomposition is not obvious to me :-/
--- 8< ---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bc892e3b37eea..a176fd60ba08f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -413,11 +413,13 @@ static inline void cgroup_unlock(void)
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
+extern rwlock_t css_set_clone_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
rcu_read_lock_sched_held() || \
lockdep_is_held(&cgroup_mutex) || \
lockdep_is_held(&css_set_lock) || \
+ lockdep_is_held(&css_set_clone_lock) || \
((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c) \
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b049102..4e28e922e5668 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -83,15 +83,21 @@
* css_set_lock protects task->cgroups pointer, the list of css_set
* objects, and the chain of tasks off each css_set.
*
+ * css_set_clone_lock synchronizes access to task->cgroups and cgroup->kill_seq
+ * instead of css_set_lock on the clone path.
+ *
* These locks are exported if CONFIG_PROVE_RCU so that accessors in
* cgroup.h can use them for lockdep annotations.
*/
DEFINE_MUTEX(cgroup_mutex);
-DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_RWLOCK(css_set_clone_lock);
+
#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
EXPORT_SYMBOL_GPL(cgroup_mutex);
EXPORT_SYMBOL_GPL(css_set_lock);
+EXPORT_SYMBOL_GPL(css_set_clone_lock);
#endif
struct blocking_notifier_head cgroup_lifetime_notifier =
@@ -901,6 +907,10 @@ static void css_set_skip_task_iters(struct css_set *cset,
css_task_iter_skip(it, task);
}
+enum css_set_move_flags {
+ CSET_MOVE_USE_MG_TASKS = (1 << 0),
+ CSET_MOVE_SKIP_LOCK = (1 << 1),
+};
/**
* css_set_move_task - move a task from one css_set to another
* @task: task being moved
@@ -918,13 +928,23 @@ static void css_set_skip_task_iters(struct css_set *cset,
*/
static void css_set_move_task(struct task_struct *task,
struct css_set *from_cset, struct css_set *to_cset,
- bool use_mg_tasks)
+ enum css_set_move_flags flags)
{
+ /*
+ * A task cannot be moving itself and cloning at the same time, and
+ * disassociation doesn't modify task->cgroups in a way that's relevant
+ * for clone.
+ */
+ bool skip_clone_lock = task == current || to_cset == NULL;
+ skip_clone_lock |= flags & CSET_MOVE_SKIP_LOCK;
+
lockdep_assert_held(&css_set_lock);
if (to_cset && !css_set_populated(to_cset))
css_set_update_populated(to_cset, true);
+ if (!skip_clone_lock)
+ write_lock(&css_set_clone_lock);
+
if (from_cset) {
WARN_ON_ONCE(list_empty(&task->cg_list));
@@ -946,9 +966,13 @@ static void css_set_move_task(struct task_struct *task,
WARN_ON_ONCE(task->flags & PF_EXITING);
cgroup_move_task(task, to_cset);
- list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
- &to_cset->tasks);
+ list_add_tail(&task->cg_list,
+	       (flags & CSET_MOVE_USE_MG_TASKS) ? &to_cset->mg_tasks :
+	       &to_cset->tasks);
}
+
+ if (!skip_clone_lock)
+ write_unlock(&css_set_clone_lock);
}
/*
@@ -2723,7 +2747,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
get_css_set(to_cset);
to_cset->nr_tasks++;
- css_set_move_task(task, from_cset, to_cset, true);
+ css_set_move_task(task, from_cset, to_cset, CSET_MOVE_USE_MG_TASKS);
from_cset->nr_tasks--;
/*
* If the source or destination cgroup is frozen,
@@ -4183,7 +4207,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
+ write_lock(&css_set_clone_lock);
cgrp->kill_seq++;
+ write_unlock(&css_set_clone_lock);
spin_unlock_irq(&css_set_lock);
css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -6696,14 +6722,15 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
cgroup_threadgroup_change_begin(current);
- spin_lock_irq(&css_set_lock);
+ read_lock(&css_set_clone_lock);
+
cset = task_css_set(current);
get_css_set(cset);
if (kargs->cgrp)
kargs->kill_seq = kargs->cgrp->kill_seq;
else
kargs->kill_seq = cset->dfl_cgrp->kill_seq;
- spin_unlock_irq(&css_set_lock);
+ read_unlock(&css_set_clone_lock);
if (!(kargs->flags & CLONE_INTO_CGROUP)) {
kargs->cset = cset;
@@ -6893,6 +6920,7 @@ void cgroup_post_fork(struct task_struct *child,
cset = kargs->cset;
kargs->cset = NULL;
+ // XXX could this be read_lock(css_set_clone_lock) ?
spin_lock_irq(&css_set_lock);
/* init tasks are special, only link regular threads */
@@ -6907,7 +6935,8 @@ void cgroup_post_fork(struct task_struct *child,
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
- css_set_move_task(child, NULL, cset, false);
+ /* child cannot run (another) clone, skip lock */
+ css_set_move_task(child, NULL, cset, CSET_MOVE_SKIP_LOCK);
} else {
put_css_set(cset);
cset = NULL;
@@ -6995,7 +7024,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
WARN_ON_ONCE(list_empty(&tsk->cg_list));
cset = task_css_set(tsk);
- css_set_move_task(tsk, cset, NULL, false);
+ css_set_move_task(tsk, cset, NULL, CSET_MOVE_SKIP_LOCK);
cset->nr_tasks--;
/* matches the signal->live check in css_task_iter_advance() */
if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))