lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <roisfgpkd7tapp7cfjavmih2e2riwh2nczv4nqk25gik7of4pa@3ohyptw6nvb3>
Date: Thu, 29 Jan 2026 14:22:32 +0100
From: Michal Koutný <mkoutny@...e.com>
To: Mateusz Guzik <mjguzik@...il.com>
Cc: tj@...nel.org, hannes@...xchg.org, brauner@...nel.org, 
	linux-kernel@...r.kernel.org, cgroups@...r.kernel.org
Subject: Re: [PATCH v2] cgroup: avoid css_set_lock in cgroup_css_set_fork()

On Tue, Jan 27, 2026 at 07:18:44PM +0100, Mateusz Guzik <mjguzik@...il.com> wrote:
> Not sure what you mean here.

Let me add a patch for illustration.
Does that clarify? (How would that change your watched metric?)

...
> It may need some tidy ups but for the purpose of this discussion we
> can pretend it landed.

That I follow.

> clone + exit codepaths are globally serialized as follows:
> - pidmap -- once per side
> - tasklist -- once on clone, twice on exit
> - cgroups -- twice on clone, once on exit
> - some lock for random harvesting, can probably just go
...
> So ye, css very much can be considered a problem here.

Acknowledged.

> 
> This comment:
> > - effect of css_set_lock in cgroup_post_fork().
> 
> ... I don't get whatsoever.

I meant that css_set_lock is taken 2nd time in cgroup_post_fork() (also
after this rework it remains there).

And I'm wondering whether removing the lock only in cgroup_css_set_fork()
actually improves parallelism: before the patch, tasks queue on the first
css_set_lock acquisition and are serialized through that first critical
section, so by the time they reach the second critical section in
cgroup_post_fork() their arrival rate is already reduced. Hence the 2nd
pass through the critical section should be less contended (i.e. without
waiting).

I understand it's good to reduce the overall hold time of (every
mentioned) lock, but I'm unsure how much it helps to go from two
css_set_lock passes down to one on the clone path.

> 
> Stability of cgroup placement aside, to my reading the lock is needed
> in part to serialize addition of the task to the cgroup list. No
> matter what this will have to be serialized both ways with the same
> thing.

Yes, the modification of css_set->tasks list still needs css_set_lock in
post fork.

> 
> Perhaps said stability can be assured in other ways and the list can
> be decomposed, but that's some complexity which is not warranted.

The decomposition is not obvious to me :-/


--- 8< ---
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bc892e3b37eea..a176fd60ba08f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -413,11 +413,13 @@ static inline void cgroup_unlock(void)
  * as locks used during the cgroup_subsys::attach() methods.
  */
 #ifdef CONFIG_PROVE_RCU
+extern rwlock_t css_set_clone_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
 		rcu_read_lock_sched_held() ||				\
 		lockdep_is_held(&cgroup_mutex) ||			\
 		lockdep_is_held(&css_set_lock) ||			\
+		lockdep_is_held(&css_set_clone_lock) ||			\
 		((task)->flags & PF_EXITING) || (__c))
 #else
 #define task_css_set_check(task, __c)					\
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 5f0d33b049102..4e28e922e5668 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -83,15 +83,21 @@
  * css_set_lock protects task->cgroups pointer, the list of css_set
  * objects, and the chain of tasks off each css_set.
  *
+ * css_set_clone_lock synchronizes access to task->cgroups and cgroup->kill_seq
+ * instead of css_set_lock on clone path
+ *
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
  * cgroup.h can use them for lockdep annotations.
  */
 DEFINE_MUTEX(cgroup_mutex);
-DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_SPINLOCK(css_set_lock);
+__cacheline_aligned DEFINE_RWLOCK(css_set_clone_lock);
+
 
 #if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP)
 EXPORT_SYMBOL_GPL(cgroup_mutex);
 EXPORT_SYMBOL_GPL(css_set_lock);
+EXPORT_SYMBOL_GPL(css_set_clone_lock);
 #endif
 
 struct blocking_notifier_head cgroup_lifetime_notifier =
@@ -901,6 +907,10 @@ static void css_set_skip_task_iters(struct css_set *cset,
 		css_task_iter_skip(it, task);
 }
 
+enum css_set_move_flags {
+	CSET_MOVE_USE_MG_TASKS	= (1 << 0),
+	CSET_MOVE_SKIP_LOCK	= (1 << 1),
+};
 /**
  * css_set_move_task - move a task from one css_set to another
  * @task: task being moved
@@ -918,13 +928,23 @@ static void css_set_skip_task_iters(struct css_set *cset,
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
-			      bool use_mg_tasks)
+			      enum css_set_move_flags flags)
 {
+	/*
+	 * Self task cannot move and clone concurrently and disassociation
+	 * doesn't modify task->cgroups that's relevant for clone
+	 */
+	bool skip_clone_lock = task == current || to_cset == NULL;
+	skip_clone_lock |= flags & CSET_MOVE_SKIP_LOCK;
+
 	lockdep_assert_held(&css_set_lock);
 
 	if (to_cset && !css_set_populated(to_cset))
 		css_set_update_populated(to_cset, true);
 
+	if (!skip_clone_lock)
+		write_lock(&css_set_clone_lock);
+
 	if (from_cset) {
 		WARN_ON_ONCE(list_empty(&task->cg_list));
 
@@ -946,9 +966,13 @@ static void css_set_move_task(struct task_struct *task,
 		WARN_ON_ONCE(task->flags & PF_EXITING);
 
 		cgroup_move_task(task, to_cset);
-		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
-							     &to_cset->tasks);
+		list_add_tail(&task->cg_list,
+			      (flags & CSET_MOVE_USE_MG_TASKS) ? &to_cset->mg_tasks :
+								 &to_cset->tasks);
 	}
+
+	if (!skip_clone_lock)
+		write_unlock(&css_set_clone_lock);
 }
 
 /*
@@ -2723,7 +2747,7 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
 
 			get_css_set(to_cset);
 			to_cset->nr_tasks++;
-			css_set_move_task(task, from_cset, to_cset, true);
+			css_set_move_task(task, from_cset, to_cset, CSET_MOVE_USE_MG_TASKS);
 			from_cset->nr_tasks--;
 			/*
 			 * If the source or destination cgroup is frozen,
@@ -4183,7 +4207,9 @@ static void __cgroup_kill(struct cgroup *cgrp)
 	lockdep_assert_held(&cgroup_mutex);
 
 	spin_lock_irq(&css_set_lock);
+	write_lock(&css_set_clone_lock);
 	cgrp->kill_seq++;
+	write_unlock(&css_set_clone_lock);
 	spin_unlock_irq(&css_set_lock);
 
 	css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
@@ -6696,14 +6722,15 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
 
 	cgroup_threadgroup_change_begin(current);
 
-	spin_lock_irq(&css_set_lock);
+	read_lock(&css_set_clone_lock);
+
 	cset = task_css_set(current);
 	get_css_set(cset);
 	if (kargs->cgrp)
 		kargs->kill_seq = kargs->cgrp->kill_seq;
 	else
 		kargs->kill_seq = cset->dfl_cgrp->kill_seq;
-	spin_unlock_irq(&css_set_lock);
+	read_unlock(&css_set_clone_lock);
 
 	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
 		kargs->cset = cset;
@@ -6893,6 +6920,7 @@ void cgroup_post_fork(struct task_struct *child,
 	cset = kargs->cset;
 	kargs->cset = NULL;
 
+	// XXX could this be read_lock(css_set_clone_lock) ?
 	spin_lock_irq(&css_set_lock);
 
 	/* init tasks are special, only link regular threads */
@@ -6907,7 +6935,8 @@ void cgroup_post_fork(struct task_struct *child,
 
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
 		cset->nr_tasks++;
-		css_set_move_task(child, NULL, cset, false);
+		/* child cannot run (another) clone, skip lock */
+		css_set_move_task(child, NULL, cset, CSET_MOVE_SKIP_LOCK);
 	} else {
 		put_css_set(cset);
 		cset = NULL;
@@ -6995,7 +7024,7 @@ static void do_cgroup_task_dead(struct task_struct *tsk)
 
 	WARN_ON_ONCE(list_empty(&tsk->cg_list));
 	cset = task_css_set(tsk);
-	css_set_move_task(tsk, cset, NULL, false);
+	css_set_move_task(tsk, cset, NULL, CSET_MOVE_SKIP_LOCK);
 	cset->nr_tasks--;
 	/* matches the signal->live check in css_task_iter_advance() */
 	if (thread_group_leader(tsk) && atomic_read(&tsk->signal->live))

Download attachment "signature.asc" of type "application/pgp-signature" (266 bytes)

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ