Message-ID: <26471ac4-457e-4090-b1f6-275478dacec9@huaweicloud.com>
Date: Fri, 5 Sep 2025 10:02:36 +0800
From: Chen Ridong <chenridong@...weicloud.com>
To: Yi Tao <escape@...ux.alibaba.com>, tj@...nel.org, hannes@...xchg.org,
 mkoutny@...e.com
Cc: cgroups@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 1/1] cgroup: replace global percpu_rwsem with
 signal_struct->group_rwsem when writing cgroup.procs/threads



On 2025/9/4 19:39, Yi Tao wrote:
> As computer hardware advances, modern systems are typically equipped
> with many CPU cores and large amounts of memory, enabling the deployment
> of numerous applications. On such systems, container creation and
> deletion become frequent operations, making cgroup process migration no
> longer a cold path. This leads to noticeable contention with common
> process operations such as fork, exec, and exit.
> 
> To alleviate the contention between cgroup process migration and
> operations like process fork, this patch changes the locking to take the
> write lock on signal_struct->group_rwsem when writing a pid to
> cgroup.procs/threads, instead of holding the global write lock.
> 
> Cgroup process migration has historically relied on
> signal_struct->group_rwsem to protect thread group integrity. In commit
> 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with
> a global percpu_rwsem"), this was changed to a global
> cgroup_threadgroup_rwsem. The advantage of using a global lock was
> simplified handling of process group migrations. This patch retains the
> use of the global lock for protecting process group migration, while
> reducing contention by using a per-thread-group lock during
> cgroup.procs/threads writes.
> 
> The locking behavior is as follows:
> 
> write cgroup.procs/threads  | process fork,exec,exit | process group migration
> ------------------------------------------------------------------------------
> cgroup_lock()               | down_read(&g_rwsem)    | cgroup_lock()
> down_write(&p_rwsem)        | down_read(&p_rwsem)    | down_write(&g_rwsem)
> critical section            | critical section       | critical section
> up_write(&p_rwsem)          | up_read(&p_rwsem)      | up_write(&g_rwsem)
> cgroup_unlock()             | up_read(&g_rwsem)      | cgroup_unlock()
> 
> g_rwsem denotes cgroup_threadgroup_rwsem, p_rwsem denotes
> signal_struct->group_rwsem.
> 
> This patch eliminates contention between cgroup migration and fork
> operations for threads that belong to different thread groups, thereby
> reducing the long-tail latency of cgroup migrations and lowering system
> load.
> 
> To avoid affecting other users, the per-thread-group rwsem is only used
> when the `favordynmods` flag is enabled.
> 
> Signed-off-by: Yi Tao <escape@...ux.alibaba.com>
> ---
>  include/linux/cgroup-defs.h     |  6 +++
>  include/linux/sched/signal.h    |  4 ++
>  init/init_task.c                |  3 ++
>  kernel/cgroup/cgroup-internal.h |  4 +-
>  kernel/cgroup/cgroup-v1.c       |  8 ++--
>  kernel/cgroup/cgroup.c          | 72 +++++++++++++++++++--------------
>  kernel/fork.c                   |  4 ++
>  7 files changed, 64 insertions(+), 37 deletions(-)
> 
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 6b93a64115fe..0c068dc3e08d 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -828,6 +828,8 @@ struct cgroup_of_peak {
>  	struct list_head	list;
>  };
>  
> +extern bool have_favordynmods;
> +
>  /**
>   * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
>   * @tsk: target task
> @@ -838,6 +840,8 @@ struct cgroup_of_peak {
>  static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
>  {
>  	percpu_down_read(&cgroup_threadgroup_rwsem);
> +	if (have_favordynmods)
> +		down_read(&tsk->signal->group_rwsem);
>  }
>  
>  /**
> @@ -848,6 +852,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
>   */
>  static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
>  {
> +	if (have_favordynmods)
> +		up_read(&tsk->signal->group_rwsem);
>  	percpu_up_read(&cgroup_threadgroup_rwsem);
>  }
>  
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 1ef1edbaaf79..86fbc99a9174 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -226,6 +226,10 @@ struct signal_struct {
>  	struct tty_audit_buf *tty_audit_buf;
>  #endif
>  
> +#ifdef CONFIG_CGROUPS
> +	struct rw_semaphore group_rwsem;
> +#endif
> +
>  	/*
>  	 * Thread is the potential origin of an oom condition; kill first on
>  	 * oom
> diff --git a/init/init_task.c b/init/init_task.c
> index e557f622bd90..0450093924a7 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -27,6 +27,9 @@ static struct signal_struct init_signals = {
>  	},
>  	.multiprocess	= HLIST_HEAD_INIT,
>  	.rlim		= INIT_RLIMITS,
> +#ifdef CONFIG_CGROUPS
> +	.group_rwsem	= __RWSEM_INITIALIZER(init_signals.group_rwsem),
> +#endif
>  	.cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
>  	.exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
>  #ifdef CONFIG_POSIX_TIMERS
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index b14e61c64a34..35005543f0c7 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -249,8 +249,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
>  
>  int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
>  		       bool threadgroup);
> -void cgroup_attach_lock(bool lock_threadgroup);
> -void cgroup_attach_unlock(bool lock_threadgroup);
> +void cgroup_attach_lock(struct task_struct *tsk, bool lock_threadgroup);
> +void cgroup_attach_unlock(struct task_struct *tsk, bool lock_threadgroup);
>  struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
>  					     bool *locked)
>  	__acquires(&cgroup_threadgroup_rwsem);
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index 2a4a387f867a..9f1a4d1fc741 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -68,7 +68,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
>  	int retval = 0;
>  
>  	cgroup_lock();
> -	cgroup_attach_lock(true);
> +	cgroup_attach_lock(NULL, true);
>  	for_each_root(root) {
>  		struct cgroup *from_cgrp;
>  
> @@ -80,7 +80,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
>  		if (retval)
>  			break;
>  	}
> -	cgroup_attach_unlock(true);
> +	cgroup_attach_unlock(NULL, true);
>  	cgroup_unlock();
>  
>  	return retval;
> @@ -117,7 +117,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
>  
>  	cgroup_lock();
>  
> -	cgroup_attach_lock(true);
> +	cgroup_attach_lock(NULL, true);
>  
>  	/* all tasks in @from are being moved, all csets are source */
>  	spin_lock_irq(&css_set_lock);
> @@ -153,7 +153,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
>  	} while (task && !ret);
>  out_err:
>  	cgroup_migrate_finish(&mgctx);
> -	cgroup_attach_unlock(true);
> +	cgroup_attach_unlock(NULL, true);
>  	cgroup_unlock();
>  	return ret;
>  }
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 312c6a8b55bb..22b1659b623c 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -214,7 +214,7 @@ static u16 have_exit_callback __read_mostly;
>  static u16 have_release_callback __read_mostly;
>  static u16 have_canfork_callback __read_mostly;
>  
> -static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
> +bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
>  

If have_favordynmods is gating the acquisition of tsk->signal->group_rwsem
on the fork/exec/exit hot paths, would it be better to replace it with a
static key?
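
Something like this rough, untested sketch ("cgroup_favordynmods_key" is a
placeholder name here, and the enable call would go wherever favordynmods
is toggled today):

	/* include/linux/cgroup-defs.h (needs <linux/jump_label.h>) */
	DECLARE_STATIC_KEY_FALSE(cgroup_favordynmods_key);

	static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
	{
		percpu_down_read(&cgroup_threadgroup_rwsem);
		/* patched out at runtime while favordynmods is off */
		if (static_branch_unlikely(&cgroup_favordynmods_key))
			down_read(&tsk->signal->group_rwsem);
	}

	/* kernel/cgroup/cgroup.c */
	DEFINE_STATIC_KEY_FALSE(cgroup_favordynmods_key);

	/* when favordynmods gets enabled: */
	static_branch_enable(&cgroup_favordynmods_key);

That would keep the extra load and branch off the fork/exec/exit fast
paths whenever favordynmods is disabled.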

>  /* cgroup namespace for init task */
>  struct cgroup_namespace init_cgroup_ns = {
> @@ -2459,7 +2459,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
>  
>  /**
>   * cgroup_attach_lock - Lock for ->attach()
> - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
> + * @tsk: thread group to lock
> + * @lock_threadgroup: whether to down_write the threadgroup rwsem
>   *
>   * cgroup migration sometimes needs to stabilize threadgroups against forks and
>   * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
> @@ -2480,21 +2481,30 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
>   * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
>   * CPU hotplug is disabled on entry.
>   */
> -void cgroup_attach_lock(bool lock_threadgroup)
> +void cgroup_attach_lock(struct task_struct *tsk, bool lock_threadgroup)
>  {
>  	cpus_read_lock();
> -	if (lock_threadgroup)
> -		percpu_down_write(&cgroup_threadgroup_rwsem);
> +	if (lock_threadgroup) {
> +		if (tsk && have_favordynmods)
> +			down_write(&tsk->signal->group_rwsem);
> +		else
> +			percpu_down_write(&cgroup_threadgroup_rwsem);
> +	}
>  }
>  
>  /**
>   * cgroup_attach_unlock - Undo cgroup_attach_lock()
> - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
> + * @tsk: thread group to unlock
> + * @lock_threadgroup: whether to up_write the threadgroup rwsem
>   */
> -void cgroup_attach_unlock(bool lock_threadgroup)
> +void cgroup_attach_unlock(struct task_struct *tsk, bool lock_threadgroup)
>  {
> -	if (lock_threadgroup)
> -		percpu_up_write(&cgroup_threadgroup_rwsem);
> +	if (lock_threadgroup) {
> +		if (tsk && have_favordynmods)
> +			up_write(&tsk->signal->group_rwsem);
> +		else
> +			percpu_up_write(&cgroup_threadgroup_rwsem);
> +	}
>  	cpus_read_unlock();
>  }
>  
> @@ -2976,24 +2986,12 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
>  	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
>  		return ERR_PTR(-EINVAL);
>  
> -	/*
> -	 * If we migrate a single thread, we don't care about threadgroup
> -	 * stability. If the thread is `current`, it won't exit(2) under our
> -	 * hands or change PID through exec(2). We exclude
> -	 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
> -	 * callers by cgroup_mutex.
> -	 * Therefore, we can skip the global lock.
> -	 */
> -	lockdep_assert_held(&cgroup_mutex);
> -	*threadgroup_locked = pid || threadgroup;
> -	cgroup_attach_lock(*threadgroup_locked);
> -
>  	rcu_read_lock();
>  	if (pid) {
>  		tsk = find_task_by_vpid(pid);
>  		if (!tsk) {
>  			tsk = ERR_PTR(-ESRCH);
> -			goto out_unlock_threadgroup;
> +			goto out_unlock_rcu;
>  		}
>  	} else {
>  		tsk = current;
> @@ -3010,15 +3008,27 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
>  	 */
>  	if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
>  		tsk = ERR_PTR(-EINVAL);
> -		goto out_unlock_threadgroup;
> +		goto out_unlock_rcu;
>  	}
> -
>  	get_task_struct(tsk);
> -	goto out_unlock_rcu;
>  
> -out_unlock_threadgroup:
> -	cgroup_attach_unlock(*threadgroup_locked);
> -	*threadgroup_locked = false;
> +	rcu_read_unlock();
> +
> +	/*
> +	 * If we migrate a single thread, we don't care about threadgroup
> +	 * stability. If the thread is `current`, it won't exit(2) under our
> +	 * hands or change PID through exec(2). We exclude
> +	 * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
> +	 * callers by cgroup_mutex.
> +	 * Therefore, we can skip the global lock.
> +	 */
> +	lockdep_assert_held(&cgroup_mutex);
> +	*threadgroup_locked = pid || threadgroup;
> +
> +	cgroup_attach_lock(tsk, *threadgroup_locked);
> +
> +	return tsk;
> +
>  out_unlock_rcu:
>  	rcu_read_unlock();
>  	return tsk;
> @@ -3032,7 +3042,7 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked
>  	/* release reference from cgroup_procs_write_start() */
>  	put_task_struct(task);
>  
> -	cgroup_attach_unlock(threadgroup_locked);
> +	cgroup_attach_unlock(task, threadgroup_locked);
>  
>  	for_each_subsys(ss, ssid)
>  		if (ss->post_attach)
> @@ -3119,7 +3129,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
>  	 * write-locking can be skipped safely.
>  	 */
>  	has_tasks = !list_empty(&mgctx.preloaded_src_csets);
> -	cgroup_attach_lock(has_tasks);
> +	cgroup_attach_lock(NULL, has_tasks);
>  
>  	/* NULL dst indicates self on default hierarchy */
>  	ret = cgroup_migrate_prepare_dst(&mgctx);
> @@ -3140,7 +3150,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
>  	ret = cgroup_migrate_execute(&mgctx);
>  out_finish:
>  	cgroup_migrate_finish(&mgctx);
> -	cgroup_attach_unlock(has_tasks);
> +	cgroup_attach_unlock(NULL, has_tasks);
>  	return ret;
>  }
>  
> diff --git a/kernel/fork.c b/kernel/fork.c
> index af673856499d..5218f9b93c77 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1688,6 +1688,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>  	tty_audit_fork(sig);
>  	sched_autogroup_fork(sig);
>  
> +#ifdef CONFIG_CGROUPS
> +	init_rwsem(&sig->group_rwsem);
> +#endif
> +
>  	sig->oom_score_adj = current->signal->oom_score_adj;
>  	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>  

-- 
Best regards,
Ridong

