Message-ID: <26471ac4-457e-4090-b1f6-275478dacec9@huaweicloud.com>
Date: Fri, 5 Sep 2025 10:02:36 +0800
From: Chen Ridong <chenridong@...weicloud.com>
To: Yi Tao <escape@...ux.alibaba.com>, tj@...nel.org, hannes@...xchg.org,
mkoutny@...e.com
Cc: cgroups@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 1/1] cgroup: replace global percpu_rwsem with
signal_struct->group_rwsem when writing cgroup.procs/threads
On 2025/9/4 19:39, Yi Tao wrote:
> As computer hardware advances, modern systems are typically equipped
> with many CPU cores and large amounts of memory, enabling the deployment
> of numerous applications. On such systems, container creation and
> deletion become frequent operations, making cgroup process migration no
> longer a cold path. This leads to noticeable contention with common
> process operations such as fork, exec, and exit.
>
> To alleviate the contention between cgroup process migration and
> operations like process fork, this patch changes the locking so that
> writing a pid to cgroup.procs/threads takes the write lock on
> signal_struct->group_rwsem instead of a global write lock.
>
> Cgroup process migration has historically relied on
> signal_struct->group_rwsem to protect thread group integrity. In commit
> 1ed1328792ff ("sched, cgroup: replace signal_struct->group_rwsem with
> a global percpu_rwsem"), this was changed to a global
> cgroup_threadgroup_rwsem. The advantage of using a global lock was
> simplified handling of process group migrations. This patch retains the
> use of the global lock for protecting process group migration, while
> reducing contention by using a per-thread-group lock during
> cgroup.procs/threads writes.
>
> The locking behavior is as follows:
>
> write cgroup.procs/threads | process fork,exec,exit | process group migration
> ------------------------------------------------------------------------------
> cgroup_lock()              | down_read(&g_rwsem)    | cgroup_lock()
> down_write(&p_rwsem)       | down_read(&p_rwsem)    | down_write(&g_rwsem)
> critical section           | critical section       | critical section
> up_write(&p_rwsem)         | up_read(&p_rwsem)      | up_write(&g_rwsem)
> cgroup_unlock()            | up_read(&g_rwsem)      | cgroup_unlock()
>
> g_rwsem denotes cgroup_threadgroup_rwsem, p_rwsem denotes
> signal_struct->group_rwsem.
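
To make the pairing above concrete, the two hot paths reduce to the
following under favordynmods (a sketch of the pattern in the table,
reusing the patch's names; the *_sketch functions are illustrative
only, not the exact patch code):

#include <linux/cgroup.h>
#include <linux/cgroup-defs.h>
#include <linux/sched/signal.h>

/* fork/exec/exit side: both locks taken for read, the global one first */
static void threadgroup_change_begin_sketch(struct task_struct *tsk)
{
	percpu_down_read(&cgroup_threadgroup_rwsem);	/* g_rwsem */
	down_read(&tsk->signal->group_rwsem);		/* p_rwsem */
}

/* cgroup.procs/threads writer: write-locks only the target group's
 * p_rwsem, so forks in unrelated thread groups are not blocked */
static void procs_write_lock_sketch(struct task_struct *tsk)
{
	cgroup_lock();					/* cgroup_mutex */
	down_write(&tsk->signal->group_rwsem);		/* p_rwsem */
}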
>
> This patch eliminates contention between cgroup migration and fork
> operations for threads that belong to different thread groups, thereby
> reducing the long-tail latency of cgroup migrations and lowering system
> load.
>
> To avoid affecting other users, the per-thread-group rwsem is only used
> when the `favordynmods` flag is enabled.
>
> Signed-off-by: Yi Tao <escape@...ux.alibaba.com>
> ---
> include/linux/cgroup-defs.h | 6 +++
> include/linux/sched/signal.h | 4 ++
> init/init_task.c | 3 ++
> kernel/cgroup/cgroup-internal.h | 4 +-
> kernel/cgroup/cgroup-v1.c | 8 ++--
> kernel/cgroup/cgroup.c | 72 +++++++++++++++++++--------------
> kernel/fork.c | 4 ++
> 7 files changed, 64 insertions(+), 37 deletions(-)
>
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 6b93a64115fe..0c068dc3e08d 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -828,6 +828,8 @@ struct cgroup_of_peak {
> struct list_head list;
> };
>
> +extern bool have_favordynmods;
> +
> /**
> * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
> * @tsk: target task
> @@ -838,6 +840,8 @@ struct cgroup_of_peak {
> static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
> {
> percpu_down_read(&cgroup_threadgroup_rwsem);
> + if (have_favordynmods)
> + down_read(&tsk->signal->group_rwsem);
> }
>
> /**
> @@ -848,6 +852,8 @@ static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
> */
> static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
> {
> + if (have_favordynmods)
> + up_read(&tsk->signal->group_rwsem);
> percpu_up_read(&cgroup_threadgroup_rwsem);
> }
>
> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
> index 1ef1edbaaf79..86fbc99a9174 100644
> --- a/include/linux/sched/signal.h
> +++ b/include/linux/sched/signal.h
> @@ -226,6 +226,10 @@ struct signal_struct {
> struct tty_audit_buf *tty_audit_buf;
> #endif
>
> +#ifdef CONFIG_CGROUPS
> + struct rw_semaphore group_rwsem;
> +#endif
> +
> /*
> * Thread is the potential origin of an oom condition; kill first on
> * oom
> diff --git a/init/init_task.c b/init/init_task.c
> index e557f622bd90..0450093924a7 100644
> --- a/init/init_task.c
> +++ b/init/init_task.c
> @@ -27,6 +27,9 @@ static struct signal_struct init_signals = {
> },
> .multiprocess = HLIST_HEAD_INIT,
> .rlim = INIT_RLIMITS,
> +#ifdef CONFIG_CGROUPS
> + .group_rwsem = __RWSEM_INITIALIZER(init_signals.group_rwsem),
> +#endif
> .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
> .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock),
> #ifdef CONFIG_POSIX_TIMERS
> diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
> index b14e61c64a34..35005543f0c7 100644
> --- a/kernel/cgroup/cgroup-internal.h
> +++ b/kernel/cgroup/cgroup-internal.h
> @@ -249,8 +249,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
>
> int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
> bool threadgroup);
> -void cgroup_attach_lock(bool lock_threadgroup);
> -void cgroup_attach_unlock(bool lock_threadgroup);
> +void cgroup_attach_lock(struct task_struct *tsk, bool lock_threadgroup);
> +void cgroup_attach_unlock(struct task_struct *tsk, bool lock_threadgroup);
> struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
> bool *locked)
> __acquires(&cgroup_threadgroup_rwsem);
> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
> index 2a4a387f867a..9f1a4d1fc741 100644
> --- a/kernel/cgroup/cgroup-v1.c
> +++ b/kernel/cgroup/cgroup-v1.c
> @@ -68,7 +68,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
> int retval = 0;
>
> cgroup_lock();
> - cgroup_attach_lock(true);
> + cgroup_attach_lock(NULL, true);
> for_each_root(root) {
> struct cgroup *from_cgrp;
>
> @@ -80,7 +80,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
> if (retval)
> break;
> }
> - cgroup_attach_unlock(true);
> + cgroup_attach_unlock(NULL, true);
> cgroup_unlock();
>
> return retval;
> @@ -117,7 +117,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
>
> cgroup_lock();
>
> - cgroup_attach_lock(true);
> + cgroup_attach_lock(NULL, true);
>
> /* all tasks in @from are being moved, all csets are source */
> spin_lock_irq(&css_set_lock);
> @@ -153,7 +153,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
> } while (task && !ret);
> out_err:
> cgroup_migrate_finish(&mgctx);
> - cgroup_attach_unlock(true);
> + cgroup_attach_unlock(NULL, true);
> cgroup_unlock();
> return ret;
> }
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index 312c6a8b55bb..22b1659b623c 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -214,7 +214,7 @@ static u16 have_exit_callback __read_mostly;
> static u16 have_release_callback __read_mostly;
> static u16 have_canfork_callback __read_mostly;
>
> -static bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
> +bool have_favordynmods __ro_after_init = IS_ENABLED(CONFIG_CGROUP_FAVOR_DYNMODS);
>
If have_favordynmods is gating the acquisition of tsk->signal->group_rwsem, would it be better to
replace it with a static key?
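
For reference, a minimal sketch of that suggestion (the key name and
the enable helper below are hypothetical, not taken from the patch):

#include <linux/cgroup-defs.h>
#include <linux/jump_label.h>
#include <linux/sched/signal.h>

DEFINE_STATIC_KEY_FALSE(cgroup_favordynmods_key);

static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
{
	percpu_down_read(&cgroup_threadgroup_rwsem);
	/* no conditional branch at runtime unless the key was enabled */
	if (static_branch_unlikely(&cgroup_favordynmods_key))
		down_read(&tsk->signal->group_rwsem);
}

static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
{
	if (static_branch_unlikely(&cgroup_favordynmods_key))
		up_read(&tsk->signal->group_rwsem);
	percpu_up_read(&cgroup_threadgroup_rwsem);
}

/* called once wherever favordynmods is switched on */
static void cgroup_favordynmods_key_enable(void)
{
	static_branch_enable(&cgroup_favordynmods_key);
}

Since have_favordynmods is __ro_after_init here, the key could be
flipped during early init; if it ever became runtime-switchable, the
flip would have to happen while no task sits between _begin and _end,
or _end could release a lock that _begin never took.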
> /* cgroup namespace for init task */
> struct cgroup_namespace init_cgroup_ns = {
> @@ -2459,7 +2459,8 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
>
> /**
> * cgroup_attach_lock - Lock for ->attach()
> - * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
> + * @tsk: thread group to lock
> + * @lock_threadgroup: whether to down_write the rwsem
> *
> * cgroup migration sometimes needs to stabilize threadgroups against forks and
> * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
> @@ -2480,21 +2481,30 @@ EXPORT_SYMBOL_GPL(cgroup_path_ns);
> * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
> * CPU hotplug is disabled on entry.
> */
> -void cgroup_attach_lock(bool lock_threadgroup)
> +void cgroup_attach_lock(struct task_struct *tsk, bool lock_threadgroup)
> {
> cpus_read_lock();
> - if (lock_threadgroup)
> - percpu_down_write(&cgroup_threadgroup_rwsem);
> + if (lock_threadgroup) {
> +	if (tsk && have_favordynmods)
> + down_write(&tsk->signal->group_rwsem);
> + else
> + percpu_down_write(&cgroup_threadgroup_rwsem);
> + }
> }
>
> /**
> * cgroup_attach_unlock - Undo cgroup_attach_lock()
> - * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
> + * @tsk: thread group to unlock
> + * @lock_threadgroup: whether to up_write the rwsem
> */
> -void cgroup_attach_unlock(bool lock_threadgroup)
> +void cgroup_attach_unlock(struct task_struct *tsk, bool lock_threadgroup)
> {
> - if (lock_threadgroup)
> - percpu_up_write(&cgroup_threadgroup_rwsem);
> + if (lock_threadgroup) {
> +	if (tsk && have_favordynmods)
> + up_write(&tsk->signal->group_rwsem);
> + else
> + percpu_up_write(&cgroup_threadgroup_rwsem);
> + }
> cpus_read_unlock();
> }
>
> @@ -2976,24 +2986,12 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
> if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
> return ERR_PTR(-EINVAL);
>
> - /*
> - * If we migrate a single thread, we don't care about threadgroup
> - * stability. If the thread is `current`, it won't exit(2) under our
> - * hands or change PID through exec(2). We exclude
> - * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
> - * callers by cgroup_mutex.
> - * Therefore, we can skip the global lock.
> - */
> - lockdep_assert_held(&cgroup_mutex);
> - *threadgroup_locked = pid || threadgroup;
> - cgroup_attach_lock(*threadgroup_locked);
> -
> rcu_read_lock();
> if (pid) {
> tsk = find_task_by_vpid(pid);
> if (!tsk) {
> tsk = ERR_PTR(-ESRCH);
> - goto out_unlock_threadgroup;
> + goto out_unlock_rcu;
> }
> } else {
> tsk = current;
> @@ -3010,15 +3008,27 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
> */
> if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
> tsk = ERR_PTR(-EINVAL);
> - goto out_unlock_threadgroup;
> + goto out_unlock_rcu;
> }
> -
> get_task_struct(tsk);
> - goto out_unlock_rcu;
>
> -out_unlock_threadgroup:
> - cgroup_attach_unlock(*threadgroup_locked);
> - *threadgroup_locked = false;
> + rcu_read_unlock();
> +
> + /*
> + * If we migrate a single thread, we don't care about threadgroup
> + * stability. If the thread is `current`, it won't exit(2) under our
> + * hands or change PID through exec(2). We exclude
> + * cgroup_update_dfl_csses and other cgroup_{proc,thread}s_write
> + * callers by cgroup_mutex.
> + * Therefore, we can skip the global lock.
> + */
> + lockdep_assert_held(&cgroup_mutex);
> + *threadgroup_locked = pid || threadgroup;
> +
> + cgroup_attach_lock(tsk, *threadgroup_locked);
> +
> + return tsk;
> +
> out_unlock_rcu:
> rcu_read_unlock();
> return tsk;
> @@ -3032,7 +3042,7 @@ void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked
> /* release reference from cgroup_procs_write_start() */
> put_task_struct(task);
>
> - cgroup_attach_unlock(threadgroup_locked);
> + cgroup_attach_unlock(task, threadgroup_locked);
>
> for_each_subsys(ss, ssid)
> if (ss->post_attach)
> @@ -3119,7 +3129,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
> * write-locking can be skipped safely.
> */
> has_tasks = !list_empty(&mgctx.preloaded_src_csets);
> - cgroup_attach_lock(has_tasks);
> + cgroup_attach_lock(NULL, has_tasks);
>
> /* NULL dst indicates self on default hierarchy */
> ret = cgroup_migrate_prepare_dst(&mgctx);
> @@ -3140,7 +3150,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
> ret = cgroup_migrate_execute(&mgctx);
> out_finish:
> cgroup_migrate_finish(&mgctx);
> - cgroup_attach_unlock(has_tasks);
> + cgroup_attach_unlock(NULL, has_tasks);
> return ret;
> }
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index af673856499d..5218f9b93c77 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1688,6 +1688,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
> tty_audit_fork(sig);
> sched_autogroup_fork(sig);
>
> +#ifdef CONFIG_CGROUPS
> + init_rwsem(&sig->group_rwsem);
> +#endif
> +
> sig->oom_score_adj = current->signal->oom_score_adj;
> sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>
--
Best regards,
Ridong