Message-ID: <b8a37b7f-839d-4456-b157-6ea7f59fec1a@redhat.com>
Date: Thu, 4 Sep 2025 11:14:39 -0400
From: Waiman Long <llong@...hat.com>
To: Chuyi Zhou <zhouchuyi@...edance.com>, tj@...nel.org, mkoutny@...e.com,
hannes@...xchg.org
Cc: linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/3] cpuset: Defer flushing of the cpuset_migrate_mm_wq to
task_work
On 9/4/25 3:45 AM, Chuyi Zhou wrote:
> Currently, the cpuset attach path has to wait synchronously for
> flush_workqueue(cpuset_migrate_mm_wq) to complete (via
> cpuset_post_attach()). The execution time of that flush depends on the
> amount of mm migration initiated by cpusets at that time. Modifying
> cpuset.mems of a cgroup that occupies a large amount of memory may
> trigger extensive mm migration, blocking the attach path on
> flush_workqueue() for an extended period. This could be dangerous
> because the flush runs inside the cgroup_mutex critical section, so it
> may ultimately block all cgroup-related operations in the system.
>
> This patch defers the flush_workqueue() operation until the task
> returns to userspace, using a task_work as originally proposed by
> Tejun [1], so that the flush happens after cgroup_mutex has been
> dropped. That way we maintain the operation's synchronicity while not
> bothering anyone else.
>
> [1]: https://lore.kernel.org/cgroups/ZgMFPMjZRZCsq9Q-@slm.duckdns.org/T/#m117f606fa24f66f0823a60f211b36f24bd9e1883
>
> Originally-by: Tejun Heo <tj@...nel.org>
> Signed-off-by: Chuyi Zhou <zhouchuyi@...edance.com>
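
For anyone less familiar with the task_work machinery, here is a
minimal sketch (not part of the patch) of the deferral pattern this
change relies on; expensive_flush() is a hypothetical stand-in for
flush_workqueue(cpuset_migrate_mm_wq):

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/task_work.h>

/*
 * Hypothetical expensive operation we only want to run after all locks
 * held by the caller have been dropped.
 */
extern void expensive_flush(void);

static void deferred_flush_fn(struct callback_head *head)
{
	/*
	 * Invoked from task_work_run() on the task's way back to
	 * userspace, i.e. after cgroup_mutex and any other locks held
	 * at queueing time have been released.
	 */
	expensive_flush();	/* stand-in for the real flush */
	kfree(head);
}

static void defer_expensive_flush(void)
{
	struct callback_head *cb = kzalloc(sizeof(*cb), GFP_KERNEL);

	if (!cb)
		return;		/* best effort: skip the deferred flush */

	init_task_work(cb, deferred_flush_fn);

	/*
	 * TWA_RESUME sets TIF_NOTIFY_RESUME on current, so the callback
	 * runs from resume_user_mode_work() -> task_work_run() just
	 * before the task returns to userspace.  task_work_add() only
	 * fails if the task is already exiting, in which case the
	 * callback_head is freed here instead.
	 */
	if (task_work_add(current, cb, TWA_RESUME))
		kfree(cb);
}

The callback_head has to be heap-allocated because it must outlive the
queueing function's stack frame; it is freed either by the callback
itself after it has run or on the task_work_add() failure path.
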
> ---
> kernel/cgroup/cpuset.c | 29 ++++++++++++++++++++++++-----
> 1 file changed, 24 insertions(+), 5 deletions(-)
>
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 3d8492581c8c4..ceb467079e41f 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -40,6 +40,7 @@
> #include <linux/sched/isolation.h>
> #include <linux/wait.h>
> #include <linux/workqueue.h>
> +#include <linux/task_work.h>
>
> DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
> DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
> @@ -2582,9 +2583,24 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
> }
> }
>
> -static void cpuset_post_attach(void)
> +static void flush_migrate_mm_task_workfn(struct callback_head *head)
> {
> flush_workqueue(cpuset_migrate_mm_wq);
> + kfree(head);
> +}
> +
> +static void schedule_flush_migrate_mm(void)
> +{
> + struct callback_head *flush_cb;
> +
> + flush_cb = kzalloc(sizeof(struct callback_head), GFP_KERNEL);
> + if (!flush_cb)
> + return;
> +
> + init_task_work(flush_cb, flush_migrate_mm_task_workfn);
> +
> + if (task_work_add(current, flush_cb, TWA_RESUME))
> + kfree(flush_cb);
> }
>
> /*
> @@ -3141,6 +3157,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
> struct cpuset *cs;
> struct cpuset *oldcs = cpuset_attach_old_cs;
> bool cpus_updated, mems_updated;
> + bool queue_task_work = false;
>
> cgroup_taskset_first(tset, &css);
> cs = css_cs(css);
> @@ -3191,15 +3208,18 @@ static void cpuset_attach(struct cgroup_taskset *tset)
> * @old_mems_allowed is the right nodesets that we
> * migrate mm from.
> */
> - if (is_memory_migrate(cs))
> + if (is_memory_migrate(cs)) {
> cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
> &cpuset_attach_nodemask_to);
> - else
> + queue_task_work = true;
> + } else
> mmput(mm);
> }
> }
>
> out:
> + if (queue_task_work)
> + schedule_flush_migrate_mm();
> cs->old_mems_allowed = cpuset_attach_nodemask_to;
>
> if (cs->nr_migrate_dl_tasks) {
> @@ -3257,7 +3277,7 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
> mutex_unlock(&cpuset_mutex);
> cpus_read_unlock();
> if (of_cft(of)->private == FILE_MEMLIST)
> - flush_workqueue(cpuset_migrate_mm_wq);
> + schedule_flush_migrate_mm();
> return retval ?: nbytes;
> }
>
> @@ -3725,7 +3745,6 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
> .can_attach = cpuset_can_attach,
> .cancel_attach = cpuset_cancel_attach,
> .attach = cpuset_attach,
> - .post_attach = cpuset_post_attach,
> .bind = cpuset_bind,
> .can_fork = cpuset_can_fork,
> .cancel_fork = cpuset_cancel_fork,
Reviewed-by: Waiman Long <longman@...hat.com>