Message-ID: <8fc793e3-cdfc-4603-afe6-d2ed6785ffbb@efficios.com>
Date: Wed, 19 Feb 2025 10:13:28 -0500
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Gabriele Monaco <gmonaco@...hat.com>, linux-kernel@...r.kernel.org,
Andrew Morton <akpm@...ux-foundation.org>, Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>, "Paul E. McKenney"
<paulmck@...nel.org>, linux-mm@...ck.org
Cc: Ingo Molnar <mingo@...nel.org>, Shuah Khan <shuah@...nel.org>
Subject: Re: [PATCH v7 1/2] sched: Move task_mm_cid_work to mm work_struct
On 2025-02-19 06:31, Gabriele Monaco wrote:
> Currently, the task_mm_cid_work function is called in a task work
> triggered by a scheduler tick to frequently compact the mm_cids of each
> process. This can delay the execution of the corresponding thread for
> the entire duration of the function, negatively affecting the response
> in case of real-time tasks. In practice, we observe task_mm_cid_work
> increasing the latency by 30-35us on a 128-core system; this order of
> magnitude is meaningful under PREEMPT_RT.
>
> Run the task_mm_cid_work in a new work_struct connected to the
> mm_struct rather than in the task context before returning to
> userspace.
>
> This work_struct is initialised with the mm and disabled before freeing
> it. Its execution is no longer triggered by scheduler ticks: the queuing
> of the work happens while returning to userspace in
> __rseq_handle_notify_resume, maintaining the checks to avoid running
> more often than once every MM_CID_SCAN_DELAY.
>
> The main advantage of this change is that the function can be offloaded
> to a different CPU and even preempted by RT tasks.
>
> Moreover, this new behaviour is more predictable with periodic tasks
> with short runtime, which may rarely run during a scheduler tick.
> Now, the work is always scheduled when the task returns to userspace.
>
> The work is disabled during mmdrop: since the function cannot sleep in
> all kernel configurations, we cannot wait for possibly running work
> items to terminate. We make sure the mm is valid in case the task is
> terminating by reserving it with mmgrab/mmdrop, returning prematurely
> if we are really the last user before the mmgrab.
> This situation is unlikely since we don't schedule the work for exiting
> tasks, but we cannot rule it out.
>
> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> ---
> include/linux/mm_types.h | 8 ++++++++
> include/linux/sched.h | 7 ++++++-
> kernel/rseq.c | 1 +
> kernel/sched/core.c | 33 ++++++++++++---------------------
> kernel/sched/sched.h | 2 --
> 5 files changed, 27 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 0234f14f2aa6b..e748cf51e0c32 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -889,6 +889,10 @@ struct mm_struct {
> * mm nr_cpus_allowed updates.
> */
> raw_spinlock_t cpus_allowed_lock;
> + /*
> + * @cid_work: Work item to run the mm_cid scan.
> + */
> + struct work_struct cid_work;
> #endif
> #ifdef CONFIG_MMU
> atomic_long_t pgtables_bytes; /* size of all page tables */
> @@ -1185,6 +1189,8 @@ enum mm_cid_state {
> MM_CID_LAZY_PUT = (1U << 31),
> };
>
> +extern void task_mm_cid_work(struct work_struct *work);
> +
> static inline bool mm_cid_is_unset(int cid)
> {
> return cid == MM_CID_UNSET;
> @@ -1257,12 +1263,14 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
> if (!mm->pcpu_cid)
> return -ENOMEM;
> mm_init_cid(mm, p);
> + INIT_WORK(&mm->cid_work, task_mm_cid_work);
> return 0;
> }
> #define mm_alloc_cid(...) alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
>
> static inline void mm_destroy_cid(struct mm_struct *mm)
> {
> + disable_work(&mm->cid_work);
> free_percpu(mm->pcpu_cid);
> mm->pcpu_cid = NULL;
> }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 9632e3318e0d6..2fd65f125153d 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1397,7 +1397,6 @@ struct task_struct {
> int last_mm_cid; /* Most recent cid in mm */
> int migrate_from_cpu;
> int mm_cid_active; /* Whether cid bitmap is active */
> - struct callback_head cid_work;
> #endif
>
> struct tlbflush_unmap_batch tlb_ubc;
> @@ -2254,4 +2253,10 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
> #define alloc_tag_restore(_tag, _old) do {} while (0)
> #endif
>
> +#ifdef CONFIG_SCHED_MM_CID
> +extern void task_queue_mm_cid(struct task_struct *curr);
> +#else
> +static inline void task_queue_mm_cid(struct task_struct *curr) { }
> +#endif
> +
> #endif
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index 442aba29bc4cf..f8394ebbb6f4d 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -419,6 +419,7 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> }
> if (unlikely(rseq_update_cpu_node_id(t)))
> goto error;
> + task_queue_mm_cid(t);
> return;
>
> error:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9aecd914ac691..ee35f9962444b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5663,7 +5663,6 @@ void sched_tick(void)
> resched_latency = cpu_resched_latency(rq);
> calc_global_load_tick(rq);
> sched_core_tick(rq);
> - task_tick_mm_cid(rq, donor);
I agree that this approach is promising; however, I am concerned that
a task running alone on its runqueue (thus without preemption) for a
long time will never recompact mm_cid, and will also never update its
mm_cid field.

So I am tempted to insert this in sched_tick() to cover that scenario:

	rseq_preempt(current);

This would ensure that the task runs __rseq_handle_notify_resume() at
least once per tick.
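Concretely, something along these lines on top of your patch (untested
sketch, placed where task_tick_mm_cid() used to be called; rseq_preempt()
sets the notify-resume flag so the task goes through
__rseq_handle_notify_resume(), and thus task_queue_mm_cid(), on its next
return to userspace):

 		calc_global_load_tick(rq);
 		sched_core_tick(rq);
+		rseq_preempt(current);
 		scx_tick(rq);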
> scx_tick(rq);
>
> rq_unlock(rq, &rf);
> @@ -10530,22 +10529,16 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
> sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
> }
>
> -static void task_mm_cid_work(struct callback_head *work)
> +void task_mm_cid_work(struct work_struct *work)
> {
> unsigned long now = jiffies, old_scan, next_scan;
> - struct task_struct *t = current;
> struct cpumask *cidmask;
> - struct mm_struct *mm;
> + struct mm_struct *mm = container_of(work, struct mm_struct, cid_work);
> int weight, cpu;
>
> - SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
> -
> - work->next = work; /* Prevent double-add */
> - if (t->flags & PF_EXITING)
> - return;
> - mm = t->mm;
> - if (!mm)
> + if (!atomic_read(&mm->mm_count))
> return;
> + mmgrab(mm);
AFAIU this is racy with respect to re-use of the mm struct.

I recommend that you move the mmgrab() to task_queue_mm_cid(), just
before invoking schedule_work(). That way you ensure that the mm count
never reaches 0 while there is work in flight (and therefore guarantee
that the mm is not re-used).
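For example (untested sketch, reusing your naming; note that
schedule_work() returns false when the work is already pending, so the
extra reference needs to be dropped in that case):

	void task_queue_mm_cid(struct task_struct *curr)
	{
		struct mm_struct *mm = curr->mm;

		if (!mm || (curr->flags & (PF_EXITING | PF_KTHREAD)))
			return;
		if (time_before(jiffies, READ_ONCE(mm->mm_cid_next_scan)))
			return;

		mmgrab(mm);			/* keep mm alive while the work is in flight */
		if (!schedule_work(&mm->cid_work))
			mmdrop(mm);		/* work already queued, drop the extra ref */
	}

task_mm_cid_work() would then keep only the mmdrop() at the end and
could drop the mm_count check and mmgrab() at the top.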
Thanks,
Mathieu
> old_scan = READ_ONCE(mm->mm_cid_next_scan);
> next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> if (!old_scan) {
> @@ -10558,9 +10551,9 @@ static void task_mm_cid_work(struct callback_head *work)
> old_scan = next_scan;
> }
> if (time_before(now, old_scan))
> - return;
> + goto out_drop;
> if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
> - return;
> + goto out_drop;
> cidmask = mm_cidmask(mm);
> /* Clear cids that were not recently used. */
> for_each_possible_cpu(cpu)
> @@ -10572,6 +10565,8 @@ static void task_mm_cid_work(struct callback_head *work)
> */
> for_each_possible_cpu(cpu)
> sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> +out_drop:
> + mmdrop(mm);
> }
>
> void init_sched_mm_cid(struct task_struct *t)
> @@ -10584,23 +10579,19 @@ void init_sched_mm_cid(struct task_struct *t)
> if (mm_users == 1)
> mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> }
> - t->cid_work.next = &t->cid_work; /* Protect against double add */
> - init_task_work(&t->cid_work, task_mm_cid_work);
> }
>
> -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
> +void task_queue_mm_cid(struct task_struct *curr)
> {
> - struct callback_head *work = &curr->cid_work;
> + struct work_struct *work = &curr->mm->cid_work;
> unsigned long now = jiffies;
>
> - if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
> - work->next != work)
> + if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)))
> return;
> if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
> return;
>
> - /* No page allocation under rq lock */
> - task_work_add(curr, work, TWA_RESUME);
> + schedule_work(work);
> }
>
> void sched_mm_cid_exit_signals(struct task_struct *t)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index c8512a9fb0229..37a2e2328283e 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3630,7 +3630,6 @@ extern int use_cid_lock;
>
> extern void sched_mm_cid_migrate_from(struct task_struct *t);
> extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
> -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
> extern void init_sched_mm_cid(struct task_struct *t);
>
> static inline void __mm_cid_put(struct mm_struct *mm, int cid)
> @@ -3899,7 +3898,6 @@ static inline void switch_mm_cid(struct rq *rq,
> static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
> static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
> static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
> -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
> static inline void init_sched_mm_cid(struct task_struct *t) { }
> #endif /* !CONFIG_SCHED_MM_CID */
>
--
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com