[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <9c72396b5a9757913e061c0aa7f3d488b57a8a64.camel@redhat.com>
Date: Tue, 05 Aug 2025 14:42:05 +0200
From: Gabriele Monaco <gmonaco@...hat.com>
To: linux-kernel@...r.kernel.org, Mathieu Desnoyers
<mathieu.desnoyers@...icios.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>, David Hildenbrand
<david@...hat.com>, Ingo Molnar <mingo@...hat.com>, Peter Zijlstra
<peterz@...radead.org>
Subject: Re: [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches
On Wed, 2025-07-16 at 18:06 +0200, Gabriele Monaco wrote:
> Currently, task_mm_cid_work() is called from resume_user_mode_work().
> This can delay the execution of the corresponding thread for the
> entire duration of the function, negatively affecting the response in
> case of real time tasks.
> In practice, we observe task_mm_cid_work increasing the latency by
> 30-35us on a 128-core system; this order of magnitude is meaningful
> under PREEMPT_RT.
>
> Run task_mm_cid_work in batches of up to
> CONFIG_RSEQ_CID_SCAN_BATCH CPUs; this reduces the duration of the
> delay for each scan.
>
> The task_mm_cid_work contains a mechanism to avoid running more
> frequently than every 100ms. Keep this pseudo-periodicity only on
> complete scans.
> This means each call to task_mm_cid_work returns early if the
> period has not elapsed and no scan is ongoing (i.e. the next batch
> to scan is the first one).
> This way full scans are not excessively delayed while still keeping
> each run, and introduced latency, short.
>
Mathieu, would you have some time to look at this implementation?
Thanks,
Gabriele
> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by
> mm_cid")
> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> ---
> include/linux/mm_types.h | 15 +++++++++++++++
> init/Kconfig | 12 ++++++++++++
> kernel/sched/core.c | 37 ++++++++++++++++++++++++++++++++++---
> 3 files changed, 61 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index e6d6e468e64b4..a822966a584f3 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -995,6 +995,13 @@ struct mm_struct {
> * When the next mm_cid scan is due (in jiffies).
> */
> unsigned long mm_cid_next_scan;
> + /*
> + * @mm_cid_scan_batch: Counter for batch used in the next scan.
> + *
> + * Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH. This field
> + * is incremented at each scan and reset when all batches are done.
> + */
> + unsigned int mm_cid_scan_batch;
> /**
> * @nr_cpus_allowed: Number of CPUs allowed for mm.
> *
> @@ -1385,6 +1392,7 @@ static inline void mm_init_cid(struct mm_struct
> *mm, struct task_struct *p)
> raw_spin_lock_init(&mm->cpus_allowed_lock);
> cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
> cpumask_clear(mm_cidmask(mm));
> + mm->mm_cid_scan_batch = 0;
> }
>
> static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct
> task_struct *p)
> @@ -1423,8 +1431,15 @@ static inline void mm_set_cpus_allowed(struct
> mm_struct *mm, const struct cpumas
>
> static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> {
> + unsigned int next_batch;
> +
> if (!mm)
> return false;
> + next_batch = READ_ONCE(mm->mm_cid_scan_batch);
> + /* Always needs scan unless it's the first batch. */
> + if (CONFIG_RSEQ_CID_SCAN_BATCH * next_batch <
> num_possible_cpus() &&
> + next_batch)
> + return true;
> return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
> }
> #else /* CONFIG_SCHED_MM_CID */
> diff --git a/init/Kconfig b/init/Kconfig
> index 666783eb50abd..98d7f078cd6df 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -1860,6 +1860,18 @@ config DEBUG_RSEQ
>
> If unsure, say N.
>
> +config RSEQ_CID_SCAN_BATCH
> + int "Number of CPUs to scan at every mm_cid compaction
> attempt"
> + range 1 NR_CPUS
> + default 8
> + depends on SCHED_MM_CID
> + help
> + CPUs are scanned pseudo-periodically to compact the CID of each task;
> + this operation can take a long time on systems with many
> + CPUs, resulting in higher scheduling latency for the current task.
> + A higher value means the CID is compacted faster, but results in
> + higher scheduling latency.
> +
> config CACHESTAT_SYSCALL
> bool "Enable cachestat() system call" if EXPERT
> default y
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 27b856a1cb0a9..eae4c8faf980b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10591,11 +10591,26 @@ static void
> sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>
> void task_mm_cid_work(struct task_struct *t)
> {
> + int weight, cpu, from_cpu, this_batch, next_batch, idx;
> unsigned long now = jiffies, old_scan, next_scan;
> struct cpumask *cidmask;
> - int weight, cpu;
> struct mm_struct *mm = t->mm;
>
> + /*
> + * This function is called from __rseq_handle_notify_resume,
> which
> + * makes sure t is a user thread and is not exiting.
> + */
> + this_batch = READ_ONCE(mm->mm_cid_scan_batch);
> + next_batch = this_batch + 1;
> + from_cpu = cpumask_nth(this_batch *
> CONFIG_RSEQ_CID_SCAN_BATCH,
> + cpu_possible_mask);
> + if (from_cpu >= nr_cpu_ids) {
> + from_cpu = 0;
> + next_batch = 1;
> + }
> + /* Delay scan only if we are done with all cpus. */
> + if (from_cpu != 0)
> + goto cid_compact;
> old_scan = READ_ONCE(mm->mm_cid_next_scan);
> next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> if (!old_scan) {
> @@ -10611,17 +10626,33 @@ void task_mm_cid_work(struct task_struct
> *t)
> return;
> if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan,
> next_scan))
> return;
> +
> +cid_compact:
> + if (!try_cmpxchg(&mm->mm_cid_scan_batch, &this_batch,
> next_batch))
> + return;
> cidmask = mm_cidmask(mm);
> /* Clear cids that were not recently used. */
> - for_each_possible_cpu(cpu)
> + idx = 0;
> + cpu = from_cpu;
> + for_each_cpu_from(cpu, cpu_possible_mask) {
> + if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
> + break;
> sched_mm_cid_remote_clear_old(mm, cpu);
> + ++idx;
> + }
> weight = cpumask_weight(cidmask);
> /*
> * Clear cids that are greater or equal to the cidmask
> weight to
> * recompact it.
> */
> - for_each_possible_cpu(cpu)
> + idx = 0;
> + cpu = from_cpu;
> + for_each_cpu_from(cpu, cpu_possible_mask) {
> + if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
> + break;
> sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> + ++idx;
> + }
> }
>
> void init_sched_mm_cid(struct task_struct *t)
Powered by blists - more mailing lists