linux-kernel - Re: [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <cf6fb15b-7a0f-4919-9bdd-a7bb7bb8f961@redhat.com>
Date: Wed, 6 Aug 2025 18:24:51 +0000 (UTC)
From: Gabriele Monaco <gmonaco@...hat.com>
To: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: linux-kernel@...r.kernel.org, Andrew Morton <akpm@...ux-foundation.org>,
	David Hildenbrand <david@...hat.com>, Ingo Molnar <mingo@...hat.com>,
	Peter Zijlstra <peterz@...radead.org>
Subject: Re: [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches

2025-08-06T16:57:51Z Mathieu Desnoyers <mathieu.desnoyers@...icios.com>:

> On 2025-08-05 08:42, Gabriele Monaco wrote:
>> On Wed, 2025-07-16 at 18:06 +0200, Gabriele Monaco wrote:
>>> Currently, task_mm_cid_work() is called from resume_user_mode_work().
>>> This can delay the execution of the corresponding thread for the
>>> entire duration of the function, negatively affecting the response in
>>> case of real time tasks.
>>> In practice, we observe task_mm_cid_work increasing the latency by
>>> 30-35us on a 128-core system; this order of magnitude is meaningful
>>> under PREEMPT_RT.
>>>
>>> Run the task_mm_cid_work in batches of up to
>>> CONFIG_RSEQ_CID_SCAN_BATCH CPUs; this reduces the duration of the
>>> delay for each scan.
>>>
>>> The task_mm_cid_work contains a mechanism to avoid running more
>>> frequently than every 100ms. Keep this pseudo-periodicity only on
>>> complete scans.
>>> This means each call to task_mm_cid_work returns prematurely if the
>>> period did not elapse and a scan is not ongoing (i.e. the next batch
>>> to scan is not the first).
>>> This way full scans are not excessively delayed while still keeping
>>> each run, and introduced latency, short.
>>>
>> Mathieu, would you have some time to look at this implementation?
>
> Hi Gabriele,
>
> Please note that I am currently on vacation. I'll be back shortly
> before the end of August, but I'm afraid there are other tasks I
> need to focus on before I can get back to this. I'm adding this
> review to my todo list for September.
>

No problem, thanks for the update and enjoy your vacation!

Thanks,
Gabriele

> Thanks,
>
> Mathieu
>
>> Thanks,
>> Gabriele
>>
>>> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by
>>> mm_cid")
>>> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
>>> ---
>>>  include/linux/mm_types.h | 15 +++++++++++++++
>>>  init/Kconfig             | 12 ++++++++++++
>>>  kernel/sched/core.c      | 37 ++++++++++++++++++++++++++++++++++---
>>>  3 files changed, 61 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>>> index e6d6e468e64b4..a822966a584f3 100644
>>> --- a/include/linux/mm_types.h
>>> +++ b/include/linux/mm_types.h
>>> @@ -995,6 +995,13 @@ struct mm_struct {
>>>         * When the next mm_cid scan is due (in jiffies).
>>>         */
>>>        unsigned long mm_cid_next_scan;
>>> +       /*
>>> +        * @mm_cid_scan_batch: Counter for batch used in the next scan.
>>> +        *
>>> +        * Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH. This field
>>> +        * increments at each scan and is reset when all batches are done.
>>> +        */
>>> +       unsigned int mm_cid_scan_batch;
>>>        /**
>>>         * @nr_cpus_allowed: Number of CPUs allowed for mm.
>>>         *
>>> @@ -1385,6 +1392,7 @@ static inline void mm_init_cid(struct mm_struct
>>> *mm, struct task_struct *p)
>>>    raw_spin_lock_init(&mm->cpus_allowed_lock);
>>>    cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
>>>    cpumask_clear(mm_cidmask(mm));
>>> +   mm->mm_cid_scan_batch = 0;
>>>  }
>>>     static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct
>>> task_struct *p)
>>> @@ -1423,8 +1431,15 @@ static inline void mm_set_cpus_allowed(struct
>>> mm_struct *mm, const struct cpumas
>>>     static inline bool mm_cid_needs_scan(struct mm_struct *mm)
>>>  {
>>> +   unsigned int next_batch;
>>> +
>>>    if (!mm)
>>>        return false;
>>> +   next_batch = READ_ONCE(mm->mm_cid_scan_batch);
>>> +   /* Always needs scan unless it's the first batch. */
>>> +   if (CONFIG_RSEQ_CID_SCAN_BATCH * next_batch <
>>> num_possible_cpus() &&
>>> +       next_batch)
>>> +       return true;
>>>    return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
>>>  }
>>>  #else /* CONFIG_SCHED_MM_CID */
>>> diff --git a/init/Kconfig b/init/Kconfig
>>> index 666783eb50abd..98d7f078cd6df 100644
>>> --- a/init/Kconfig
>>> +++ b/init/Kconfig
>>> @@ -1860,6 +1860,18 @@ config DEBUG_RSEQ
>>>           If unsure, say N.
>>>   +config RSEQ_CID_SCAN_BATCH
>>> +   int "Number of CPUs to scan at every mm_cid compaction
>>> attempt"
>>> +   range 1 NR_CPUS
>>> +   default 8
>>> +   depends on SCHED_MM_CID
>>> +   help
>>> +     CPUs are scanned pseudo-periodically to compact the CID of each task.
>>> +     This operation can take a longer amount of time on systems with many
>>> +     CPUs, resulting in higher scheduling latency for the current task.
>>> +     A higher value means the CID is compacted faster, but results in
>>> +     higher scheduling latency.
>>> +
>>>  config CACHESTAT_SYSCALL
>>>    bool "Enable cachestat() system call" if EXPERT
>>>    default y
>>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>>> index 27b856a1cb0a9..eae4c8faf980b 100644
>>> --- a/kernel/sched/core.c
>>> +++ b/kernel/sched/core.c
>>> @@ -10591,11 +10591,26 @@ static void
>>> sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>>>     void task_mm_cid_work(struct task_struct *t)
>>>  {
>>> +   int weight, cpu, from_cpu, this_batch, next_batch, idx;
>>>    unsigned long now = jiffies, old_scan, next_scan;
>>>    struct cpumask *cidmask;
>>> -   int weight, cpu;
>>>    struct mm_struct *mm = t->mm;
>>>   + /*
>>> +    * This function is called from __rseq_handle_notify_resume,
>>> which
>>> +    * makes sure t is a user thread and is not exiting.
>>> +    */
>>> +   this_batch = READ_ONCE(mm->mm_cid_scan_batch);
>>> +   next_batch = this_batch + 1;
>>> +   from_cpu = cpumask_nth(this_batch *
>>> CONFIG_RSEQ_CID_SCAN_BATCH,
>>> +                  cpu_possible_mask);
>>> +   if (from_cpu >= nr_cpu_ids) {
>>> +       from_cpu = 0;
>>> +       next_batch = 1;
>>> +   }
>>> +   /* Delay scan only if we are done with all cpus. */
>>> +   if (from_cpu != 0)
>>> +       goto cid_compact;
>>>    old_scan = READ_ONCE(mm->mm_cid_next_scan);
>>>    next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>>>    if (!old_scan) {
>>> @@ -10611,17 +10626,33 @@ void task_mm_cid_work(struct task_struct
>>> *t)
>>>        return;
>>>    if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan,
>>> next_scan))
>>>        return;
>>> +
>>> +cid_compact:
>>> +   if (!try_cmpxchg(&mm->mm_cid_scan_batch, &this_batch,
>>> next_batch))
>>> +       return;
>>>    cidmask = mm_cidmask(mm);
>>>    /* Clear cids that were not recently used. */
>>> -   for_each_possible_cpu(cpu)
>>> +   idx = 0;
>>> +   cpu = from_cpu;
>>> +   for_each_cpu_from(cpu, cpu_possible_mask) {
>>> +       if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
>>> +           break;
>>>        sched_mm_cid_remote_clear_old(mm, cpu);
>>> +       ++idx;
>>> +   }
>>>    weight = cpumask_weight(cidmask);
>>>    /*
>>>     * Clear cids that are greater or equal to the cidmask
>>> weight to
>>>     * recompact it.
>>>     */
>>> -   for_each_possible_cpu(cpu)
>>> +   idx = 0;
>>> +   cpu = from_cpu;
>>> +   for_each_cpu_from(cpu, cpu_possible_mask) {
>>> +       if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
>>> +           break;
>>>        sched_mm_cid_remote_clear_weight(mm, cpu, weight);
>>> +       ++idx;
>>> +   }
>>>  }
>>>     void init_sched_mm_cid(struct task_struct *t)
>>
>
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> https://www.efficios.com