Message-ID: <6144e652-7599-4b04-a08e-9f059ff81e26@efficios.com>
Date: Wed, 6 Aug 2025 12:57:40 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Gabriele Monaco <gmonaco@...hat.com>, linux-kernel@...r.kernel.org
Cc: Andrew Morton <akpm@...ux-foundation.org>,
 David Hildenbrand <david@...hat.com>, Ingo Molnar <mingo@...hat.com>,
 Peter Zijlstra <peterz@...radead.org>
Subject: Re: [PATCH v2 3/4] sched: Compact RSEQ concurrency IDs in batches

On 2025-08-05 08:42, Gabriele Monaco wrote:
> On Wed, 2025-07-16 at 18:06 +0200, Gabriele Monaco wrote:
>> Currently, task_mm_cid_work() is called from resume_user_mode_work().
>> This can delay the execution of the corresponding thread for the
>> entire duration of the function, negatively affecting the response
>> time of real-time tasks.
>> In practice, we observe task_mm_cid_work increasing latency by
>> 30-35us on a 128-core system; this order of magnitude is meaningful
>> under PREEMPT_RT.
>>
>> Run task_mm_cid_work in batches of up to CONFIG_RSEQ_CID_SCAN_BATCH
>> CPUs; this reduces the duration of the delay for each scan.
>>
>> task_mm_cid_work contains a mechanism to avoid running more
>> frequently than every 100ms. Keep this pseudo-periodicity only on
>> complete scans.
>> This means each call to task_mm_cid_work returns prematurely if the
>> period did not elapse and a scan is not ongoing (i.e. the next batch
>> to scan is the first).
>> This way full scans are not excessively delayed, while each run, and
>> the latency it introduces, stays short.
>>
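[A rough illustration of the batching arithmetic described above; this is a
standalone sketch, not part of the patch, and the CPU count and names such
as nr_possible_cpus and batch_size are assumed for the example, not kernel
symbols.]

#include <stdio.h>

int main(void)
{
	int nr_possible_cpus = 128;	/* assumed system size */
	int batch_size = 8;		/* CONFIG_RSEQ_CID_SCAN_BATCH default */
	int batch;

	for (batch = 0; batch * batch_size < nr_possible_cpus; batch++) {
		int from_cpu = batch * batch_size;

		/*
		 * Per the commit message, only the call that starts a new
		 * full scan (batch 0) is gated by the 100ms delay; the
		 * remaining batches run on subsequent notify-resume events.
		 */
		printf("batch %2d: cpus %3d-%3d%s\n", batch, from_cpu,
		       from_cpu + batch_size - 1,
		       batch == 0 ? " (100ms delay check applies)" : "");
	}
	printf("full scan = %d calls to task_mm_cid_work()\n", batch);
	return 0;
}

With these example values a full scan is spread over 16 short calls instead
of one long pass over all 128 CPUs.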
> 
> Mathieu, would you have some time to look at this implementation?

Hi Gabriele,

Please note that I am currently on vacation. I'll be back shortly
before the end of August, but I'm afraid there are other tasks I
need to focus on before I can get back to this. I'm adding this
review to my todo list for September.

Thanks,

Mathieu

> 
> Thanks,
> Gabriele
> 
>> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
>> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
>> ---
>>   include/linux/mm_types.h | 15 +++++++++++++++
>>   init/Kconfig             | 12 ++++++++++++
>>   kernel/sched/core.c      | 37 ++++++++++++++++++++++++++++++++++---
>>   3 files changed, 61 insertions(+), 3 deletions(-)
>>
>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>> index e6d6e468e64b4..a822966a584f3 100644
>> --- a/include/linux/mm_types.h
>> +++ b/include/linux/mm_types.h
>> @@ -995,6 +995,13 @@ struct mm_struct {
>>   		 * When the next mm_cid scan is due (in jiffies).
>>   		 */
>>   		unsigned long mm_cid_next_scan;
>> +		/*
>> +		 * @mm_cid_scan_batch: Counter for batch used in the next scan.
>> +		 *
>> +		 * Scan in batches of CONFIG_RSEQ_CID_SCAN_BATCH. This field
>> +		 * increments at each scan and is reset when all batches are done.
>> +		 */
>> +		unsigned int mm_cid_scan_batch;
>>   		/**
>>   		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
>>   		 *
>> @@ -1385,6 +1392,7 @@ static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p)
>>   	raw_spin_lock_init(&mm->cpus_allowed_lock);
>>   	cpumask_copy(mm_cpus_allowed(mm), &p->cpus_mask);
>>   	cpumask_clear(mm_cidmask(mm));
>> +	mm->mm_cid_scan_batch = 0;
>>   }
>>   
>>   static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *p)
>> @@ -1423,8 +1431,15 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask)
>>   
>>   static inline bool mm_cid_needs_scan(struct mm_struct *mm)
>>   {
>> +	unsigned int next_batch;
>> +
>>   	if (!mm)
>>   		return false;
>> +	next_batch = READ_ONCE(mm->mm_cid_scan_batch);
>> +	/* Always needs scan unless it's the first batch. */
>> +	if (CONFIG_RSEQ_CID_SCAN_BATCH * next_batch < num_possible_cpus() &&
>> +	    next_batch)
>> +		return true;
>>   	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
>>   }
>>   #else /* CONFIG_SCHED_MM_CID */
>> diff --git a/init/Kconfig b/init/Kconfig
>> index 666783eb50abd..98d7f078cd6df 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -1860,6 +1860,18 @@ config DEBUG_RSEQ
>>   
>>   	  If unsure, say N.
>>   
>> +config RSEQ_CID_SCAN_BATCH
>> +	int "Number of CPUs to scan at every mm_cid compaction attempt"
>> +	range 1 NR_CPUS
>> +	default 8
>> +	depends on SCHED_MM_CID
>> +	help
>> +	  CPUs are scanned pseudo-periodically to compact the CID of each task;
>> +	  this operation can take a longer amount of time on systems with many
>> +	  CPUs, resulting in higher scheduling latency for the current task.
>> +	  A higher value means the CID is compacted faster, but results in
>> +	  higher scheduling latency.
>> +
>>   config CACHESTAT_SYSCALL
>>   	bool "Enable cachestat() system call" if EXPERT
>>   	default y
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index 27b856a1cb0a9..eae4c8faf980b 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -10591,11 +10591,26 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>>   
>>   void task_mm_cid_work(struct task_struct *t)
>>   {
>> +	int weight, cpu, from_cpu, this_batch, next_batch, idx;
>>   	unsigned long now = jiffies, old_scan, next_scan;
>>   	struct cpumask *cidmask;
>> -	int weight, cpu;
>>   	struct mm_struct *mm = t->mm;
>>   
>> +	/*
>> +	 * This function is called from __rseq_handle_notify_resume, which
>> +	 * makes sure t is a user thread and is not exiting.
>> +	 */
>> +	this_batch = READ_ONCE(mm->mm_cid_scan_batch);
>> +	next_batch = this_batch + 1;
>> +	from_cpu = cpumask_nth(this_batch * CONFIG_RSEQ_CID_SCAN_BATCH,
>> +			       cpu_possible_mask);
>> +	if (from_cpu >= nr_cpu_ids) {
>> +		from_cpu = 0;
>> +		next_batch = 1;
>> +	}
>> +	/* Delay scan only if we are done with all cpus. */
>> +	if (from_cpu != 0)
>> +		goto cid_compact;
>>   	old_scan = READ_ONCE(mm->mm_cid_next_scan);
>>   	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>>   	if (!old_scan) {
>> @@ -10611,17 +10626,33 @@ void task_mm_cid_work(struct task_struct *t)
>>   		return;
>>   	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
>>   		return;
>> +
>> +cid_compact:
>> +	if (!try_cmpxchg(&mm->mm_cid_scan_batch, &this_batch, next_batch))
>> +		return;
>>   	cidmask = mm_cidmask(mm);
>>   	/* Clear cids that were not recently used. */
>> -	for_each_possible_cpu(cpu)
>> +	idx = 0;
>> +	cpu = from_cpu;
>> +	for_each_cpu_from(cpu, cpu_possible_mask) {
>> +		if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
>> +			break;
>>   		sched_mm_cid_remote_clear_old(mm, cpu);
>> +		++idx;
>> +	}
>>   	weight = cpumask_weight(cidmask);
>>   	/*
>>   	 * Clear cids that are greater or equal to the cidmask weight to
>>   	 * recompact it.
>>   	 */
>> -	for_each_possible_cpu(cpu)
>> +	idx = 0;
>> +	cpu = from_cpu;
>> +	for_each_cpu_from(cpu, cpu_possible_mask) {
>> +		if (idx == CONFIG_RSEQ_CID_SCAN_BATCH)
>> +			break;
>>   		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
>> +		++idx;
>> +	}
>>   }
>>   
>>   void init_sched_mm_cid(struct task_struct *t)
> 


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
