Message-ID: <d5183516-92ea-4a76-9506-2f7e4da0b0ad@efficios.com>
Date: Tue, 26 Aug 2025 14:01:12 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Gabriele Monaco <gmonaco@...hat.com>, linux-kernel@...r.kernel.org,
 Andrew Morton <akpm@...ux-foundation.org>,
 David Hildenbrand <david@...hat.com>, Ingo Molnar <mingo@...hat.com>,
 Peter Zijlstra <peterz@...radead.org>, "Paul E. McKenney"
 <paulmck@...nel.org>, linux-mm@...ck.org,
 Thomas Gleixner <tglx@...utronix.de>
Cc: Ingo Molnar <mingo@...hat.org>
Subject: Re: [PATCH v2 2/4] rseq: Run the mm_cid_compaction from
 rseq_handle_notify_resume()

On 2025-07-16 12:06, Gabriele Monaco wrote:
> Currently the mm_cid compaction is triggered by the scheduler tick and
> runs in a task_work. This behaviour is unpredictable for periodic tasks
> with short runtimes, which may rarely be running during a tick.
> 
> Run the mm_cid compaction from the rseq_handle_notify_resume() call,
> which runs from resume_user_mode_work. Since this is the same context
> in which the task_work would run, skip the task_work and call the
> compaction function directly.
> The compaction function still returns early when the scan is not
> required, that is, when the 100ms pseudo-period has not elapsed.
> 
> Keep a tick handler for long-running tasks that are never preempted
> (i.e. that never call rseq_handle_notify_resume); it triggers the
> compaction and mm_cid update only in that case.

Your approach looks good, but please note that this will probably
need to be rebased on top of the rseq rework from Thomas Gleixner.

Latest version can be found here:

https://lore.kernel.org/lkml/20250823161326.635281786@linutronix.de/

Thanks,

Mathieu
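
To summarize the proposed flow, a minimal sketch based on the diff
quoted below (simplified, not a literal excerpt):

	/*
	 * Sketch only: the compaction now runs on the resume-to-user
	 * path and returns early unless the 100ms scan period
	 * (MM_CID_SCAN_DELAY) has elapsed for this mm.
	 */
	void __rseq_handle_notify_resume(struct ksignal *ksig,
					 struct pt_regs *regs)
	{
		struct task_struct *t = current;

		/* ... existing rseq cpu/node id handling ... */
		task_mm_cid_work(t);
	}

	/*
	 * Fallback for long-running, never-preempted tasks: the tick
	 * handler sets TIF_NOTIFY_RESUME when a scan is due, so the
	 * compaction still runs on the next return to userspace.
	 */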

> 
> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> ---
>   include/linux/mm.h       |  2 ++
>   include/linux/mm_types.h | 11 ++++++++
>   include/linux/sched.h    |  2 +-
>   kernel/rseq.c            |  2 ++
>   kernel/sched/core.c      | 55 +++++++++++++++++++++++++---------------
>   kernel/sched/sched.h     |  2 ++
>   6 files changed, 53 insertions(+), 21 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index fa538feaa8d95..cc8c1c9ae26c1 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2294,6 +2294,7 @@ void sched_mm_cid_before_execve(struct task_struct *t);
>   void sched_mm_cid_after_execve(struct task_struct *t);
>   void sched_mm_cid_fork(struct task_struct *t);
>   void sched_mm_cid_exit_signals(struct task_struct *t);
> +void task_mm_cid_work(struct task_struct *t);
>   static inline int task_mm_cid(struct task_struct *t)
>   {
>   	return t->mm_cid;
> @@ -2303,6 +2304,7 @@ static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
>   static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
>   static inline void sched_mm_cid_fork(struct task_struct *t) { }
>   static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
> +static inline void task_mm_cid_work(struct task_struct *t) { }
>   static inline int task_mm_cid(struct task_struct *t)
>   {
>   	/*
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index d6b91e8a66d6d..e6d6e468e64b4 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1420,6 +1420,13 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
>   	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
>   	raw_spin_unlock(&mm->cpus_allowed_lock);
>   }
> +
> +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> +{
> +	if (!mm)
> +		return false;
> +	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
> +}
>   #else /* CONFIG_SCHED_MM_CID */
>   static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
>   static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
> @@ -1430,6 +1437,10 @@ static inline unsigned int mm_cid_size(void)
>   	return 0;
>   }
>   static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
> +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> +{
> +	return false;
> +}
>   #endif /* CONFIG_SCHED_MM_CID */
>   
>   struct mmu_gather;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index aa9c5be7a6325..a75f61cea2271 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1428,7 +1428,7 @@ struct task_struct {
>   	int				last_mm_cid;	/* Most recent cid in mm */
>   	int				migrate_from_cpu;
>   	int				mm_cid_active;	/* Whether cid bitmap is active */
> -	struct callback_head		cid_work;
> +	unsigned long			last_cid_reset;	/* Time of last reset in jiffies */
>   #endif
>   
>   	struct tlbflush_unmap_batch	tlb_ubc;
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index b7a1ec327e811..100f81e330dc6 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -441,6 +441,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
>   	}
>   	if (unlikely(rseq_update_cpu_node_id(t)))
>   		goto error;
> +	/* The mm_cid compaction returns prematurely if scan is not needed. */
> +	task_mm_cid_work(t);
>   	return;
>   
>   error:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 81c6df746df17..27b856a1cb0a9 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10589,22 +10589,13 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>   	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
>   }
>   
> -static void task_mm_cid_work(struct callback_head *work)
> +void task_mm_cid_work(struct task_struct *t)
>   {
>   	unsigned long now = jiffies, old_scan, next_scan;
> -	struct task_struct *t = current;
>   	struct cpumask *cidmask;
> -	struct mm_struct *mm;
>   	int weight, cpu;
> +	struct mm_struct *mm = t->mm;
>   
> -	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
> -
> -	work->next = work;	/* Prevent double-add */
> -	if (t->flags & PF_EXITING)
> -		return;
> -	mm = t->mm;
> -	if (!mm)
> -		return;
>   	old_scan = READ_ONCE(mm->mm_cid_next_scan);
>   	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>   	if (!old_scan) {
> @@ -10643,23 +10634,47 @@ void init_sched_mm_cid(struct task_struct *t)
>   		if (mm_users == 1)
>   			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>   	}
> -	t->cid_work.next = &t->cid_work;	/* Protect against double add */
> -	init_task_work(&t->cid_work, task_mm_cid_work);
>   }
>   
>   void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
>   {
> -	struct callback_head *work = &curr->cid_work;
> -	unsigned long now = jiffies;
> +	u64 rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
>   
> +	/*
> +	 * If a task is running unpreempted for a long time, it won't get its
> +	 * mm_cid compacted and won't update its mm_cid value after a
> +	 * compaction occurs.
> +	 * For such a task, this function does two things:
> +	 * A) trigger the mm_cid recompaction,
> +	 * B) trigger an update of the task's rseq->mm_cid field at some point
> +	 * after recompaction, so it can get a mm_cid value closer to 0.
> +	 * A change in the mm_cid triggers an rseq_preempt.
> +	 *
> +	 * B occurs once after the compaction work completes, neither A nor B
> +	 * run as long as the compaction work is pending, the task is exiting
> +	 * or is not a userspace task.
> +	 */
>   	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
> -	    work->next != work)
> +	    test_tsk_thread_flag(curr, TIF_NOTIFY_RESUME))
>   		return;
> -	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
> +	if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
>   		return;
> -
> -	/* No page allocation under rq lock */
> -	task_work_add(curr, work, TWA_RESUME);
> +	if (mm_cid_needs_scan(curr->mm)) {
> +		/* Trigger mm_cid recompaction */
> +		rseq_set_notify_resume(curr);
> +	} else if (time_after(jiffies, curr->last_cid_reset +
> +			      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> +		/* Update mm_cid field */
> +		int old_cid = curr->mm_cid;
> +
> +		if (!curr->mm_cid_active)
> +			return;
> +		mm_cid_snapshot_time(rq, curr->mm);
> +		mm_cid_put_lazy(curr);
> +		curr->last_mm_cid = curr->mm_cid = mm_cid_get(rq, curr, curr->mm);
> +		if (old_cid != curr->mm_cid)
> +			rseq_preempt(curr);
> +	}
>   }
>   
>   void sched_mm_cid_exit_signals(struct task_struct *t)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 475bb5998295e..90a5b58188232 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3606,6 +3606,7 @@ extern const char *preempt_modes[];
>   
>   #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
>   #define MM_CID_SCAN_DELAY	100			/* 100ms */
> +#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
>   
>   extern raw_spinlock_t cid_lock;
>   extern int use_cid_lock;
> @@ -3809,6 +3810,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
>   	int cid;
>   
>   	lockdep_assert_rq_held(rq);
> +	t->last_cid_reset = jiffies;
>   	cpumask = mm_cidmask(mm);
>   	cid = __this_cpu_read(pcpu_cid->cid);
>   	if (mm_cid_is_valid(cid)) {


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
