Message-ID: <8620f69a3726e2e818305027ef79605decbd4148.camel@redhat.com>
Date: Wed, 27 Aug 2025 08:55:18 +0200
From: Gabriele Monaco <gmonaco@...hat.com>
To: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
Cc: linux-kernel@...r.kernel.org, Andrew Morton <akpm@...ux-foundation.org>,
David Hildenbrand <david@...hat.com>, Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>, "Paul E. McKenney"
<paulmck@...nel.org>, linux-mm@...ck.org, Thomas Gleixner
<tglx@...utronix.de>
Subject: Re: [PATCH v2 2/4] rseq: Run the mm_cid_compaction from
rseq_handle_notify_resume()
On Tue, 2025-08-26 at 14:01 -0400, Mathieu Desnoyers wrote:
> On 2025-07-16 12:06, Gabriele Monaco wrote:
> > Currently the mm_cid compaction is triggered by the scheduler tick
> > and runs in a task_work; this behaviour is unpredictable for
> > periodic tasks with short runtimes, which may rarely be running
> > during a tick.
> >
> > Run the mm_cid compaction from the rseq_handle_notify_resume()
> > call, which runs from resume_user_mode_work. Since this is the same
> > context where the task_work would run, skip that step and call the
> > compaction function directly.
> > The compaction function still exits early when the scan is not
> > required, that is, when the pseudo-period of 100ms has not elapsed.
> >
> > Keep a tick handler for long-running tasks that are never preempted
> > (i.e. that never call rseq_handle_notify_resume); it triggers a
> > compaction and mm_cid update only in that case.
>
> Your approach looks good, but please note that this will probably
> need to be rebased on top of the rseq rework from Thomas Gleixner.
>
> Latest version can be found here:
>
> https://lore.kernel.org/lkml/20250823161326.635281786@linutronix.de/
>
Mmh, that's quite a large one, thanks for sharing!
I'm going to have a look, but I guess it might make sense to wait
until that series is included.
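
For anyone wanting to see the user-visible effect of the compaction, a
quick sketch like the one below can watch the rseq mm_cid field from
userspace. This is just my illustration, not part of the patch, and it
assumes glibc >= 2.35 (which registers rseq and exports __rseq_offset
in <sys/rseq.h>), a kernel recent enough to populate rseq->mm_cid, and
a compiler providing __builtin_thread_pointer():

#include <stdio.h>
#include <unistd.h>
#include <sys/rseq.h>	/* struct rseq, __rseq_offset (glibc >= 2.35) */

/* glibc registers the rseq area at a fixed offset from the thread pointer. */
static inline struct rseq *thread_rseq(void)
{
	return (struct rseq *)((char *)__builtin_thread_pointer() +
			       __rseq_offset);
}

int main(void)
{
	for (int i = 0; i < 5; i++) {
		/* A compacted mm_cid is packed near 0, bounded by
		 * min(nr threads, nr allowed CPUs). */
		printf("cpu=%u mm_cid=%u\n",
		       thread_rseq()->cpu_id, thread_rseq()->mm_cid);
		usleep(200 * 1000);	/* longer than the 100ms scan delay */
	}
	return 0;
}

With a single runnable thread, mm_cid should settle to 0 shortly after
a compaction scan.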
Thanks,
Gabriele
> Thanks,
>
> Mathieu
>
> >
> > Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> > ---
> >  include/linux/mm.h       |  2 ++
> >  include/linux/mm_types.h | 11 ++++++++
> >  include/linux/sched.h    |  2 +-
> >  kernel/rseq.c            |  2 ++
> >  kernel/sched/core.c      | 55 +++++++++++++++++++++++++---------------
> >  kernel/sched/sched.h     |  2 ++
> >  6 files changed, 53 insertions(+), 21 deletions(-)
> >
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index fa538feaa8d95..cc8c1c9ae26c1 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -2294,6 +2294,7 @@ void sched_mm_cid_before_execve(struct task_struct *t);
> >  void sched_mm_cid_after_execve(struct task_struct *t);
> >  void sched_mm_cid_fork(struct task_struct *t);
> >  void sched_mm_cid_exit_signals(struct task_struct *t);
> > +void task_mm_cid_work(struct task_struct *t);
> >  static inline int task_mm_cid(struct task_struct *t)
> >  {
> >  	return t->mm_cid;
> > @@ -2303,6 +2304,7 @@ static inline void sched_mm_cid_before_execve(struct task_struct *t) { }
> >  static inline void sched_mm_cid_after_execve(struct task_struct *t) { }
> >  static inline void sched_mm_cid_fork(struct task_struct *t) { }
> >  static inline void sched_mm_cid_exit_signals(struct task_struct *t) { }
> > +static inline void task_mm_cid_work(struct task_struct *t) { }
> >  static inline int task_mm_cid(struct task_struct *t)
> >  {
> >  	/*
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index d6b91e8a66d6d..e6d6e468e64b4 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -1420,6 +1420,13 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
> >  	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
> >  	raw_spin_unlock(&mm->cpus_allowed_lock);
> >  }
> > +
> > +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> > +{
> > +	if (!mm)
> > +		return false;
> > +	return time_after(jiffies, READ_ONCE(mm->mm_cid_next_scan));
> > +}
> >  #else /* CONFIG_SCHED_MM_CID */
> >  static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
> >  static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
> > @@ -1430,6 +1437,10 @@ static inline unsigned int mm_cid_size(void)
> >  	return 0;
> >  }
> >  static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
> > +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> > +{
> > +	return false;
> > +}
> >  #endif /* CONFIG_SCHED_MM_CID */
> >
> >  struct mmu_gather;
> > diff --git a/include/linux/sched.h b/include/linux/sched.h
> > index aa9c5be7a6325..a75f61cea2271 100644
> > --- a/include/linux/sched.h
> > +++ b/include/linux/sched.h
> > @@ -1428,7 +1428,7 @@ struct task_struct {
> >  	int			last_mm_cid;	/* Most recent cid in mm */
> >  	int			migrate_from_cpu;
> >  	int			mm_cid_active;	/* Whether cid bitmap is active */
> > -	struct callback_head	cid_work;
> > +	unsigned long		last_cid_reset;	/* Time of last reset in jiffies */
> >  #endif
> >
> >  	struct tlbflush_unmap_batch	tlb_ubc;
> > diff --git a/kernel/rseq.c b/kernel/rseq.c
> > index b7a1ec327e811..100f81e330dc6 100644
> > --- a/kernel/rseq.c
> > +++ b/kernel/rseq.c
> > @@ -441,6 +441,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
> >  	}
> >  	if (unlikely(rseq_update_cpu_node_id(t)))
> >  		goto error;
> > +	/* The mm_cid compaction returns prematurely if scan is not needed. */
> > +	task_mm_cid_work(t);
> >  	return;
> >
> >  error:
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 81c6df746df17..27b856a1cb0a9 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -10589,22 +10589,13 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
> >  	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
> >  }
> >
> > -static void task_mm_cid_work(struct callback_head *work)
> > +void task_mm_cid_work(struct task_struct *t)
> >  {
> >  	unsigned long now = jiffies, old_scan, next_scan;
> > -	struct task_struct *t = current;
> >  	struct cpumask *cidmask;
> > -	struct mm_struct *mm;
> >  	int weight, cpu;
> > +	struct mm_struct *mm = t->mm;
> >
> > -	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
> > -
> > -	work->next = work;	/* Prevent double-add */
> > -	if (t->flags & PF_EXITING)
> > -		return;
> > -	mm = t->mm;
> > -	if (!mm)
> > -		return;
> >  	old_scan = READ_ONCE(mm->mm_cid_next_scan);
> >  	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> >  	if (!old_scan) {
> > @@ -10643,23 +10634,47 @@ void init_sched_mm_cid(struct task_struct *t)
> >  		if (mm_users == 1)
> >  			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> >  	}
> > -	t->cid_work.next = &t->cid_work;	/* Protect against double add */
> > -	init_task_work(&t->cid_work, task_mm_cid_work);
> >  }
> >
> >  void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
> >  {
> > -	struct callback_head *work = &curr->cid_work;
> > -	unsigned long now = jiffies;
> > +	u64 rtime = curr->se.sum_exec_runtime - curr->se.prev_sum_exec_runtime;
> >
> > +	/*
> > +	 * If a task is running unpreempted for a long time, it won't get its
> > +	 * mm_cid compacted and won't update its mm_cid value after a
> > +	 * compaction occurs.
> > +	 * For such a task, this function does two things:
> > +	 * A) trigger the mm_cid recompaction,
> > +	 * B) trigger an update of the task's rseq->mm_cid field at some point
> > +	 * after recompaction, so it can get a mm_cid value closer to 0.
> > +	 * A change in the mm_cid triggers an rseq_preempt.
> > +	 *
> > +	 * B occurs once after the compaction work completes, neither A nor B
> > +	 * run as long as the compaction work is pending, the task is exiting
> > +	 * or is not a userspace task.
> > +	 */
> >  	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
> > -	    work->next != work)
> > +	    test_tsk_thread_flag(curr, TIF_NOTIFY_RESUME))
> >  		return;
> > -	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
> > +	if (rtime < RSEQ_UNPREEMPTED_THRESHOLD)
> >  		return;
> > -
> > -	/* No page allocation under rq lock */
> > -	task_work_add(curr, work, TWA_RESUME);
> > +	if (mm_cid_needs_scan(curr->mm)) {
> > +		/* Trigger mm_cid recompaction */
> > +		rseq_set_notify_resume(curr);
> > +	} else if (time_after(jiffies, curr->last_cid_reset +
> > +			      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> > +		/* Update mm_cid field */
> > +		int old_cid = curr->mm_cid;
> > +
> > +		if (!curr->mm_cid_active)
> > +			return;
> > +		mm_cid_snapshot_time(rq, curr->mm);
> > +		mm_cid_put_lazy(curr);
> > +		curr->last_mm_cid = curr->mm_cid = mm_cid_get(rq, curr, curr->mm);
> > +		if (old_cid != curr->mm_cid)
> > +			rseq_preempt(curr);
> > +	}
> >  }
> >
> >  void sched_mm_cid_exit_signals(struct task_struct *t)
> > diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> > index 475bb5998295e..90a5b58188232 100644
> > --- a/kernel/sched/sched.h
> > +++ b/kernel/sched/sched.h
> > @@ -3606,6 +3606,7 @@ extern const char *preempt_modes[];
> >
> >  #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
> >  #define MM_CID_SCAN_DELAY	100			/* 100ms */
> > +#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
> >
> >  extern raw_spinlock_t cid_lock;
> >  extern int use_cid_lock;
> > @@ -3809,6 +3810,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
> >  	int cid;
> >
> >  	lockdep_assert_rq_held(rq);
> > +	t->last_cid_reset = jiffies;
> >  	cpumask = mm_cidmask(mm);
> >  	cid = __this_cpu_read(pcpu_cid->cid);
> >  	if (mm_cid_is_valid(cid)) {
>