Message-ID: <e53ac875-da6a-42ce-8714-d74f77775279@efficios.com>
Date: Mon, 14 Apr 2025 11:28:17 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Gabriele Monaco <gmonaco@...hat.com>, linux-kernel@...r.kernel.org,
 Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.org>
Subject: Re: [PATCH v13 2/3] sched: Move task_mm_cid_work to mm work_struct

On 2025-04-14 08:36, Gabriele Monaco wrote:
> Currently, the task_mm_cid_work function is called in a task work
> triggered by a scheduler tick to periodically compact the mm_cids of
> each process. This can delay the execution of the corresponding thread
> for the entire duration of the function, hurting the response time of
> real-time tasks. In practice, we observe task_mm_cid_work adding
> 30-35us of latency on a 128-core system, an order of magnitude that is
> meaningful under PREEMPT_RT.
> 
> Run task_mm_cid_work in a new work_struct attached to the mm_struct
> rather than in the task context before returning to userspace.
> 
> This work_struct is initialised together with the mm and disabled
> before the mm is freed. The work is queued while returning to userspace
> in __rseq_handle_notify_resume, keeping the checks that prevent it from
> running more often than MM_CID_SCAN_DELAY.
> To make sure this also happens predictably for long-running tasks, we
> trigger a call to __rseq_handle_notify_resume from the scheduler tick
> as well, once the runtime exceeds a 100ms threshold.
> 
> The main advantage of this change is that the function can be offloaded
> to a different CPU and even preempted by RT tasks.
> 
> Moreover, the new behaviour is more predictable for periodic tasks
> with short runtimes, which may rarely run during a scheduler tick: now
> the work is always scheduled when the task returns to userspace.
> 
> The work is disabled during mmdrop; since the function cannot sleep in
> all kernel configurations, we cannot wait for possibly running work
> items to terminate. We make sure the mm stays valid while the task is
> terminating by taking a reference with mmgrab/mmdrop, and return early
> from the work if we turn out to be the last user by the time it runs.
> This situation is unlikely, since we don't schedule the work for
> exiting tasks, but we cannot rule it out.
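
Side note for readers less familiar with the mm_cid lifetime rules
summarized above: the two key ingredients are a cmpxchg on the next-scan
deadline, so that only one queuer wins per MM_CID_SCAN_DELAY window, and
a reference taken on the mm before queuing, so the work item can still
inspect it and bail out if it happens to be the last user. Below is a
minimal user-space sketch of that pattern; the fake_mm type, the pthread
worker and the nanosecond clock are illustrative assumptions, not code
from this patch:

/* Illustrative user-space analogy of the deferred, throttled scan. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

struct fake_mm {
	atomic_int refcount;		/* plays the role of mm_count (mmgrab/mmdrop) */
	atomic_ulong next_scan_ns;	/* plays the role of mm_cid_next_scan */
};

static unsigned long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)ts.tv_sec * 1000000000UL + ts.tv_nsec;
}

static void fake_mm_put(struct fake_mm *mm)
{
	if (atomic_fetch_sub(&mm->refcount, 1) == 1)
		printf("last reference dropped, mm would be freed here\n");
}

/* Deferred "compaction": return early if we are the last user. */
static void *scan_worker(void *arg)
{
	struct fake_mm *mm = arg;

	if (atomic_load(&mm->refcount) == 1)
		goto out_drop;	/* owner already gone, nothing to compact */
	printf("compacting cids for mm %p\n", (void *)mm);
out_drop:
	fake_mm_put(mm);
	return NULL;
}

/* Called on return to "userspace": at most one scan per delay window. */
static void maybe_queue_scan(struct fake_mm *mm, unsigned long delay_ns)
{
	unsigned long old = atomic_load(&mm->next_scan_ns);
	unsigned long next = now_ns() + delay_ns;
	pthread_t worker;

	if (old && now_ns() < old)
		return;		/* delay window has not elapsed yet */
	if (!atomic_compare_exchange_strong(&mm->next_scan_ns, &old, next))
		return;		/* another path already queued the scan */
	atomic_fetch_add(&mm->refcount, 1);	/* "mmgrab": keep mm alive for the worker */
	pthread_create(&worker, NULL, scan_worker, mm);
	pthread_detach(worker);
}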

The implementation looks good to me. Peter, how does it look from your end?

Thanks,

Mathieu

> 
> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> ---
>   include/linux/mm_types.h | 26 ++++++++++++++
>   include/linux/sched.h    |  8 ++++-
>   kernel/rseq.c            |  2 ++
>   kernel/sched/core.c      | 75 ++++++++++++++++++++++++++--------------
>   kernel/sched/sched.h     |  6 ++--
>   5 files changed, 89 insertions(+), 28 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 56d07edd01f91..e4ae9295508cf 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -982,6 +982,10 @@ struct mm_struct {
>   		 * mm nr_cpus_allowed updates.
>   		 */
>   		raw_spinlock_t cpus_allowed_lock;
> +		/*
> +		 * @cid_work: Work item to run the mm_cid scan.
> +		 */
> +		struct work_struct cid_work;
>   #endif
>   #ifdef CONFIG_MMU
>   		atomic_long_t pgtables_bytes;	/* size of all page tables */
> @@ -1282,6 +1286,8 @@ enum mm_cid_state {
>   	MM_CID_LAZY_PUT = (1U << 31),
>   };
>   
> +extern void task_mm_cid_work(struct work_struct *work);
> +
>   static inline bool mm_cid_is_unset(int cid)
>   {
>   	return cid == MM_CID_UNSET;
> @@ -1354,12 +1360,14 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
>   	if (!mm->pcpu_cid)
>   		return -ENOMEM;
>   	mm_init_cid(mm, p);
> +	INIT_WORK(&mm->cid_work, task_mm_cid_work);
>   	return 0;
>   }
>   #define mm_alloc_cid(...)	alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
>   
>   static inline void mm_destroy_cid(struct mm_struct *mm)
>   {
> +	disable_work(&mm->cid_work);
>   	free_percpu(mm->pcpu_cid);
>   	mm->pcpu_cid = NULL;
>   }
> @@ -1381,6 +1389,16 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
>   	WRITE_ONCE(mm->nr_cpus_allowed, cpumask_weight(mm_allowed));
>   	raw_spin_unlock(&mm->cpus_allowed_lock);
>   }
> +
> +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> +{
> +	return mm && !time_before(jiffies, READ_ONCE(mm->mm_cid_next_scan));
> +}
> +
> +static inline bool mm_cid_scan_pending(struct mm_struct *mm)
> +{
> +	return mm && work_pending(&mm->cid_work);
> +}
>   #else /* CONFIG_SCHED_MM_CID */
>   static inline void mm_init_cid(struct mm_struct *mm, struct task_struct *p) { }
>   static inline int mm_alloc_cid(struct mm_struct *mm, struct task_struct *p) { return 0; }
> @@ -1391,6 +1409,14 @@ static inline unsigned int mm_cid_size(void)
>   	return 0;
>   }
>   static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
> +static inline bool mm_cid_needs_scan(struct mm_struct *mm)
> +{
> +	return false;
> +}
> +static inline bool mm_cid_scan_pending(struct mm_struct *mm)
> +{
> +	return false;
> +}
>   #endif /* CONFIG_SCHED_MM_CID */
>   
>   struct mmu_gather;
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index f96ac19828934..3ffdb96ef6b0a 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1424,7 +1424,7 @@ struct task_struct {
>   	int				last_mm_cid;	/* Most recent cid in mm */
>   	int				migrate_from_cpu;
>   	int				mm_cid_active;	/* Whether cid bitmap is active */
> -	struct callback_head		cid_work;
> +	unsigned long			last_cid_reset;	/* Time of last reset in jiffies */
>   #endif
>   
>   	struct tlbflush_unmap_batch	tlb_ubc;
> @@ -2281,4 +2281,10 @@ static __always_inline void alloc_tag_restore(struct alloc_tag *tag, struct allo
>   #define alloc_tag_restore(_tag, _old)		do {} while (0)
>   #endif
>   
> +#ifdef CONFIG_SCHED_MM_CID
> +extern void task_queue_mm_cid(struct task_struct *curr);
> +#else
> +static inline void task_queue_mm_cid(struct task_struct *curr) { }
> +#endif
> +
>   #endif
> diff --git a/kernel/rseq.c b/kernel/rseq.c
> index b7a1ec327e811..383db2ccad4d0 100644
> --- a/kernel/rseq.c
> +++ b/kernel/rseq.c
> @@ -441,6 +441,8 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
>   	}
>   	if (unlikely(rseq_update_cpu_node_id(t)))
>   		goto error;
> +	if (mm_cid_needs_scan(t->mm))
> +		task_queue_mm_cid(t);
>   	return;
>   
>   error:
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c81cf642dba05..02b18649e6a09 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -10566,22 +10566,16 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>   	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
>   }
>   
> -static void task_mm_cid_work(struct callback_head *work)
> +void task_mm_cid_work(struct work_struct *work)
>   {
>   	unsigned long now = jiffies, old_scan, next_scan;
> -	struct task_struct *t = current;
>   	struct cpumask *cidmask;
> -	struct mm_struct *mm;
> +	struct mm_struct *mm = container_of(work, struct mm_struct, cid_work);
>   	int weight, cpu;
>   
> -	WARN_ON_ONCE(t != container_of(work, struct task_struct, cid_work));
> -
> -	work->next = work;	/* Prevent double-add */
> -	if (t->flags & PF_EXITING)
> -		return;
> -	mm = t->mm;
> -	if (!mm)
> -		return;
> +	/* We are the last user, process already terminated. */
> +	if (atomic_read(&mm->mm_count) == 1)
> +		goto out_drop;
>   	old_scan = READ_ONCE(mm->mm_cid_next_scan);
>   	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>   	if (!old_scan) {
> @@ -10594,9 +10588,9 @@ static void task_mm_cid_work(struct callback_head *work)
>   			old_scan = next_scan;
>   	}
>   	if (time_before(now, old_scan))
> -		return;
> +		goto out_drop;
>   	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
> -		return;
> +		goto out_drop;
>   	cidmask = mm_cidmask(mm);
>   	/* Clear cids that were not recently used. */
>   	for_each_possible_cpu(cpu)
> @@ -10608,6 +10602,8 @@ static void task_mm_cid_work(struct callback_head *work)
>   	 */
>   	for_each_possible_cpu(cpu)
>   		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> +out_drop:
> +	mmdrop(mm);
>   }
>   
>   void init_sched_mm_cid(struct task_struct *t)
> @@ -10620,23 +10616,52 @@ void init_sched_mm_cid(struct task_struct *t)
>   		if (mm_users == 1)
>   			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
>   	}
> -	t->cid_work.next = &t->cid_work;	/* Protect against double add */
> -	init_task_work(&t->cid_work, task_mm_cid_work);
>   }
>   
> -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
> +void task_tick_mm_cid(struct rq *rq, struct task_struct *t)
>   {
> -	struct callback_head *work = &curr->cid_work;
> -	unsigned long now = jiffies;
> +	u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
>   
> -	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
> -	    work->next != work)
> -		return;
> -	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
> -		return;
> +	/*
> +	 * If a task is running unpreempted for a long time, it won't get its
> +	 * mm_cid compacted and won't update its mm_cid value after a
> +	 * compaction occurs.
> +	 * For such a task, this function does two things:
> +	 * A) trigger the mm_cid recompaction,
> +	 * B) trigger an update of the task's rseq->mm_cid field at some point
> +	 * after recompaction, so it can get a mm_cid value closer to 0.
> +	 * A change in the mm_cid triggers an rseq_preempt.
> +	 *
> +	 * A occurs only once after the scan time elapsed, until the next scan
> +	 * expires as well.
> +	 * B occurs once after the compaction work completes, that is when scan
> +	 * is no longer needed (it occurred for this mm) but the last rseq
> +	 * preempt was done before the last mm_cid scan.
> +	 */
> +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> +		if (mm_cid_needs_scan(t->mm) && !mm_cid_scan_pending(t->mm))
> +			rseq_set_notify_resume(t);
> +		else if (time_after(jiffies, t->last_cid_reset +
> +				      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> +			int old_cid = t->mm_cid;
> +
> +			if (!t->mm_cid_active)
> +				return;
> +			mm_cid_snapshot_time(rq, t->mm);
> +			mm_cid_put_lazy(t);
> +			t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
> +			if (old_cid != t->mm_cid)
> +				rseq_preempt(t);
> +		}
> +	}
> +}
>   
> -	/* No page allocation under rq lock */
> -	task_work_add(curr, work, TWA_RESUME);
> +/* Call only when curr is a user thread. */
> +void task_queue_mm_cid(struct task_struct *curr)
> +{
> +	/* Ensure the mm exists when we run. */
> +	mmgrab(curr->mm);
> +	queue_work(system_unbound_wq, &curr->mm->cid_work);
>   }
>   
>   void sched_mm_cid_exit_signals(struct task_struct *t)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 47972f34ea701..c0f8fd4c575c3 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3582,13 +3582,14 @@ extern const char *preempt_modes[];
>   
>   #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
>   #define MM_CID_SCAN_DELAY	100			/* 100ms */
> +#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
>   
>   extern raw_spinlock_t cid_lock;
>   extern int use_cid_lock;
>   
>   extern void sched_mm_cid_migrate_from(struct task_struct *t);
>   extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
> -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
> +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *t);
>   extern void init_sched_mm_cid(struct task_struct *t);
>   
>   static inline void __mm_cid_put(struct mm_struct *mm, int cid)
> @@ -3798,6 +3799,7 @@ static inline int mm_cid_get(struct rq *rq, struct task_struct *t,
>   	cid = __mm_cid_get(rq, t, mm);
>   	__this_cpu_write(pcpu_cid->cid, cid);
>   	__this_cpu_write(pcpu_cid->recent_cid, cid);
> +	t->last_cid_reset = jiffies;
>   
>   	return cid;
>   }
> @@ -3857,7 +3859,7 @@ static inline void switch_mm_cid(struct rq *rq,
>   static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
>   static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
>   static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
> -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
> +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *t) { }
>   static inline void init_sched_mm_cid(struct task_struct *t) { }
>   #endif /* !CONFIG_SCHED_MM_CID */
>   


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
