Message-ID: <82c5e8c3-09a5-499a-bae2-7f5c70725a29@efficios.com>
Date: Thu, 10 Apr 2025 10:04:54 -0400
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Gabriele Monaco <gmonaco@...hat.com>, peterz@...radead.org,
 Ingo Molnar <mingo@...hat.com>, linux-kernel@...r.kernel.org
Cc: akpm@...ux-foundation.org, linux-mm@...ck.org, paulmck@...nel.org,
 shuah@...nel.org
Subject: Re: [PATCH] fixup: [PATCH v12 2/3] sched: Move task_mm_cid_work to mm
 work_struct

On 2025-04-10 08:50, Gabriele Monaco wrote:
> Thanks to both of you for the comments; I tried to implement what
> Mathieu suggested. This patch applies directly on top of 2/3, but I'm
> sending it here first to get feedback.
> 
> Essentially, I refactored a bit to avoid the need to add more
> dependencies to rseq: the rseq_tick is now called task_tick_mm_cid (as
> before the series) and it does the two things you mentioned:
>   * A) trigger the mm_cid recompaction
>   * B) trigger an update of the task's rseq->mm_cid field at some point
>     after recompaction, so it can get a mm_cid value closer to 0.
> 
> Now, A occurs only after the scan time has elapsed, which means it could
> potentially run multiple times if the work is not scheduled before the
> next tick. I'm not sure adding more checks to make sure it happens once
> and only once really makes sense here.

The scan is gated by two checks now:

> +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> +		if (mm_cid_needs_scan(t->mm))

And likewise for the periodic check for preemption:

> +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
[...]
> +		else if (time_after(jiffies, t->last_rseq_preempt +
> +				      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {

Those second-level time checks prevent adding significant overhead on
every tick once the threshold is reached.
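
(As a rough order of magnitude: with HZ=250 a tick is 4ms, so a task that
stays unpreempted crosses the 100ms threshold after about 25 ticks; from
then on the time_after() check lets the else branch body run at most about
once per MM_CID_SCAN_DELAY (100ms) rather than on every tick.)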

> 
> B occurs after the work updates the last scan time, so we are in a
> condition where the runtime is above the threshold but the (next) scan
> time has not expired yet.
> I tried to account for multiple threads updating the mm_cid (not
> necessarily the long-running one, or in case several are long running).
> For this I'm tracking the last time we updated the mm_cid; if that
> occurred before the last mm_cid scan, we need to update (and preempt).
> 
> Does this make sense to you?

It makes sense. Note that it adds overhead to rseq_preempt() (a store
to t->last_rseq_preempt), which is a fast path. I don't know if we
should care.

Also, part of task_tick_mm_cid could be moved to a helper, e.g.:

static
void task_reset_mm_cid(struct rq *rq, struct task_struct *t)
{
        int old_cid = t->mm_cid;

        if (!t->mm_cid_active)
                return;
        /*
         * Drop the current cid lazily and pick up a new one, which may be
         * closer to 0 after a compaction.
         */
        mm_cid_snapshot_time(rq, t->mm);
        mm_cid_put_lazy(t);
        t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
        if (old_cid == t->mm_cid)
                return;
        /* The cid changed: notify userspace through rseq. */
        rseq_preempt(t);
}
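
The tick path quoted above could then just call it, e.g. (rough sketch,
untested, reusing the names from your patch):

        if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
                if (mm_cid_needs_scan(t->mm))
                        rseq_set_notify_resume(t);
                else if (time_after(jiffies, t->last_rseq_preempt +
                                    msecs_to_jiffies(MM_CID_SCAN_DELAY)))
                        task_reset_mm_cid(rq, t);
        }

One difference with your hunk: when the cid is unchanged, the helper
returns without touching t->last_rseq_preempt, so the else branch would be
re-evaluated on the following ticks; if that is not wanted, the
unchanged-cid case can keep refreshing last_rseq_preempt as in your code.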

Thanks,

Mathieu

> 
> Thanks,
> Gabriele
> 
> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> ---
>   include/linux/rseq.h  | 14 +-------------
>   include/linux/sched.h |  1 +
>   kernel/sched/core.c   | 42 +++++++++++++++++++++++++++++++++++++++++-
>   kernel/sched/sched.h  |  3 +++
>   4 files changed, 46 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/rseq.h b/include/linux/rseq.h
> index d20fd72f4c80d..7e3fa2ae9e7a4 100644
> --- a/include/linux/rseq.h
> +++ b/include/linux/rseq.h
> @@ -7,8 +7,6 @@
>   #include <linux/preempt.h>
>   #include <linux/sched.h>
>   
> -#define RSEQ_UNPREEMPTED_THRESHOLD	(100ULL * 1000000)	/* 100ms */
> -
>   /*
>    * Map the event mask on the user-space ABI enum rseq_cs_flags
>    * for direct mask checks.
> @@ -54,14 +52,7 @@ static inline void rseq_preempt(struct task_struct *t)
>   {
>   	__set_bit(RSEQ_EVENT_PREEMPT_BIT, &t->rseq_event_mask);
>   	rseq_set_notify_resume(t);
> -}
> -
> -static inline void rseq_preempt_from_tick(struct task_struct *t)
> -{
> -	u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> -
> -	if (rtime > RSEQ_UNPREEMPTED_THRESHOLD)
> -		rseq_preempt(t);
> +	t->last_rseq_preempt = jiffies;
>   }
>   
>   /* rseq_migrate() requires preemption to be disabled. */
> @@ -114,9 +105,6 @@ static inline void rseq_signal_deliver(struct ksignal *ksig,
>   static inline void rseq_preempt(struct task_struct *t)
>   {
>   }
> -static inline void rseq_preempt_from_tick(struct task_struct *t)
> -{
> -}
>   static inline void rseq_migrate(struct task_struct *t)
>   {
>   }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 851933e62bed3..5b057095d5dc0 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1424,6 +1424,7 @@ struct task_struct {
>   	int				last_mm_cid;	/* Most recent cid in mm */
>   	int				migrate_from_cpu;
>   	int				mm_cid_active;	/* Whether cid bitmap is active */
> +	unsigned long			last_rseq_preempt; /* Time of last preempt in jiffies */
>   #endif
>   
>   	struct tlbflush_unmap_batch	tlb_ubc;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 52ad709094167..9f0c9cc284804 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -5663,7 +5663,7 @@ void sched_tick(void)
>   		resched_latency = cpu_resched_latency(rq);
>   	calc_global_load_tick(rq);
>   	sched_core_tick(rq);
> -	rseq_preempt_from_tick(donor);
> +	task_tick_mm_cid(rq, donor);
>   	scx_tick(rq);
>   
>   	rq_unlock(rq, &rf);
> @@ -10618,6 +10618,46 @@ void init_sched_mm_cid(struct task_struct *t)
>   	}
>   }
>   
> +void task_tick_mm_cid(struct rq *rq, struct task_struct *t)
> +{
> +	u64 rtime = t->se.sum_exec_runtime - t->se.prev_sum_exec_runtime;
> +
> +	/*
> +	 * If a task is running unpreempted for a long time, it won't get its
> +	 * mm_cid compacted and won't update its mm_cid value after a
> +	 * compaction occurs.
> +	 * For such a task, this function does two things:
> +	 * A) trigger the mm_cid recompaction,
> +	 * B) trigger an update of the task's rseq->mm_cid field at some point
> +	 * after recompaction, so it can get a mm_cid value closer to 0.
> +	 * A change in the mm_cid triggers an rseq_preempt.
> +	 *
> +	 * A occurs only after the next scan time elapsed but before the
> +	 * compaction work is actually scheduled.
> +	 * B occurs once after the compaction work completes, that is when scan
> +	 * is no longer needed (it occurred for this mm) but the last rseq
> +	 * preempt was done before the last mm_cid scan.
> +	 */
> +	if (t->mm && rtime > RSEQ_UNPREEMPTED_THRESHOLD) {
> +		if (mm_cid_needs_scan(t->mm))
> +			rseq_set_notify_resume(t);
> +		else if (time_after(jiffies, t->last_rseq_preempt +
> +				      msecs_to_jiffies(MM_CID_SCAN_DELAY))) {
> +			int old_cid = t->mm_cid;
> +
> +			if (!t->mm_cid_active)
> +				return;
> +			mm_cid_snapshot_time(rq, t->mm);
> +			mm_cid_put_lazy(t);
> +			t->last_mm_cid = t->mm_cid = mm_cid_get(rq, t, t->mm);
> +			if (old_cid == t->mm_cid)
> +				t->last_rseq_preempt = jiffies;
> +			else
> +				rseq_preempt(t);
> +		}
> +	}
> +}
> +
>   /* Call only when curr is a user thread. */
>   void task_queue_mm_cid(struct task_struct *curr)
>   {
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 1703cd16d5433..7d104d12ed974 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3582,12 +3582,14 @@ extern const char *preempt_modes[];
>   
>   #define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
>   #define MM_CID_SCAN_DELAY	100			/* 100ms */
> +#define RSEQ_UNPREEMPTED_THRESHOLD	SCHED_MM_CID_PERIOD_NS
>   
>   extern raw_spinlock_t cid_lock;
>   extern int use_cid_lock;
>   
>   extern void sched_mm_cid_migrate_from(struct task_struct *t);
>   extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
> +extern void task_tick_mm_cid(struct rq *rq, struct task_struct *t);
>   extern void init_sched_mm_cid(struct task_struct *t);
>   
>   static inline void __mm_cid_put(struct mm_struct *mm, int cid)
> @@ -3856,6 +3858,7 @@ static inline void switch_mm_cid(struct rq *rq,
>   static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
>   static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
>   static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
> +static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *t) { }
>   static inline void init_sched_mm_cid(struct task_struct *t) { }
>   #endif /* !CONFIG_SCHED_MM_CID */
>   
> 
> base-commit: c59c19fcfad857c96effa3b2e9eb6d934d2380d8


-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com
