linux-kernel - Re: [PATCH v3 2/3] sched: Move task_mm_cid_work to mm delayed work

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <523be97d-d572-42b8-b0b8-504cf5f32366@efficios.com>
Date: Tue, 24 Dec 2024 11:03:22 -0500
From: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>
To: Gabriele Monaco <gmonaco@...hat.com>,
 Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...nel.org>,
 linux-mm@...ck.org, linux-kernel@...r.kernel.org
Cc: Juri Lelli <juri.lelli@...hat.com>,
 Andrew Morton <akpm@...ux-foundation.org>
Subject: Re: [PATCH v3 2/3] sched: Move task_mm_cid_work to mm delayed work

On 2024-12-16 08:09, Gabriele Monaco wrote:
> Currently, the task_mm_cid_work function is called in a task work
> triggered by a scheduler tick to frequently compact the mm_cids of each
> process. This can delay the execution of the corresponding thread for
> the entire duration of the function, negatively affecting the response
> in case of real-time tasks. In practice, we observe task_mm_cid_work
> increasing the latency by 30-35us on a 128-core system; this order of
> magnitude is meaningful under PREEMPT_RT.
> 
> This patch runs the task_mm_cid_work in a new delayed work connected to
> the mm_struct rather than in the task context before returning to
> userspace.
> 
> This delayed work is initialised while allocating the mm and disabled
> before freeing it. Its execution is no longer triggered by scheduler
> ticks but runs periodically based on the defined MM_CID_SCAN_DELAY.
> 
> The main advantage of this change is that the function can be offloaded
> to a different CPU and even preempted by RT tasks.
> 
> Moreover, this new behaviour could be more predictable with periodic
> tasks with short runtime, which may rarely run during a scheduler tick.
> Now, the work is always scheduled with the same periodicity for each mm
> (the periodicity is not guaranteed due to interference from other
> tasks, but mm_cid compaction is mostly best effort anyway).
> 
> To avoid excessively increased runtime, we quickly return from the
> function if we have no work to be done (i.e. no mm_cid is allocated).
> This is helpful for tasks that sleep for a long time, but also for
> terminated tasks. We are no longer following the process' state, hence
> the function continues to run after a process terminates but before its
> mm is freed.
> 
> Fixes: 223baf9d17f2 ("sched: Fix performance regression introduced by mm_cid")
> To: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>

Reviewed-by: Mathieu Desnoyers <mathieu.desnoyers@...icios.com>

> Cc: Peter Zijlstra <peterz@...radead.org>
> Cc: Ingo Molnar <mingo@...nel.org>
> Cc: Andrew Morton <akpm@...ux-foundation.org>
> Signed-off-by: Gabriele Monaco <gmonaco@...hat.com>
> ---
>   include/linux/mm_types.h | 16 ++++++----
>   include/linux/sched.h    |  1 -
>   kernel/sched/core.c      | 66 +++++-----------------------------------
>   kernel/sched/sched.h     |  7 -----
>   4 files changed, 18 insertions(+), 72 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index d56948a74254..16076e70a6b9 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -829,12 +829,6 @@ struct mm_struct {
>   		 * runqueue locks.
>   		 */
>   		struct mm_cid __percpu *pcpu_cid;
> -		/*
> -		 * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
> -		 *
> -		 * When the next mm_cid scan is due (in jiffies).
> -		 */
> -		unsigned long mm_cid_next_scan;
>   		/**
>   		 * @nr_cpus_allowed: Number of CPUs allowed for mm.
>   		 *
> @@ -857,6 +851,7 @@ struct mm_struct {
>   		 * mm nr_cpus_allowed updates.
>   		 */
>   		raw_spinlock_t cpus_allowed_lock;
> +		struct delayed_work mm_cid_work;
>   #endif
>   #ifdef CONFIG_MMU
>   		atomic_long_t pgtables_bytes;	/* size of all page tables */
> @@ -1145,11 +1140,16 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
>   
>   #ifdef CONFIG_SCHED_MM_CID
>   
> +#define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
> +#define MM_CID_SCAN_DELAY	100			/* 100ms */
> +
>   enum mm_cid_state {
>   	MM_CID_UNSET = -1U,		/* Unset state has lazy_put flag set. */
>   	MM_CID_LAZY_PUT = (1U << 31),
>   };
>   
> +extern void task_mm_cid_work(struct work_struct *work);
> +
>   static inline bool mm_cid_is_unset(int cid)
>   {
>   	return cid == MM_CID_UNSET;
> @@ -1222,12 +1222,16 @@ static inline int mm_alloc_cid_noprof(struct mm_struct *mm, struct task_struct *
>   	if (!mm->pcpu_cid)
>   		return -ENOMEM;
>   	mm_init_cid(mm, p);
> +	INIT_DELAYED_WORK(&mm->mm_cid_work, task_mm_cid_work);
> +	schedule_delayed_work(&mm->mm_cid_work,
> +			      msecs_to_jiffies(MM_CID_SCAN_DELAY));
>   	return 0;
>   }
>   #define mm_alloc_cid(...)	alloc_hooks(mm_alloc_cid_noprof(__VA_ARGS__))
>   
>   static inline void mm_destroy_cid(struct mm_struct *mm)
>   {
> +	disable_delayed_work_sync(&mm->mm_cid_work);
>   	free_percpu(mm->pcpu_cid);
>   	mm->pcpu_cid = NULL;
>   }
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index d380bffee2ef..5d141c310917 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1374,7 +1374,6 @@ struct task_struct {
>   	int				last_mm_cid;	/* Most recent cid in mm */
>   	int				migrate_from_cpu;
>   	int				mm_cid_active;	/* Whether cid bitmap is active */
> -	struct callback_head		cid_work;
>   #endif
>   
>   	struct tlbflush_unmap_batch	tlb_ubc;
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index c6d8232ad9ee..30d78fe14eff 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4516,7 +4516,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
>   	p->wake_entry.u_flags = CSD_TYPE_TTWU;
>   	p->migration_pending = NULL;
>   #endif
> -	init_sched_mm_cid(p);
>   }
>   
>   DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> @@ -5654,7 +5653,6 @@ void sched_tick(void)
>   		resched_latency = cpu_resched_latency(rq);
>   	calc_global_load_tick(rq);
>   	sched_core_tick(rq);
> -	task_tick_mm_cid(rq, donor);
>   	scx_tick(rq);
>   
>   	rq_unlock(rq, &rf);
> @@ -10520,38 +10518,17 @@ static void sched_mm_cid_remote_clear_weight(struct mm_struct *mm, int cpu,
>   	sched_mm_cid_remote_clear(mm, pcpu_cid, cpu);
>   }
>   
> -static void task_mm_cid_work(struct callback_head *work)
> +void task_mm_cid_work(struct work_struct *work)
>   {
> -	unsigned long now = jiffies, old_scan, next_scan;
> -	struct task_struct *t = current;
>   	struct cpumask *cidmask;
> -	struct mm_struct *mm;
> +	struct delayed_work *delayed_work = container_of(work, struct delayed_work, work);
> +	struct mm_struct *mm = container_of(delayed_work, struct mm_struct, mm_cid_work);
>   	int weight, cpu;
>   
> -	SCHED_WARN_ON(t != container_of(work, struct task_struct, cid_work));
> -
> -	work->next = work;	/* Prevent double-add */
> -	if (t->flags & PF_EXITING)
> -		return;
> -	mm = t->mm;
> -	if (!mm)
> -		return;
> -	old_scan = READ_ONCE(mm->mm_cid_next_scan);
> -	next_scan = now + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> -	if (!old_scan) {
> -		unsigned long res;
> -
> -		res = cmpxchg(&mm->mm_cid_next_scan, old_scan, next_scan);
> -		if (res != old_scan)
> -			old_scan = res;
> -		else
> -			old_scan = next_scan;
> -	}
> -	if (time_before(now, old_scan))
> -		return;
> -	if (!try_cmpxchg(&mm->mm_cid_next_scan, &old_scan, next_scan))
> -		return;
>   	cidmask = mm_cidmask(mm);
> +	/* Nothing to clear for now */
> +	if (cpumask_empty(cidmask))
> +		goto out;
>   	/* Clear cids that were not recently used. */
>   	for_each_possible_cpu(cpu)
>   		sched_mm_cid_remote_clear_old(mm, cpu);
> @@ -10562,35 +10539,8 @@ static void task_mm_cid_work(struct callback_head *work)
>   	 */
>   	for_each_possible_cpu(cpu)
>   		sched_mm_cid_remote_clear_weight(mm, cpu, weight);
> -}
> -
> -void init_sched_mm_cid(struct task_struct *t)
> -{
> -	struct mm_struct *mm = t->mm;
> -	int mm_users = 0;
> -
> -	if (mm) {
> -		mm_users = atomic_read(&mm->mm_users);
> -		if (mm_users == 1)
> -			mm->mm_cid_next_scan = jiffies + msecs_to_jiffies(MM_CID_SCAN_DELAY);
> -	}
> -	t->cid_work.next = &t->cid_work;	/* Protect against double add */
> -	init_task_work(&t->cid_work, task_mm_cid_work);
> -}
> -
> -void task_tick_mm_cid(struct rq *rq, struct task_struct *curr)
> -{
> -	struct callback_head *work = &curr->cid_work;
> -	unsigned long now = jiffies;
> -
> -	if (!curr->mm || (curr->flags & (PF_EXITING | PF_KTHREAD)) ||
> -	    work->next != work)
> -		return;
> -	if (time_before(now, READ_ONCE(curr->mm->mm_cid_next_scan)))
> -		return;
> -
> -	/* No page allocation under rq lock */
> -	task_work_add(curr, work, TWA_RESUME | TWAF_NO_ALLOC);
> +out:
> +	schedule_delayed_work(delayed_work, msecs_to_jiffies(MM_CID_SCAN_DELAY));
>   }
>   
>   void sched_mm_cid_exit_signals(struct task_struct *t)
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index b50dcd908702..f3b0d1d86622 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -3581,16 +3581,11 @@ extern void sched_dynamic_update(int mode);
>   
>   #ifdef CONFIG_SCHED_MM_CID
>   
> -#define SCHED_MM_CID_PERIOD_NS	(100ULL * 1000000)	/* 100ms */
> -#define MM_CID_SCAN_DELAY	100			/* 100ms */
> -
>   extern raw_spinlock_t cid_lock;
>   extern int use_cid_lock;
>   
>   extern void sched_mm_cid_migrate_from(struct task_struct *t);
>   extern void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t);
> -extern void task_tick_mm_cid(struct rq *rq, struct task_struct *curr);
> -extern void init_sched_mm_cid(struct task_struct *t);
>   
>   static inline void __mm_cid_put(struct mm_struct *mm, int cid)
>   {
> @@ -3858,8 +3853,6 @@ static inline void switch_mm_cid(struct rq *rq,
>   static inline void switch_mm_cid(struct rq *rq, struct task_struct *prev, struct task_struct *next) { }
>   static inline void sched_mm_cid_migrate_from(struct task_struct *t) { }
>   static inline void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t) { }
> -static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
> -static inline void init_sched_mm_cid(struct task_struct *t) { }
>   #endif /* !CONFIG_SCHED_MM_CID */
>   
>   extern u64 avg_vruntime(struct cfs_rq *cfs_rq);

-- 
Mathieu Desnoyers
EfficiOS Inc.
https://www.efficios.com