Message-ID: <60a01353-c1f4-41ea-99b7-a300aee35bb1@oracle.com>
Date: Mon, 7 Jul 2025 18:15:46 -0700
From: Libo Chen <libo.chen@...cle.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>,
        Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
        K Prateek Nayak <kprateek.nayak@....com>,
        "Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Chen Yu <yu.c.chen@...el.com>, Juri Lelli <juri.lelli@...hat.com>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
        Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
        Tim Chen <tim.c.chen@...el.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Abel Wu <wuyun.abel@...edance.com>,
        Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
        Hillf Danton <hdanton@...a.com>, Len Brown <len.brown@...el.com>,
        linux-kernel@...r.kernel.org
Subject: Re: [RFC patch v3 02/20] sched: Several fixes for cache aware
 scheduling

Hi Chenyu,

On 6/18/25 11:27, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@...el.com>
> 
> 1. Fix compile error on percpu allocation.
> 2. Enqueue to the target CPU rather than the current CPU.
> 3. Add a NULL check for the LLC sched domain (Libo Chen).

Can I suggest we completely disable cache-aware scheduling in the
next version for systems without any LLC, so that they carry none of
the added fields or function code? Whether an LLC exists can be
determined easily at bootup while the topology is being built, and it
cannot change at runtime. It is often not practical for distros to
disable the feature in kconfig for just one particular CPU model, and
SCHED_CACHE_LB is not enough: it does not remove the added fields,
and users can simply turn it back on anyway. A rough sketch of the
boot-time detection follows.
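Something like this untested sketch (not from the patch set;
"sched_cache_present" is a made-up static key name, and sd_llc is the
existing per-cpu pointer to the highest LLC sched domain):

DEFINE_STATIC_KEY_FALSE(sched_cache_present);

static void __init sched_cache_detect_llc(void)
{
	int cpu;

	/* The topology is settled by this point and cannot change later. */
	for_each_possible_cpu(cpu) {
		if (rcu_access_pointer(per_cpu(sd_llc, cpu))) {
			static_branch_enable(&sched_cache_present);
			return;
		}
	}
	/*
	 * No LLC anywhere: the key stays false, so every cache-aware
	 * path stays patched out and the per-mm state is never needed.
	 */
}

Hot paths such as select_cache_cpu() would then bail out before
touching any of the new state:

	if (!static_branch_likely(&sched_cache_present))
		return prev_cpu;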

Thanks,
Libo


> 4. Introduce sched feature SCHED_CACHE to control cache aware scheduling
> 5. Fix the initialization of the unsigned occupancy field: use 0
>    instead of -1.
> 6. If there is only 1 thread in the process, no need to enable cache
>    awareness
> 7. Add __maybe_unused to __migrate_degrades_locality() to
>    avoid compile warnings.
> 
> Signed-off-by: Chen Yu <yu.c.chen@...el.com>
> ---
>  include/linux/mm_types.h |  4 ++--
>  kernel/sched/fair.c      | 27 ++++++++++++++++-----------
>  kernel/sched/features.h  |  1 +
>  3 files changed, 19 insertions(+), 13 deletions(-)
> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 013291c6aaa2..9de4a0a13c4d 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1411,11 +1411,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas
>  #endif /* CONFIG_SCHED_MM_CID */
>  
>  #ifdef CONFIG_SCHED_CACHE
> -extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched);
> +extern void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched);
>  
>  static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
>  {
> -	struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
> +	struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
>  	if (!pcpu_sched)
>  		return -ENOMEM;
>  
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index df7d4a324fbe..89db97f8ef02 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr)
>  #define EPOCH_PERIOD	(HZ/100)	/* 10 ms */
>  #define EPOCH_OLD	5		/* 50 ms */
>  
> -void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
> +void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
>  {
>  	unsigned long epoch;
>  	int i;
> @@ -1186,7 +1186,7 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched)
>  
>  		pcpu_sched->runtime = 0;
>  		pcpu_sched->epoch = epoch = rq->cpu_epoch;
> -		pcpu_sched->occ = -1;
> +		pcpu_sched->occ = 0;
>  	}
>  
>  	raw_spin_lock_init(&mm->mm_sched_lock);
> @@ -1254,7 +1254,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  	if (!mm || !mm->pcpu_sched)
>  		return;
>  
> -	pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched);
> +	pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq));
>  
>  	scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
>  		__update_mm_sched(rq, pcpu_sched);
> @@ -1264,12 +1264,14 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>  	}
>  
>  	/*
> -	 * If this task hasn't hit task_cache_work() for a while, invalidate
> +	 * If this task hasn't hit task_cache_work() for a while, or it
> +	 * has only 1 thread, invalidate
>  	 * it's preferred state.
>  	 */
> -	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) {
> +	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD ||
> +	    get_nr_threads(p) <= 1) {
>  		mm->mm_sched_cpu = -1;
> -		pcpu_sched->occ = -1;
> +		pcpu_sched->occ = 0;
>  	}
>  }
>  
> @@ -1286,9 +1288,6 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p)
>  
>  	guard(raw_spinlock)(&mm->mm_sched_lock);
>  
> -	if (mm->mm_sched_epoch == rq->cpu_epoch)
> -		return;
> -
>  	if (work->next == work) {
>  		task_work_add(p, work, TWA_RESUME);
>  		WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> @@ -1322,6 +1321,9 @@ static void task_cache_work(struct callback_head *work)
>  			unsigned long occ, m_occ = 0, a_occ = 0;
>  			int m_cpu = -1, nr = 0, i;
>  
> +			if (!sd)
> +				continue;
> +
>  			for_each_cpu(i, sched_domain_span(sd)) {
>  				occ = fraction_mm_sched(cpu_rq(i),
>  							per_cpu_ptr(mm->pcpu_sched, i));
> @@ -8801,6 +8803,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu)
>  	struct mm_struct *mm = p->mm;
>  	int cpu;
>  
> +	if (!sched_feat(SCHED_CACHE))
> +		return prev_cpu;
> +
>  	if (!mm || p->nr_cpus_allowed == 1)
>  		return prev_cpu;
>  
> @@ -9555,7 +9560,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
>  		return 0;
>  
>  #ifdef CONFIG_SCHED_CACHE
> -	if (p->mm && p->mm->pcpu_sched) {
> +	if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) {
>  		/*
>  		 * XXX things like Skylake have non-inclusive L3 and might not
>  		 * like this L3 centric view. What to do about L2 stickyness ?
> @@ -9633,7 +9638,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
>  }
>  
>  #else
> -static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
> +static __maybe_unused long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle)
>  {
>  	return 0;
>  }
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 3c12d9f93331..d2af7bfd36bf 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>   */
>  SCHED_FEAT(SIS_UTIL, true)
>  
> +SCHED_FEAT(SCHED_CACHE, true)
>  /*
>   * Issue a WARN when we do multiple update_rq_clock() calls
>   * in a single rq->lock section. Default disabled because the
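To illustrate the point above that a sched feature is only a runtime
toggle: with CONFIG_SCHED_DEBUG and debugfs mounted, root can flip
SCHED_CACHE back on at any time. A minimal userspace sketch (the
debugfs path is the usual one; adjust if your kernel differs):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/*
	 * Writing "SCHED_CACHE" re-enables the feature; "NO_SCHED_CACHE"
	 * disables it. Either way, the added per-mm fields stay allocated.
	 */
	const char *cmd = "SCHED_CACHE";
	int fd = open("/sys/kernel/debug/sched/features", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, cmd, strlen(cmd)) != (ssize_t)strlen(cmd))
		perror("write");
	close(fd);
	return 0;
}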

