linux-kernel - Re: [PATCH v2 19/23] sched/cache: Avoid cache-aware scheduling for memory-heavy processes

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <91a7c325-5093-4417-aa98-34df694b0c39@gmail.com>
Date: Thu, 18 Dec 2025 11:59:57 +0800
From: Vern Hao <haoxing990@...il.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>,
 Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
 K Prateek Nayak <kprateek.nayak@....com>,
 "Gautham R . Shenoy" <gautham.shenoy@....com>,
 Vincent Guittot <vincent.guittot@...aro.org>
Cc: Chen Yu <yu.c.chen@...el.com>, Juri Lelli <juri.lelli@...hat.com>,
 Dietmar Eggemann <dietmar.eggemann@....com>,
 Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
 Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
 Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
 Hillf Danton <hdanton@...a.com>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
 Jianyong Wu <jianyong.wu@...look.com>, Yangyu Chen <cyy@...self.name>,
 Tingyin Duan <tingyin.duan@...il.com>, Vern Hao <vernhao@...cent.com>,
 Len Brown <len.brown@...el.com>, Aubrey Li <aubrey.li@...el.com>,
 Zhao Liu <zhao1.liu@...el.com>, Chen Yu <yu.chen.surf@...il.com>,
 Adam Li <adamli@...amperecomputing.com>, Aaron Lu <ziqianlu@...edance.com>,
 Tim Chen <tim.c.chen@...el.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 19/23] sched/cache: Avoid cache-aware scheduling for
 memory-heavy processes


On 2025/12/4 07:07, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@...el.com>
>
> Prateek and Tingyin reported that memory-intensive workloads (such as
> stream) can saturate memory bandwidth and caches on the preferred LLC
> when sched_cache aggregates too many threads.
>
> To mitigate this, estimate a process's memory footprint by comparing
> its RSS (anonymous and shared pages) to the size of the LLC. If RSS
> exceeds the LLC size, skip cache-aware scheduling.
Restricting by RSS prevents many applications from benefiting from this 
optimization. I believe this restriction should be lifted. For 
memory-intensive workloads, the optimization may simply yield no gains, 
but it certainly shouldn't make performance worse. We need to further 
refine this logic.
> Note that RSS is only an approximation of the memory footprint.
> By default, the comparison is strict, but a later patch will allow
> users to provide a hint to adjust this threshold.
>
> According to the test from Adam, some systems do not have shared L3
> but with shared L2 as clusters. In this case, the L2 becomes the LLC[1].
>
> Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@os.amperecomputing.com/
>
> Co-developed-by: Tim Chen <tim.c.chen@...ux.intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@...el.com>
> Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
> ---
>
> Notes:
>      v1->v2: Assigned curr_cpu in task_cache_work() before checking
>              exceed_llc_capacity(mm, curr_cpu) to avoid out-of-bound
>              access.(lkp/0day)
>
>   include/linux/cacheinfo.h | 21 ++++++++++-------
>   kernel/sched/fair.c       | 49 +++++++++++++++++++++++++++++++++++----
>   2 files changed, 57 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
> index c8f4f0a0b874..82d0d59ca0e1 100644
> --- a/include/linux/cacheinfo.h
> +++ b/include/linux/cacheinfo.h
> @@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu,
>   
>   const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf);
>   
> -/*
> - * Get the cacheinfo structure for the cache associated with @cpu at
> - * level @level.
> - * cpuhp lock must be held.
> - */
> -static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
> +static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int level)
>   {
>   	struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
>   	int i;
>   
> -	lockdep_assert_cpus_held();
> -
>   	for (i = 0; i < ci->num_leaves; i++) {
>   		if (ci->info_list[i].level == level) {
>   			if (ci->info_list[i].attributes & CACHE_ID)
> @@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
>   	return NULL;
>   }
>   
> +/*
> + * Get the cacheinfo structure for the cache associated with @cpu at
> + * level @level.
> + * cpuhp lock must be held.
> + */
> +static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
> +{
> +	lockdep_assert_cpus_held();
> +
> +	return _get_cpu_cacheinfo_level(cpu, level);
> +}
> +
>   /*
>    * Get the id of the cache associated with @cpu at level @level.
>    * cpuhp lock must be held.
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6afa3f9a4e9b..424ec601cfdf 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1223,6 +1223,38 @@ static int llc_id(int cpu)
>   	return llc;
>   }
>   
> +static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
> +{
> +	struct cacheinfo *ci;
> +	unsigned long rss;
> +	unsigned int llc;
> +
> +	/*
> +	 * get_cpu_cacheinfo_level() can not be used
> +	 * because it requires the cpu_hotplug_lock
> +	 * to be held. Use _get_cpu_cacheinfo_level()
> +	 * directly because the 'cpu' can not be
> +	 * offlined at the moment.
> +	 */
> +	ci = _get_cpu_cacheinfo_level(cpu, 3);
> +	if (!ci) {
> +		/*
> +		 * On system without L3 but with shared L2,
> +		 * L2 becomes the LLC.
> +		 */
> +		ci = _get_cpu_cacheinfo_level(cpu, 2);
> +		if (!ci)
> +			return true;
> +	}
Do we have to call this level by level every time just to get the LLC size? 
Could the size be stored in a static variable when building the sched domains instead?
> +
> +	llc = ci->size;
> +
> +	rss = get_mm_counter(mm, MM_ANONPAGES) +
> +		get_mm_counter(mm, MM_SHMEMPAGES);
> +
> +	return (llc <= (rss * PAGE_SIZE));
> +}
> +
>   static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
>   {
>   	int smt_nr = 1;
> @@ -1382,7 +1414,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
>   	 */
>   	if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
>   	    get_nr_threads(p) <= 1 ||
> -	    exceed_llc_nr(mm, cpu_of(rq))) {
> +	    exceed_llc_nr(mm, cpu_of(rq)) ||
> +	    exceed_llc_capacity(mm, cpu_of(rq))) {
>   		if (mm->mm_sched_cpu != -1)
>   			mm->mm_sched_cpu = -1;
>   	}
> @@ -1439,7 +1472,7 @@ static void __no_profile task_cache_work(struct callback_head *work)
>   	struct mm_struct *mm = p->mm;
>   	unsigned long m_a_occ = 0;
>   	unsigned long curr_m_a_occ = 0;
> -	int cpu, m_a_cpu = -1, nr_running = 0;
> +	int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
>   	cpumask_var_t cpus;
>   
>   	WARN_ON_ONCE(work != &p->cache_work);
> @@ -1449,7 +1482,9 @@ static void __no_profile task_cache_work(struct callback_head *work)
>   	if (p->flags & PF_EXITING)
>   		return;
>   
> -	if (get_nr_threads(p) <= 1) {
> +	curr_cpu = task_cpu(p);
> +	if (get_nr_threads(p) <= 1 ||
> +	    exceed_llc_capacity(mm, curr_cpu)) {
>   		if (mm->mm_sched_cpu != -1)
>   			mm->mm_sched_cpu = -1;
>   
> @@ -9895,8 +9930,12 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
>   	if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
>   		return mig_unrestricted;
>   
> -	/* skip cache aware load balance for single/too many threads */
> -	if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu))
> +	/*
> +	 * Skip cache aware load balance for single/too many threads
> +	 * or large footprint.
> +	 */
> +	if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) ||
> +	    exceed_llc_capacity(mm, dst_cpu))
>   		return mig_unrestricted;
>   
>   	if (cpus_share_cache(dst_cpu, cpu))