[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <91a7c325-5093-4417-aa98-34df694b0c39@gmail.com>
Date: Thu, 18 Dec 2025 11:59:57 +0800
From: Vern Hao <haoxing990@...il.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>,
Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>,
Vincent Guittot <vincent.guittot@...aro.org>
Cc: Chen Yu <yu.c.chen@...el.com>, Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
Jianyong Wu <jianyong.wu@...look.com>, Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>, Vern Hao <vernhao@...cent.com>,
Len Brown <len.brown@...el.com>, Aubrey Li <aubrey.li@...el.com>,
Zhao Liu <zhao1.liu@...el.com>, Chen Yu <yu.chen.surf@...il.com>,
Adam Li <adamli@...amperecomputing.com>, Aaron Lu <ziqianlu@...edance.com>,
Tim Chen <tim.c.chen@...el.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v2 19/23] sched/cache: Avoid cache-aware scheduling for
memory-heavy processes
On 2025/12/4 07:07, Tim Chen wrote:
> From: Chen Yu <yu.c.chen@...el.com>
>
> Prateek and Tingyin reported that memory-intensive workloads (such as
> stream) can saturate memory bandwidth and caches on the preferred LLC
> when sched_cache aggregates too many threads.
>
> To mitigate this, estimate a process's memory footprint by comparing
> its RSS (anonymous and shared pages) to the size of the LLC. If RSS
> exceeds the LLC size, skip cache-aware scheduling.
Gating cache-aware scheduling on RSS prevents many applications from
benefiting from this optimization. I believe this restriction should be lifted. For
memory-intensive workloads, the optimization may simply yield no gains,
but it certainly shouldn't make performance worse. We need to further
refine this logic.
> Note that RSS is only an approximation of the memory footprint.
> By default, the comparison is strict, but a later patch will allow
> users to provide a hint to adjust this threshold.
>
> According to the test from Adam, some systems do not have shared L3
> but with shared L2 as clusters. In this case, the L2 becomes the LLC[1].
>
> Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@os.amperecomputing.com/
>
> Co-developed-by: Tim Chen <tim.c.chen@...ux.intel.com>
> Signed-off-by: Chen Yu <yu.c.chen@...el.com>
> Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
> ---
>
> Notes:
> v1->v2: Assigned curr_cpu in task_cache_work() before checking
> exceed_llc_capacity(mm, curr_cpu) to avoid out-of-bound
> access.(lkp/0day)
>
> include/linux/cacheinfo.h | 21 ++++++++++-------
> kernel/sched/fair.c | 49 +++++++++++++++++++++++++++++++++++----
> 2 files changed, 57 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h
> index c8f4f0a0b874..82d0d59ca0e1 100644
> --- a/include/linux/cacheinfo.h
> +++ b/include/linux/cacheinfo.h
> @@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu,
>
> const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf);
>
> -/*
> - * Get the cacheinfo structure for the cache associated with @cpu at
> - * level @level.
> - * cpuhp lock must be held.
> - */
> -static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
> +static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int level)
> {
> struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
> int i;
>
> - lockdep_assert_cpus_held();
> -
> for (i = 0; i < ci->num_leaves; i++) {
> if (ci->info_list[i].level == level) {
> if (ci->info_list[i].attributes & CACHE_ID)
> @@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
> return NULL;
> }
>
> +/*
> + * Get the cacheinfo structure for the cache associated with @cpu at
> + * level @level.
> + * cpuhp lock must be held.
> + */
> +static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level)
> +{
> + lockdep_assert_cpus_held();
> +
> + return _get_cpu_cacheinfo_level(cpu, level);
> +}
> +
> /*
> * Get the id of the cache associated with @cpu at level @level.
> * cpuhp lock must be held.
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6afa3f9a4e9b..424ec601cfdf 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1223,6 +1223,38 @@ static int llc_id(int cpu)
> return llc;
> }
>
> +static bool exceed_llc_capacity(struct mm_struct *mm, int cpu)
> +{
> + struct cacheinfo *ci;
> + unsigned long rss;
> + unsigned int llc;
> +
> + /*
> + * get_cpu_cacheinfo_level() can not be used
> + * because it requires the cpu_hotplug_lock
> + * to be held. Use _get_cpu_cacheinfo_level()
> + * directly because the 'cpu' can not be
> + * offlined at the moment.
> + */
> + ci = _get_cpu_cacheinfo_level(cpu, 3);
> + if (!ci) {
> + /*
> + * On system without L3 but with shared L2,
> + * L2 becomes the LLC.
> + */
> + ci = _get_cpu_cacheinfo_level(cpu, 2);
> + if (!ci)
> + return true;
> + }
Do these lookups have to be called one by one every time just to get the LLC
size? Could it be stored in a static variable when building the sched domains instead?
> +
> + llc = ci->size;
> +
> + rss = get_mm_counter(mm, MM_ANONPAGES) +
> + get_mm_counter(mm, MM_SHMEMPAGES);
> +
> + return (llc <= (rss * PAGE_SIZE));
> +}
> +
> static bool exceed_llc_nr(struct mm_struct *mm, int cpu)
> {
> int smt_nr = 1;
> @@ -1382,7 +1414,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> */
> if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
> get_nr_threads(p) <= 1 ||
> - exceed_llc_nr(mm, cpu_of(rq))) {
> + exceed_llc_nr(mm, cpu_of(rq)) ||
> + exceed_llc_capacity(mm, cpu_of(rq))) {
> if (mm->mm_sched_cpu != -1)
> mm->mm_sched_cpu = -1;
> }
> @@ -1439,7 +1472,7 @@ static void __no_profile task_cache_work(struct callback_head *work)
> struct mm_struct *mm = p->mm;
> unsigned long m_a_occ = 0;
> unsigned long curr_m_a_occ = 0;
> - int cpu, m_a_cpu = -1, nr_running = 0;
> + int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu;
> cpumask_var_t cpus;
>
> WARN_ON_ONCE(work != &p->cache_work);
> @@ -1449,7 +1482,9 @@ static void __no_profile task_cache_work(struct callback_head *work)
> if (p->flags & PF_EXITING)
> return;
>
> - if (get_nr_threads(p) <= 1) {
> + curr_cpu = task_cpu(p);
> + if (get_nr_threads(p) <= 1 ||
> + exceed_llc_capacity(mm, curr_cpu)) {
> if (mm->mm_sched_cpu != -1)
> mm->mm_sched_cpu = -1;
>
> @@ -9895,8 +9930,12 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu,
> if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu))
> return mig_unrestricted;
>
> - /* skip cache aware load balance for single/too many threads */
> - if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu))
> + /*
> + * Skip cache aware load balance for single/too many threads
> + * or large footprint.
> + */
> + if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) ||
> + exceed_llc_capacity(mm, dst_cpu))
> return mig_unrestricted;
>
> if (cpus_share_cache(dst_cpu, cpu))
Powered by blists - more mailing lists