Message-ID: <d52fb3ba-64ee-48fa-bd53-73217f0fa435@gmail.com>
Date: Thu, 11 Dec 2025 17:03:00 +0800
From: Vern Hao <haoxing990@...il.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>,
Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
K Prateek Nayak <kprateek.nayak@....com>,
"Gautham R . Shenoy" <gautham.shenoy@....com>,
Vincent Guittot <vincent.guittot@...aro.org>
Cc: Juri Lelli <juri.lelli@...hat.com>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Hillf Danton <hdanton@...a.com>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
Jianyong Wu <jianyong.wu@...look.com>, Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>, Vern Hao <vernhao@...cent.com>,
Len Brown <len.brown@...el.com>, Aubrey Li <aubrey.li@...el.com>,
Zhao Liu <zhao1.liu@...el.com>, Chen Yu <yu.chen.surf@...il.com>,
Chen Yu <yu.c.chen@...el.com>, Adam Li <adamli@...amperecomputing.com>,
Aaron Lu <ziqianlu@...edance.com>, Tim Chen <tim.c.chen@...el.com>,
linux-kernel@...r.kernel.org, Vern Hao <haoxing990@...il.com>
Subject: Re: [PATCH v2 01/23] sched/cache: Introduce infrastructure for
cache-aware load balancing
Hi Peter, Chen Yu and Tim,
On 2025/12/4 07:07, Tim Chen wrote:
> From: "Peter Zijlstra (Intel)" <peterz@...radead.org>
>
> Adds infrastructure to enable cache-aware load balancing,
> which improves cache locality by grouping tasks that share resources
> within the same cache domain. This reduces cache misses and improves
> overall data access efficiency.
>
> In this initial implementation, threads belonging to the same process
> are treated as entities that likely share working sets. The mechanism
> tracks per-process CPU occupancy across cache domains and attempts to
> migrate threads toward cache-hot domains where their process already
> has active threads, thereby enhancing locality.
>
> This provides a basic model for cache affinity. While the current code
> targets the last-level cache (LLC), the approach could be extended to
> other domain types such as clusters (L2) or node-internal groupings.
>
> At present, the mechanism selects the CPU within an LLC that has the
> highest recent runtime. Subsequent patches in this series will use this
> information in the load-balancing path to guide task placement toward
> preferred LLCs.
>
> In the future, more advanced policies could be integrated through NUMA
> balancing - for example, migrating a task to its preferred LLC when spare
> capacity exists, or swapping tasks across LLCs to improve cache affinity.
> Grouping of tasks could also be generalized from that of a process
> to be that of a NUMA group, or be user configurable.
>
> Originally-by: Peter Zijlstra (Intel) <peterz@...radead.org>
> Signed-off-by: Chen Yu <yu.c.chen@...el.com>
> Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
> ---
>
> Notes:
> v1->v2:
> Restore the original CPU scan to cover all online CPUs,
> rather than scanning within the preferred NUMA node.
> (Peter Zijlstra)
>
> Use rq->curr instead of rq->donor. (K Prateek Nayak)
>
> Minor fix in task_tick_cache() to use
> if (mm->mm_sched_epoch >= rq->cpu_epoch)
> to avoid mm_sched_epoch going backwards.
>
> include/linux/mm_types.h | 44 +++++++
> include/linux/sched.h | 11 ++
> init/Kconfig | 11 ++
> kernel/fork.c | 6 +
> kernel/sched/core.c | 6 +
> kernel/sched/fair.c | 258 +++++++++++++++++++++++++++++++++++++++
> kernel/sched/sched.h | 8 ++
> 7 files changed, 344 insertions(+)
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 90e5790c318f..1ea16ef90566 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -939,6 +939,11 @@ typedef struct {
> DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS);
> } __private mm_flags_t;
>
> +struct mm_sched {
> + u64 runtime;
> + unsigned long epoch;
> +};
> +
> struct kioctx_table;
> struct iommu_mm_data;
> struct mm_struct {
> @@ -1029,6 +1034,17 @@ struct mm_struct {
> */
> raw_spinlock_t cpus_allowed_lock;
> #endif
> +#ifdef CONFIG_SCHED_CACHE
> + /*
> + * Track per-cpu-per-process occupancy as a proxy for cache residency.
> + * See account_mm_sched() and ...
> + */
> + struct mm_sched __percpu *pcpu_sched;
> + raw_spinlock_t mm_sched_lock;
> + unsigned long mm_sched_epoch;
> + int mm_sched_cpu;
As we discussed earlier, I still think that having a single
'mm_sched_cpu' represent the aggregated hotspot of all threads is not
appropriate: in our real application the threads of a process do not
necessarily have any correlation with each other.
So I was wondering whether we could move this field into struct
task_struct instead. That would let us track the hotspot CPU of each
thread individually, although some details would still need to be
worked out.
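To make this concrete, here is a rough sketch of the direction I have in
mind (only an illustration; the placement and the reuse of the
'mm_sched_cpu' name inside task_struct are placeholders, not a tested
change):

	#ifdef CONFIG_SCHED_CACHE
		struct callback_head	cache_work;
		/* preferred CPU derived from this thread's own occupancy */
		int			mm_sched_cpu;
	#endif

with task_cache_work() then choosing the hottest LLC from the occupancy
this thread itself contributed, rather than from the aggregate of the
whole mm. Where the per-CPU runtime accounting would live is one of the
details that would still need to be worked out, so this is only meant to
show the direction.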
> +#endif
> +
> #ifdef CONFIG_MMU
> atomic_long_t pgtables_bytes; /* size of all page tables */
> #endif
> @@ -1487,6 +1503,34 @@ static inline unsigned int mm_cid_size(void)
> static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { }
> #endif /* CONFIG_SCHED_MM_CID */
>
> +#ifdef CONFIG_SCHED_CACHE
> +void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched);
> +
> +static inline int mm_alloc_sched_noprof(struct mm_struct *mm)
> +{
> + struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched);
> +
> + if (!pcpu_sched)
> + return -ENOMEM;
> +
> + mm_init_sched(mm, pcpu_sched);
> + return 0;
> +}
> +
> +#define mm_alloc_sched(...) alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__))
> +
> +static inline void mm_destroy_sched(struct mm_struct *mm)
> +{
> + free_percpu(mm->pcpu_sched);
> + mm->pcpu_sched = NULL;
> +}
> +#else /* !CONFIG_SCHED_CACHE */
> +
> +static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; }
> +static inline void mm_destroy_sched(struct mm_struct *mm) { }
> +
> +#endif /* CONFIG_SCHED_CACHE */
> +
> struct mmu_gather;
> extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
> extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index b469878de25c..278b529c91df 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1406,6 +1406,10 @@ struct task_struct {
> unsigned long numa_pages_migrated;
> #endif /* CONFIG_NUMA_BALANCING */
>
> +#ifdef CONFIG_SCHED_CACHE
> + struct callback_head cache_work;
> +#endif
> +
> #ifdef CONFIG_RSEQ
> struct rseq __user *rseq;
> u32 rseq_len;
> @@ -2428,4 +2432,11 @@ extern void migrate_enable(void);
>
> DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())
>
> +#ifdef CONFIG_SCHED_CACHE
> +static inline bool sched_cache_enabled(void)
> +{
> + return false;
> +}
> +#endif
> +
> #endif
> diff --git a/init/Kconfig b/init/Kconfig
> index cab3ad28ca49..88556ef8cfd1 100644
> --- a/init/Kconfig
> +++ b/init/Kconfig
> @@ -983,6 +983,17 @@ config NUMA_BALANCING
>
> This system will be inactive on UMA systems.
>
> +config SCHED_CACHE
> + bool "Cache aware load balance"
> + default y
> + depends on SMP
> + help
> + When enabled, the scheduler will attempt to aggregate tasks from
> + the same process onto a single Last Level Cache (LLC) domain when
> + possible. This improves cache locality by keeping tasks that share
> + resources within the same cache domain, reducing cache misses and
> + lowering data access latency.
> +
> config NUMA_BALANCING_DEFAULT_ENABLED
> bool "Automatically enable NUMA aware memory/task placement"
> default y
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 3da0f08615a9..aae5053d1e30 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -680,6 +680,7 @@ void __mmdrop(struct mm_struct *mm)
> cleanup_lazy_tlbs(mm);
>
> WARN_ON_ONCE(mm == current->active_mm);
> + mm_destroy_sched(mm);
> mm_free_pgd(mm);
> mm_free_id(mm);
> destroy_context(mm);
> @@ -1083,6 +1084,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> if (mm_alloc_cid(mm, p))
> goto fail_cid;
>
> + if (mm_alloc_sched(mm))
> + goto fail_sched;
> +
> if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
> NR_MM_COUNTERS))
> goto fail_pcpu;
> @@ -1092,6 +1096,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
> return mm;
>
> fail_pcpu:
> + mm_destroy_sched(mm);
> +fail_sched:
> mm_destroy_cid(mm);
> fail_cid:
> destroy_context(mm);
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index f754a60de848..e8bdf03a4b7f 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4488,6 +4488,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p)
> p->wake_entry.u_flags = CSD_TYPE_TTWU;
> p->migration_pending = NULL;
> init_sched_mm_cid(p);
> + init_sched_mm(p);
> }
>
> DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
> @@ -8791,6 +8792,11 @@ void __init sched_init(void)
>
> rq->core_cookie = 0UL;
> #endif
> +#ifdef CONFIG_SCHED_CACHE
> + raw_spin_lock_init(&rq->cpu_epoch_lock);
> + rq->cpu_epoch_next = jiffies;
> +#endif
> +
> zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i));
> }
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 5b752324270b..cb82f558dc5b 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -1152,6 +1152,8 @@ void post_init_entity_util_avg(struct task_struct *p)
> sa->runnable_avg = sa->util_avg;
> }
>
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec);
> +
> static s64 update_se(struct rq *rq, struct sched_entity *se)
> {
> u64 now = rq_clock_task(rq);
> @@ -1174,6 +1176,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
>
> trace_sched_stat_runtime(running, delta_exec);
> account_group_exec_runtime(running, delta_exec);
> + account_mm_sched(rq, running, delta_exec);
>
> /* cgroup time is always accounted against the donor */
> cgroup_account_cputime(donor, delta_exec);
> @@ -1193,6 +1196,259 @@ static s64 update_se(struct rq *rq, struct sched_entity *se)
> return delta_exec;
> }
>
> +#ifdef CONFIG_SCHED_CACHE
> +
> +/*
> + * XXX numbers come from a place the sun don't shine -- probably wants to be SD
> + * tunable or so.
> + */
> +#define EPOCH_PERIOD (HZ / 100) /* 10 ms */
> +#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */
> +
> +static int llc_id(int cpu)
> +{
> + if (cpu < 0)
> + return -1;
> +
> + return per_cpu(sd_llc_id, cpu);
> +}
> +
> +void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
> +{
> + unsigned long epoch;
> + int i;
> +
> + for_each_possible_cpu(i) {
> + struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i);
> + struct rq *rq = cpu_rq(i);
> +
> + pcpu_sched->runtime = 0;
> + pcpu_sched->epoch = rq->cpu_epoch;
> + epoch = rq->cpu_epoch;
> + }
> +
> + raw_spin_lock_init(&mm->mm_sched_lock);
> + mm->mm_sched_epoch = epoch;
> + mm->mm_sched_cpu = -1;
> +
> + /*
> + * The update to mm->pcpu_sched should not be reordered
> + * before initialization to mm's other fields, in case
> + * the readers may get invalid mm_sched_epoch, etc.
> + */
> + smp_store_release(&mm->pcpu_sched, _pcpu_sched);
> +}
> +
> +/* because why would C be fully specified */
> +static __always_inline void __shr_u64(u64 *val, unsigned int n)
> +{
> + if (n >= 64) {
> + *val = 0;
> + return;
> + }
> + *val >>= n;
> +}
> +
> +static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +{
> + lockdep_assert_held(&rq->cpu_epoch_lock);
> +
> + unsigned long n, now = jiffies;
> + long delta = now - rq->cpu_epoch_next;
> +
> + if (delta > 0) {
> + n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD;
> + rq->cpu_epoch += n;
> + rq->cpu_epoch_next += n * EPOCH_PERIOD;
> + __shr_u64(&rq->cpu_runtime, n);
> + }
> +
> + n = rq->cpu_epoch - pcpu_sched->epoch;
> + if (n) {
> + pcpu_sched->epoch += n;
> + __shr_u64(&pcpu_sched->runtime, n);
> + }
> +}
> +
> +static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched)
> +{
> + guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock);
> +
> + __update_mm_sched(rq, pcpu_sched);
> +
> + /*
> + * Runtime is a geometric series (r=0.5) and as such will sum to twice
> + * the accumulation period, this means the multiplication here should
> + * not overflow.
> + */
> + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1);
> +}
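A quick sanity check on the overflow comment above, if my reading is
right: with EPOCH_PERIOD = 10ms, the decayed runtime is bounded by
roughly 2 * 10ms = 2 * 10^7 ns (about 2^25), and NICE_0_LOAD is 2^20 on
64-bit kernels, so the product stays around 2^45, well within u64. So
the reasoning looks fine to me.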
> +
> +static inline
> +void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec)
> +{
> + struct mm_struct *mm = p->mm;
> + struct mm_sched *pcpu_sched;
> + unsigned long epoch;
> +
> + if (!sched_cache_enabled())
> + return;
> +
> + if (p->sched_class != &fair_sched_class)
> + return;
> + /*
> + * init_task and kthreads don't have an mm
> + */
> + if (!mm || !mm->pcpu_sched)
> + return;
> +
> + pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq));
> +
> + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) {
> + __update_mm_sched(rq, pcpu_sched);
> + pcpu_sched->runtime += delta_exec;
> + rq->cpu_runtime += delta_exec;
> + epoch = rq->cpu_epoch;
> + }
> +
> + /*
> + * If this task hasn't hit task_cache_work() for a while, or it
> + * has only 1 thread, invalidate its preferred state.
> + */
> + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT ||
> + get_nr_threads(p) <= 1) {
> + if (mm->mm_sched_cpu != -1)
> + mm->mm_sched_cpu = -1;
> + }
> +}
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p)
> +{
> + struct callback_head *work = &p->cache_work;
> + struct mm_struct *mm = p->mm;
> +
> + if (!sched_cache_enabled())
> + return;
> +
> + if (!mm || !mm->pcpu_sched)
> + return;
> +
> + /* avoid moving backwards */
> + if (mm->mm_sched_epoch >= rq->cpu_epoch)
> + return;
> +
> + guard(raw_spinlock)(&mm->mm_sched_lock);
> +
> + if (work->next == work) {
> + task_work_add(p, work, TWA_RESUME);
> + WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch);
> + }
> +}
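(For anyone else reading the series: if I understand correctly, the
work->next == work test here is the usual "not currently queued"
convention, the same pattern task_tick_numa()/task_numa_work() use, with
init_sched_mm() and task_cache_work() resetting the pointer back to the
work itself.)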
> +
> +static void __no_profile task_cache_work(struct callback_head *work)
> +{
> + struct task_struct *p = current;
> + struct mm_struct *mm = p->mm;
> + unsigned long m_a_occ = 0;
> + unsigned long curr_m_a_occ = 0;
> + int cpu, m_a_cpu = -1;
> + cpumask_var_t cpus;
> +
> + WARN_ON_ONCE(work != &p->cache_work);
> +
> + work->next = work;
> +
> + if (p->flags & PF_EXITING)
> + return;
> +
> + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
> + return;
> +
> + scoped_guard (cpus_read_lock) {
> + cpumask_copy(cpus, cpu_online_mask);
> +
> + for_each_cpu(cpu, cpus) {
> + /* XXX sched_cluster_active */
> + struct sched_domain *sd = per_cpu(sd_llc, cpu);
> + unsigned long occ, m_occ = 0, a_occ = 0;
> + int m_cpu = -1, i;
> +
> + if (!sd)
> + continue;
> +
> + for_each_cpu(i, sched_domain_span(sd)) {
> + occ = fraction_mm_sched(cpu_rq(i),
> + per_cpu_ptr(mm->pcpu_sched, i));
> + a_occ += occ;
> + if (occ > m_occ) {
> + m_occ = occ;
> + m_cpu = i;
> + }
> + }
> +
> + /*
> + * Compare the accumulated occupancy of each LLC. The
> + * reason for using accumulated occupancy rather than average
> + * per CPU occupancy is that it works better in asymmetric LLC
> + * scenarios.
> + * For example, if there are 2 threads in a 4CPU LLC and 3
> + * threads in an 8CPU LLC, it might be better to choose the one
> + * with 3 threads. However, this would not be the case if the
> + * occupancy is divided by the number of CPUs in an LLC (i.e.,
> + * if average per CPU occupancy is used).
> + * Besides, NUMA balancing fault statistics behave similarly:
> + * the total number of faults per node is compared rather than
> + * the average number of faults per CPU. This strategy is also
> + * followed here.
> + */
> + if (a_occ > m_a_occ) {
> + m_a_occ = a_occ;
> + m_a_cpu = m_cpu;
> + }
> +
> + if (llc_id(cpu) == llc_id(mm->mm_sched_cpu))
> + curr_m_a_occ = a_occ;
> +
> + cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> + }
> + }
> +
> + if (m_a_occ > (2 * curr_m_a_occ)) {
> + /*
> + * Avoid switching mm_sched_cpu too fast.
> + * The reason to choose 2X is because:
> + * 1. It is better to keep the preferred LLC stable,
> + * rather than changing it frequently and causing migrations
> + * 2. 2X means the new preferred LLC has at least 1 more
> + * busy CPU than the old one (e.g., 200% vs 100%)
> + * 3. 2X is chosen based on test results, as it delivers
> + * the optimal performance gain so far.
> + */
> + mm->mm_sched_cpu = m_a_cpu;
> + }
> +
> + free_cpumask_var(cpus);
> +}
> +
> +void init_sched_mm(struct task_struct *p)
> +{
> + struct callback_head *work = &p->cache_work;
> +
> + init_task_work(work, task_cache_work);
> + work->next = work;
> +}
> +
> +#else
> +
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
> + s64 delta_exec) { }
> +
> +void init_sched_mm(struct task_struct *p) { }
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
> +
> +#endif
> +
> /*
> * Used by other classes to account runtime.
> */
> @@ -13124,6 +13380,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
> if (static_branch_unlikely(&sched_numa_balancing))
> task_tick_numa(rq, curr);
>
> + task_tick_cache(rq, curr);
> +
> update_misfit_status(curr, rq);
> check_update_overutilized_status(task_rq(curr));
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index adfb6e3409d7..84118b522f22 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1194,6 +1194,12 @@ struct rq {
> u64 clock_pelt_idle_copy;
> u64 clock_idle_copy;
> #endif
> +#ifdef CONFIG_SCHED_CACHE
> + raw_spinlock_t cpu_epoch_lock ____cacheline_aligned;
> + u64 cpu_runtime;
> + unsigned long cpu_epoch;
> + unsigned long cpu_epoch_next;
> +#endif
>
> atomic_t nr_iowait;
>
> @@ -3819,6 +3825,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
> static inline void init_sched_mm_cid(struct task_struct *t) { }
> #endif /* !CONFIG_SCHED_MM_CID */
>
> +extern void init_sched_mm(struct task_struct *p);
> +
> extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
> extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
> static inline