Message-ID: <22750d4a-fdd6-48ad-a2ca-aa0c12af329c@linux.ibm.com>
Date: Wed, 15 Oct 2025 00:42:48 +0530
From: Madadi Vineeth Reddy <vineethr@...ux.ibm.com>
To: Tim Chen <tim.c.chen@...ux.intel.com>
Cc: Peter Zijlstra <peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>,
        K Prateek Nayak <kprateek.nayak@....com>,
        "Gautham R . Shenoy" <gautham.shenoy@....com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Juri Lelli <juri.lelli@...hat.com>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
        Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
        Hillf Danton <hdanton@...a.com>,
        Shrikanth Hegde <sshegde@...ux.ibm.com>,
        Jianyong Wu <jianyong.wu@...look.com>, Yangyu Chen <cyy@...self.name>,
        Tingyin Duan <tingyin.duan@...il.com>, Vern Hao <vernhao@...cent.com>,
        Len Brown <len.brown@...el.com>, Aubrey Li <aubrey.li@...el.com>,
        Zhao Liu <zhao1.liu@...el.com>, Chen Yu <yu.chen.surf@...il.com>,
        Chen Yu <yu.c.chen@...el.com>, Libo Chen <libo.chen@...cle.com>,
        Adam Li <adamli@...amperecomputing.com>,
        Tim Chen <tim.c.chen@...el.com>, linux-kernel@...r.kernel.org,
        Madadi Vineeth Reddy <vineethr@...ux.ibm.com>
Subject: Re: [PATCH 01/19] sched/fair: Add infrastructure for cache-aware load
 balancing

On 11/10/25 23:54, Tim Chen wrote:
> From: "Peter Zijlstra (Intel)" <peterz@...radead.org>
> 
> Cache-aware load balancing aims to aggregate tasks with potential
> shared resources into the same cache domain. This approach enhances
> cache locality, thereby optimizing system performance by reducing
> cache misses and improving data access efficiency.
> 

[snip]

> +static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu,
> +			      int pref_nid, int curr_cpu)
> +{
> +#ifdef CONFIG_NUMA_BALANCING
> +	/* First honor the task's preferred node. */
> +	if (pref_nid != NUMA_NO_NODE)
> +		cpumask_or(cpus, cpus, cpumask_of_node(pref_nid));
> +#endif
> +
> +	/* Next honor the task's cache CPU if it is not included. */
> +	if (cache_cpu != -1 && !cpumask_test_cpu(cache_cpu, cpus))
> +		cpumask_or(cpus, cpus,
> +			   cpumask_of_node(cpu_to_node(cache_cpu)));
> +
> +	/*
> +	 * Lastly make sure that the task's current running node is
> +	 * considered.
> +	 */
> +	if (!cpumask_test_cpu(curr_cpu, cpus))
> +		cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu)));
> +}
> +
> +static void __no_profile task_cache_work(struct callback_head *work)
> +{
> +	struct task_struct *p = current;
> +	struct mm_struct *mm = p->mm;
> +	unsigned long m_a_occ = 0;
> +	unsigned long curr_m_a_occ = 0;
> +	int cpu, m_a_cpu = -1, cache_cpu,
> +	    pref_nid = NUMA_NO_NODE, curr_cpu;
> +	cpumask_var_t cpus;
> +
> +	WARN_ON_ONCE(work != &p->cache_work);
> +
> +	work->next = work;
> +
> +	if (p->flags & PF_EXITING)
> +		return;
> +
> +	if (!zalloc_cpumask_var(&cpus, GFP_KERNEL))
> +		return;
> +
> +	curr_cpu = task_cpu(p);
> +	cache_cpu = mm->mm_sched_cpu;
> +#ifdef CONFIG_NUMA_BALANCING
> +	if (static_branch_likely(&sched_numa_balancing))
> +		pref_nid = p->numa_preferred_nid;
> +#endif
> +
> +	scoped_guard (cpus_read_lock) {
> +		get_scan_cpumasks(cpus, cache_cpu,
> +				  pref_nid, curr_cpu);
> +

IIUC, get_scan_cpumasks() ORs together the cpumasks of the task's preferred NUMA
node, the cache CPU's node, and the current CPU's node. This can result in
scanning multiple nodes rather than giving preference to the NUMA preferred node.
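For example, if pref_nid is node 1 while cache_cpu is on node 0 and the task is
currently running on node 2, the resulting mask covers all CPUs of nodes 0, 1
and 2, and the scan loop below then walks every LLC on those three nodes.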

> +		for_each_cpu(cpu, cpus) {
> +			/* XXX sched_cluster_active */
> +			struct sched_domain *sd = per_cpu(sd_llc, cpu);
> +			unsigned long occ, m_occ = 0, a_occ = 0;
> +			int m_cpu = -1, i;
> +
> +			if (!sd)
> +				continue;
> +
> +			for_each_cpu(i, sched_domain_span(sd)) {
> +				occ = fraction_mm_sched(cpu_rq(i),
> +							per_cpu_ptr(mm->pcpu_sched, i));
> +				a_occ += occ;
> +				if (occ > m_occ) {
> +					m_occ = occ;
> +					m_cpu = i;
> +				}
> +			}
> +
> +			/*
> +			 * Compare the accumulated occupancy of each LLC. The
> +			 * reason for using accumulated occupancy rather than average
> +			 * per CPU occupancy is that it works better in asymmetric LLC
> +			 * scenarios.
> +			 * For example, if there are 2 threads in a 4CPU LLC and 3
> +			 * threads in an 8CPU LLC, it might be better to choose the one
> +			 * with 3 threads. However, this would not be the case if the
> +			 * occupancy is divided by the number of CPUs in an LLC (i.e.,
> +			 * if average per CPU occupancy is used).
> +			 * Besides, NUMA balancing fault statistics behave similarly:
> +			 * the total number of faults per node is compared rather than
> +			 * the average number of faults per CPU. This strategy is also
> +			 * followed here.
> +			 */
> +			if (a_occ > m_a_occ) {
> +				m_a_occ = a_occ;
> +				m_a_cpu = m_cpu;
> +			}
> +
> +			if (llc_id(cpu) == llc_id(mm->mm_sched_cpu))
> +				curr_m_a_occ = a_occ;
> +
> +			cpumask_andnot(cpus, cpus, sched_domain_span(sd));
> +		}

This means NUMA preference has no effect on the selection, except in the
unlikely case of exactly equal occupancy across LLCs on different nodes
(where iteration order determines the winner).

How is the case handled where cache locality and memory locality conflict?
Shouldn't the NUMA preferred node get preference? Also, scanning multiple
nodes adds overhead, so would it be better to restrict the scan to the NUMA
preferred node and scan the other nodes only when there is no preferred node?
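
Something along these lines (untested, just to illustrate the idea; the
fallback ordering when there is no preferred node is my assumption, not
something from the patch) is what I had in mind:

static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu,
			      int pref_nid, int curr_cpu)
{
#ifdef CONFIG_NUMA_BALANCING
	/* Scan only the preferred node when the task has one. */
	if (pref_nid != NUMA_NO_NODE) {
		cpumask_or(cpus, cpus, cpumask_of_node(pref_nid));
		return;
	}
#endif

	/* No NUMA preference: consider the cache CPU's node... */
	if (cache_cpu != -1)
		cpumask_or(cpus, cpus,
			   cpumask_of_node(cpu_to_node(cache_cpu)));

	/* ...and the node the task is currently running on. */
	if (!cpumask_test_cpu(curr_cpu, cpus))
		cpumask_or(cpus, cpus,
			   cpumask_of_node(cpu_to_node(curr_cpu)));
}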

Let me know if I am missing anything.

Thanks,
Madadi Vineeth Reddy


> +	}
> +
> +	if (m_a_occ > (2 * curr_m_a_occ)) {
> +		/*
> +		 * Avoid switching mm_sched_cpu too fast.
> +		 * The reasons for choosing 2X:
> +		 * 1. It is better to keep the preferred LLC stable,
> +		 *    rather than changing it frequently and causing migrations.
> +		 * 2. 2X means the new preferred LLC has at least 1 more
> +		 *    busy CPU than the old one (e.g., 200% vs 100%).
> +		 * 3. 2X is chosen based on test results, as it delivers
> +		 *    the optimal performance gain so far.
> +		 */
> +		mm->mm_sched_cpu = m_a_cpu;
> +	}
> +
> +	free_cpumask_var(cpus);
> +}
> +
> +void init_sched_mm(struct task_struct *p)
> +{
> +	struct callback_head *work = &p->cache_work;
> +
> +	init_task_work(work, task_cache_work);
> +	work->next = work;
> +}
> +
> +#else
> +
> +static inline void account_mm_sched(struct rq *rq, struct task_struct *p,
> +				    s64 delta_exec) { }
> +
> +void init_sched_mm(struct task_struct *p) { }
> +
> +static void task_tick_cache(struct rq *rq, struct task_struct *p) { }
> +
> +#endif
> +
>  /*
>   * Used by other classes to account runtime.
>   */
> @@ -13031,6 +13317,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
>  	if (static_branch_unlikely(&sched_numa_balancing))
>  		task_tick_numa(rq, curr);
>  
> +	task_tick_cache(rq, curr);
> +
>  	update_misfit_status(curr, rq);
>  	check_update_overutilized_status(task_rq(curr));
>  
> diff --git a/kernel/sched/features.h b/kernel/sched/features.h
> index 3c12d9f93331..d2af7bfd36bf 100644
> --- a/kernel/sched/features.h
> +++ b/kernel/sched/features.h
> @@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true)
>   */
>  SCHED_FEAT(SIS_UTIL, true)
>  
> +SCHED_FEAT(SCHED_CACHE, true)
>  /*
>   * Issue a WARN when we do multiple update_rq_clock() calls
>   * in a single rq->lock section. Default disabled because the
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index be9745d104f7..2ded8d3d0ecc 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1166,6 +1166,12 @@ struct rq {
>  	u64			clock_pelt_idle_copy;
>  	u64			clock_idle_copy;
>  #endif
> +#ifdef CONFIG_SCHED_CACHE
> +	raw_spinlock_t		cpu_epoch_lock ____cacheline_aligned;
> +	u64			cpu_runtime;
> +	unsigned long		cpu_epoch;
> +	unsigned long		cpu_epoch_next;
> +#endif
>  
>  	atomic_t		nr_iowait;
>  
> @@ -3790,6 +3796,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { }
>  static inline void init_sched_mm_cid(struct task_struct *t) { }
>  #endif /* !CONFIG_SCHED_MM_CID */
>  
> +extern void init_sched_mm(struct task_struct *p);
> +
>  extern u64 avg_vruntime(struct cfs_rq *cfs_rq);
>  extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se);
>  static inline

