Message-Id: <7d75af576986cf447a171ce11f5e8a15a692e780.1760206683.git.tim.c.chen@linux.intel.com>
Date: Sat, 11 Oct 2025 11:24:42 -0700
From: Tim Chen <tim.c.chen@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>,
	Ingo Molnar <mingo@...hat.com>,
	K Prateek Nayak <kprateek.nayak@....com>,
	"Gautham R . Shenoy" <gautham.shenoy@....com>
Cc: Tim Chen <tim.c.chen@...ux.intel.com>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Juri Lelli <juri.lelli@...hat.com>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>,
	Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>,
	Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
	Hillf Danton <hdanton@...a.com>,
	Shrikanth Hegde <sshegde@...ux.ibm.com>,
	Jianyong Wu <jianyong.wu@...look.com>,
	Yangyu Chen <cyy@...self.name>,
	Tingyin Duan <tingyin.duan@...il.com>,
	Vern Hao <vernhao@...cent.com>,
	Len Brown <len.brown@...el.com>,
	Aubrey Li <aubrey.li@...el.com>,
	Zhao Liu <zhao1.liu@...el.com>,
	Chen Yu <yu.chen.surf@...il.com>,
	Chen Yu <yu.c.chen@...el.com>,
	Libo Chen <libo.chen@...cle.com>,
	Adam Li <adamli@...amperecomputing.com>,
	Tim Chen <tim.c.chen@...el.com>,
	linux-kernel@...r.kernel.org
Subject: [PATCH 05/19] sched/fair: Add LLC index mapping for CPUs

Introduce an index mapping between CPUs and their LLCs. This provides
a continuous per-LLC index needed for cache-aware load balancing in
later patches.

The existing per-CPU llc_id usually holds the first CPU of the LLC
domain. These values are sparse across the CPU range and unsuitable as
an array index, so using llc_id directly would waste memory.
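
For illustration only, here is the sizing difference between the two
indexing schemes for a hypothetical per-LLC array (neither the
structure nor the arrays below exist in this patch):

  /* illustrative only; llc_stats is not a real kernel structure */
  struct llc_stats { unsigned long nr_tasks; };

  struct llc_stats stats_by_id[NR_CPUS];   /* indexed by sparse llc_id   */
  struct llc_stats stats_by_idx[NR_LLCS];  /* indexed by dense llc_idx() */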

With the new mapping, CPUs in the same LLC share a continuous index:

  per_cpu(llc_idx, CPU=0...15)  = 0
  per_cpu(llc_idx, CPU=16...31) = 1
  per_cpu(llc_idx, CPU=32...47) = 2
  ...

The maximum number of LLCs is limited by CONFIG_NR_LLCS. If the number
of available LLCs exceeds CONFIG_NR_LLCS, cache-aware load balancing is
disabled. To further save memory, this array could be converted to
dynamic allocation in the future, or the LLC index could be made NUMA
node-wide.
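
As a standalone illustration of how the dense indices are handed out,
here is a user-space sketch only: the CPU count, the 16-CPU LLC span
and the names NR_CPUS_DEMO/llc_id()/llc_idx[] are made up for the demo,
while the kernel side uses the per-CPU sd_llc_id/sd_llc_idx variables
added in the patch below.

  #include <stdio.h>

  #define NR_LLCS	64
  #define NR_CPUS_DEMO	48

  /* Sparse LLC id: the first CPU of a 16-CPU LLC (0, 16, 32, ...). */
  static int llc_id(int cpu)
  {
  	return (cpu / 16) * 16;
  }

  int main(void)
  {
  	int llc_idx[NR_CPUS_DEMO];
  	int max_llcs = 0;
  	int cpu;

  	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++)
  		llc_idx[cpu] = -1;

  	for (cpu = 0; cpu < NR_CPUS_DEMO; cpu++) {
  		int id = llc_id(cpu);

  		/* First CPU seen for this LLC: hand out the next dense index. */
  		if (llc_idx[id] < 0) {
  			if (max_llcs >= NR_LLCS) {
  				/* Too many LLCs: the kernel disables
  				 * cache-aware load balancing instead. */
  				return 1;
  			}
  			llc_idx[id] = max_llcs++;
  		}
  		llc_idx[cpu] = llc_idx[id];

  		printf("cpu %2d: llc_id %2d -> llc_idx %d\n",
  		       cpu, id, llc_idx[cpu]);
  	}
  	return 0;
  }

Running it prints the same mapping as the example above: CPUs 0..15 get
index 0, CPUs 16..31 get index 1, CPUs 32..47 get index 2.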

As noted by Adam, update_llc_idx() should not be invoked to update the
index when there is no domain with SD_SHARE_LLC; otherwise it would
generate an invalid index.

Co-developed-by: Chen Yu <yu.c.chen@...el.com>
Signed-off-by: Chen Yu <yu.c.chen@...el.com>
Signed-off-by: Tim Chen <tim.c.chen@...ux.intel.com>
---
 include/linux/threads.h | 10 +++++++++
 init/Kconfig            |  9 ++++++++
 kernel/sched/fair.c     | 11 ++++++++++
 kernel/sched/sched.h    |  2 ++
 kernel/sched/topology.c | 47 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 79 insertions(+)

diff --git a/include/linux/threads.h b/include/linux/threads.h
index 1674a471b0b4..2c9b1adfe024 100644
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -20,6 +20,16 @@
 /* Places which use this should consider cpumask_var_t. */
 #define NR_CPUS		CONFIG_NR_CPUS
 
+#ifndef CONFIG_NR_LLCS
+#define CONFIG_NR_LLCS 1
+#endif
+
+#if CONFIG_NR_LLCS > NR_CPUS
+#define NR_LLCS		NR_CPUS
+#else
+#define NR_LLCS		CONFIG_NR_LLCS
+#endif
+
 #define MIN_THREADS_LEFT_FOR_ROOT 4
 
 /*
diff --git a/init/Kconfig b/init/Kconfig
index 4e625db7920a..6e4c96ccdda0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -981,6 +981,15 @@ config SCHED_CACHE
 	  resources within the same cache domain, reducing cache misses and
 	  lowering data access latency.
 
+config NR_LLCS
+	int "Maximum number of Last Level Caches"
+	range 2 1024
+	depends on SMP && SCHED_CACHE
+	default 64
+	help
+	  This allows you to specify the maximum number of last level caches
+	  this kernel will support for cache aware scheduling.
+
 config NUMA_BALANCING_DEFAULT_ENABLED
 	bool "Automatically enable NUMA aware memory/task placement"
 	default y
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3d643449c48c..61c129bde8b6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1224,6 +1224,17 @@ static int llc_id(int cpu)
 	return per_cpu(sd_llc_id, cpu);
 }
 
+/*
+ * continuous LLC index, starting from 0.
+ */
+static inline int llc_idx(int cpu)
+{
+	if (cpu < 0)
+		return -1;
+
+	return per_cpu(sd_llc_idx, cpu);
+}
+
 void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched)
 {
 	unsigned long epoch;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 60f1e51685ec..b448ad6dc51d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2039,6 +2039,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_llc_idx);
 DECLARE_PER_CPU(int, sd_share_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
@@ -2047,6 +2048,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 
 extern struct static_key_false sched_asym_cpucapacity;
 extern struct static_key_false sched_cluster_active;
+extern int max_llcs;
 
 static __always_inline bool sched_asym_cpucap_active(void)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 2675db980f70..4bd033060f1d 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -659,6 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_llc_idx);
 DEFINE_PER_CPU(int, sd_share_id);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
@@ -668,6 +669,40 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
 DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
+int max_llcs = -1;
+
+static void update_llc_idx(int cpu)
+{
+#ifdef CONFIG_SCHED_CACHE
+	int idx = -1, llc_id = -1;
+
+	if (max_llcs > NR_LLCS)
+		return;
+
+	llc_id = per_cpu(sd_llc_id, cpu);
+	idx = per_cpu(sd_llc_idx, llc_id);
+
+	/*
+	 * A new LLC is detected; assign it
+	 * the next free index.
+	 */
+	if (idx < 0) {
+		idx = max_llcs++;
+
+		if (max_llcs > NR_LLCS) {
+			if (static_branch_unlikely(&sched_cache_allowed))
+				static_branch_disable_cpuslocked(&sched_cache_allowed);
+
+			pr_warn_once("CONFIG_NR_LLCS is too small, disable cache aware load balance\n");
+			return;
+		}
+
+		per_cpu(sd_llc_idx, llc_id) = idx;
+	}
+	per_cpu(sd_llc_idx, cpu) = idx;
+#endif
+}
+
 static void update_top_cache_domain(int cpu)
 {
 	struct sched_domain_shared *sds = NULL;
@@ -687,6 +722,10 @@ static void update_top_cache_domain(int cpu)
 	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
+	/* only update the llc index for domain with SD_SHARE_LLC */
+	if (sd)
+		update_llc_idx(cpu);
+
 	sd = lowest_flag_domain(cpu, SD_CLUSTER);
 	if (sd)
 		id = cpumask_first(sched_domain_span(sd));
@@ -2452,6 +2491,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	bool has_asym = false;
 	bool has_cluster = false;
 
+#ifdef CONFIG_SCHED_CACHE
+	if (max_llcs < 0) {
+		for_each_possible_cpu(i)
+			per_cpu(sd_llc_idx, i) = -1;
+		max_llcs = 0;
+	}
+#endif
+
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
 
-- 
2.32.0

