Message-ID: <2f9165fe-55f2-4919-be01-e9d2cdd1f960@amd.com>
Date: Tue, 23 Dec 2025 11:01:40 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Tim Chen <tim.c.chen@...ux.intel.com>, Peter Zijlstra
<peterz@...radead.org>, Ingo Molnar <mingo@...hat.com>, "Gautham R . Shenoy"
<gautham.shenoy@....com>, Vincent Guittot <vincent.guittot@...aro.org>
CC: Juri Lelli <juri.lelli@...hat.com>, Dietmar Eggemann
<dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, Ben Segall
<bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, Valentin Schneider
<vschneid@...hat.com>, Madadi Vineeth Reddy <vineethr@...ux.ibm.com>, "Hillf
Danton" <hdanton@...a.com>, Shrikanth Hegde <sshegde@...ux.ibm.com>,
"Jianyong Wu" <jianyong.wu@...look.com>, Yangyu Chen <cyy@...self.name>,
Tingyin Duan <tingyin.duan@...il.com>, Vern Hao <vernhao@...cent.com>, Vern
Hao <haoxing990@...il.com>, Len Brown <len.brown@...el.com>, Aubrey Li
<aubrey.li@...el.com>, Zhao Liu <zhao1.liu@...el.com>, Chen Yu
<yu.chen.surf@...il.com>, Chen Yu <yu.c.chen@...el.com>, Adam Li
<adamli@...amperecomputing.com>, Aaron Lu <ziqianlu@...edance.com>, Tim Chen
<tim.c.chen@...el.com>, <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v2 04/23] sched/cache: Make LLC id continuous

Hello Tim, Chenyu,

On 12/4/2025 4:37 AM, Tim Chen wrote:
> +/*
> + * Assign continuous llc id for the CPU, and return
> + * the assigned llc id.
> + */
> +static int update_llc_id(struct sched_domain *sd,
> +			 int cpu)
> +{
> +	int id = per_cpu(sd_llc_id, cpu), i;
> +
> +	if (id >= 0)
> +		return id;
> +
> +	if (sd) {
> +		/* Look for any assigned id and reuse it.*/
> +		for_each_cpu(i, sched_domain_span(sd)) {
> +			id = per_cpu(sd_llc_id, i);
> +
> +			if (id >= 0) {
> +				per_cpu(sd_llc_id, cpu) = id;
> +				return id;
> +			}
> +		}
> +	}

I don't really like tying this down to the sched_domain span, since
partitioning and other weirdness can cause the max_llcs count to go
unnecessarily high. The tl->mask() (from sched_domain_topology_level)
should give the mask covering all online CPUs without being constrained
by cpusets.
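
To make the concern concrete, here is a toy userspace model (not kernel
code; the masks and the partition layout are made up) of why keying the
id off the domain span can over-count while tl->mask() does not:

#include <stdio.h>
#include <stdint.h>

/* Toy model: CPUs 0-7 share one physical LLC (all values are made up). */
#define LLC_MASK	0xffULL		/* what tl->mask() would report   */
#define PARTITION_A	0x0fULL		/* cpuset partition with CPUs 0-3 */
#define PARTITION_B	0xf0ULL		/* cpuset partition with CPUs 4-7 */

/* sched_domain_span() is clipped to the partition's cpu_map. */
static uint64_t domain_span(uint64_t llc_mask, uint64_t cpu_map)
{
	return llc_mask & cpu_map;
}

int main(void)
{
	uint64_t span_a = domain_span(LLC_MASK, PARTITION_A);
	uint64_t span_b = domain_span(LLC_MASK, PARTITION_B);

	/* Two distinct spans -> two llc ids if the span is the key. */
	printf("span A: %#llx, span B: %#llx\n",
	       (unsigned long long)span_a, (unsigned long long)span_b);

	/* One topology-level mask -> one llc id for the physical LLC. */
	printf("tl->mask() for any CPU of this LLC: %#llx\n",
	       (unsigned long long)LLC_MASK);

	return 0;
}

Same physical LLC, but the two spans never intersect, so an id scheme
that only looks inside sched_domain_span(sd) can never notice that the
other partition already claimed an id for this cache.
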
How about something like:

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b17d8e3cb55..c19b1c4e6472 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8270,6 +8270,18 @@ static void cpuset_cpu_active(void)
 static void cpuset_cpu_inactive(unsigned int cpu)
 {
 	if (!cpuhp_tasks_frozen) {
+		/*
+		 * This is necessary since offline CPUs are
+		 * taken out of the tl->mask() and a newly
+		 * onlined CPU in same LLC will not realize
+		 * whether it should reuse the LLC ID owned
+		 * by an offline CPU without knowing the
+		 * LLC association.
+		 *
+		 * Safe to release the reference if this is
+		 * the last CPU in the LLC going offline.
+		 */
+		sched_domain_free_llc_id(cpu);
 		cpuset_update_active_cpus();
 	} else {
 		num_cpus_frozen++;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 41caa22e0680..1378a1cfad18 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -631,6 +631,7 @@ void update_sched_domain_debugfs(void)
 			i++;
 		}
 
+		debugfs_create_u32("llc_id", 0444, d_cpu, (u32 *)per_cpu_ptr(&sd_llc_id, cpu));
 		__cpumask_clear_cpu(cpu, sd_sysctl_cpus);
 	}
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3ceaa9dc9a9e..69fad88b57d8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2142,6 +2142,7 @@ extern int group_balance_cpu(struct sched_group *sg);
 
 extern void update_sched_domain_debugfs(void);
 extern void dirty_sched_domain_sysctl(int cpu);
+void sched_domain_free_llc_id(int cpu);
 
 extern int sched_update_scaling(void);
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index cf643a5ddedd..d6e134767f30 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -20,6 +20,46 @@ void sched_domains_mutex_unlock(void)
 /* Protected by sched_domains_mutex: */
 static cpumask_var_t sched_domains_tmpmask;
 static cpumask_var_t sched_domains_tmpmask2;
+static cpumask_var_t sched_llc_id_alloc_mask;
+DEFINE_PER_CPU(int, sd_llc_id) = -1;
+static int max_llcs = 0;
+
+static inline int sched_domain_alloc_llc_id(void)
+{
+	int llc_id;
+
+	lockdep_assert_held(&sched_domains_mutex);
+
+	llc_id = cpumask_first_zero(sched_llc_id_alloc_mask);
+	BUG_ON((unsigned int)llc_id >= nr_cpumask_bits);
+	cpumask_set_cpu(llc_id, sched_llc_id_alloc_mask);
+	++max_llcs;
+
+	return llc_id;
+}
+
+void sched_domain_free_llc_id(int cpu)
+{
+	int i, llc_id = per_cpu(sd_llc_id, cpu);
+	bool found = false;
+
+	lockdep_assert_cpus_held(); /* For cpu_active_mask. */
+	guard(mutex)(&sched_domains_mutex);
+
+	per_cpu(sd_llc_id, cpu) = -1;
+	for_each_cpu(i, cpu_active_mask) {
+		if (per_cpu(sd_llc_id, i) == llc_id) {
+			found = true;
+			break;
+		}
+	}
+
+	/* Allow future hotplugs to claim this ID */
+	if (!found) {
+		cpumask_clear_cpu(llc_id, sched_llc_id_alloc_mask);
+		--max_llcs;
+	}
+}
 
 static int __init sched_debug_setup(char *str)
 {
@@ -658,7 +698,6 @@ static void destroy_sched_domains(struct sched_domain *sd)
  */
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
-DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(int, sd_share_id);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
@@ -684,7 +723,6 @@ static void update_top_cache_domain(int cpu)
 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
 	per_cpu(sd_llc_size, cpu) = size;
-	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
 	sd = lowest_flag_domain(cpu, SD_CLUSTER);
@@ -2567,10 +2605,35 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
 	/* Set up domains for CPUs specified by the cpu_map: */
 	for_each_cpu(i, cpu_map) {
-		struct sched_domain_topology_level *tl;
+		struct sched_domain_topology_level *tl, *tl_llc = NULL;
+		bool done = false;
 
 		sd = NULL;
 		for_each_sd_topology(tl) {
+			int flags = 0;
+
+			if (tl->sd_flags)
+				flags = (*tl->sd_flags)();
+
+			if (flags & SD_SHARE_LLC) {
+				tl_llc = tl;
+
+				/*
+				 * Entire cpu_map has been covered. We are
+				 * traversing only to find the highest
+				 * SD_SHARE_LLC level.
+				 */
+				if (done)
+					continue;
+			}
+
+			/*
+			 * Since SD_SHARE_LLC is SDF_SHARED_CHILD, we can
+			 * safely break out if the entire cpu_map has been
+			 * covered by a child domain.
+			 */
+			if (done)
+				break;
 
 			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
 
@@ -2579,7 +2642,41 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
-				break;
+				done = true;
+		}
+
+		/* First time visiting this CPU. Assign the llc_id. */
+		if (per_cpu(sd_llc_id, i) == -1) {
+			int j, llc_id = -1;
+
+			/*
+			 * In case there are no SD_SHARE_LLC domains,
+			 * each CPU gets its own llc_id. Find the first
+			 * free bit on the mask and use it.
+			 */
+			if (!tl_llc) {
+				per_cpu(sd_llc_id, i) = sched_domain_alloc_llc_id();
+				continue;
+			}
+
+			/*
+			 * Visit all the CPUs of the LLC irrespective of the
+			 * partition constraints and find if any of them have
+			 * a valid llc_id.
+			 */
+			for_each_cpu(j, tl_llc->mask(tl, i)) {
+				llc_id = per_cpu(sd_llc_id, j);
+
+				/* Found a valid llc_id for CPU's LLC. */
+				if (llc_id != -1)
+					break;
+			}
+
+			/* Valid llc_id not found. Allocate a new one. */
+			if (llc_id == -1)
+				llc_id = sched_domain_alloc_llc_id();
+
+			per_cpu(sd_llc_id, i) = llc_id;
 		}
 	}
 
@@ -2759,6 +2856,7 @@ int __init sched_init_domains(const struct cpumask *cpu_map)
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
 	zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
+	zalloc_cpumask_var(&sched_llc_id_alloc_mask, GFP_KERNEL);
 	zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
 
 	arch_update_cpu_topology();
---
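
For reference, the allocate/reuse/free life cycle above boils down to a
small bitmap id allocator. A minimal userspace sketch of that pattern
(the names, the 64-id limit and the 2-CPU LLC are illustrative, not the
kernel interfaces):

#include <stdio.h>
#include <stdint.h>

#define MAX_IDS		64	/* illustrative limit */
#define NR_CPUS		8

static uint64_t id_alloc_mask;	/* models sched_llc_id_alloc_mask */
static int cpu_llc_id[NR_CPUS] = { [0 ... NR_CPUS - 1] = -1 };

/* Claim the lowest free bit, like cpumask_first_zero() + cpumask_set_cpu(). */
static int alloc_llc_id(void)
{
	for (int id = 0; id < MAX_IDS; id++) {
		if (!(id_alloc_mask & (1ULL << id))) {
			id_alloc_mask |= 1ULL << id;
			return id;
		}
	}
	return -1;
}

/* Release the id only when no other CPU still carries it. */
static void free_llc_id(int cpu)
{
	int id = cpu_llc_id[cpu];

	if (id < 0)
		return;

	cpu_llc_id[cpu] = -1;
	for (int i = 0; i < NR_CPUS; i++)
		if (cpu_llc_id[i] == id)
			return;

	id_alloc_mask &= ~(1ULL << id);
}

int main(void)
{
	/* CPUs 0 and 1 share an LLC: CPU1 reuses CPU0's id. */
	cpu_llc_id[0] = alloc_llc_id();
	cpu_llc_id[1] = cpu_llc_id[0];

	free_llc_id(0);	/* CPU1 still holds the id -> keep the bit    */
	printf("mask after CPU0 goes offline: %#llx\n",
	       (unsigned long long)id_alloc_mask);

	free_llc_id(1);	/* last holder gone -> id can be reclaimed    */
	printf("mask after CPU1 goes offline: %#llx\n",
	       (unsigned long long)id_alloc_mask);

	return 0;
}
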
AFAICT, "sd_llc_id" isn't compared across different partitions, so CPUs
that belong to the same physical LLC but end up in different partitions
sharing the same "sd_llc_id" shouldn't be a problem.

Thoughts?

--
Thanks and Regards,
Prateek