linux-kernel - [RFC PATCH 02/19] sched/topology: Optimize sd->shared allocation and assignment

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250904041516.3046-3-kprateek.nayak@amd.com>
Date: Thu, 4 Sep 2025 04:14:58 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
	Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
	<tglx@...utronix.de>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
	<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, K Prateek Nayak
	<kprateek.nayak@....com>, "Gautham R. Shenoy" <gautham.shenoy@....com>,
	Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 02/19] sched/topology: Optimize sd->shared allocation and assignment

sd->shared is ever only used for sd_llc_shared and the sd->shared of
lower domains are never referenced by the scheduler. Subsequent
optimization will bloat the size of sd->shared and allocation of
redundant "sd->shared" can be expensive.

sd_llc is always a !SD_OVERLAP domain and it's children are always a
subset of the domain, thus the degeneration of a SD_SHARE_LLC  domain
implies that it either contains a single CPU, or its span matches with
that of a its child domain.

Initialize a single level of per-CPU sched_domain_shared objects instead
of doing it for each topology level. Assign sd->shared to the highest
SD_SHARE_LLC domain and rely on the degeneration path to pass it to the
correct topology level.

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@....com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
Valentin, Peter,

Was this ever attempted in the past? Is there some interesting topology
out there that I'm missing where this wouldn't work?
---
 include/linux/sched/topology.h |   1 -
 kernel/sched/topology.c        | 101 +++++++++++++++++++++++----------
 2 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index bbcfdf12aa6e..d816911de435 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -168,7 +168,6 @@ typedef int (*sched_domain_flags_f)(void);
 
 struct sd_data {
 	struct sched_domain *__percpu *sd;
-	struct sched_domain_shared *__percpu *sds;
 	struct sched_group *__percpu *sg;
 	struct sched_group_capacity *__percpu *sgc;
 };
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1104d931c015..f3ebb9e8fe00 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -679,6 +679,9 @@ static void update_top_cache_domain(int cpu)
 	if (sd) {
 		id = cpumask_first(sched_domain_span(sd));
 		size = cpumask_weight(sched_domain_span(sd));
+
+		/* If sd_llc exists, sd_llc_shared should exist too. */
+		WARN_ON_ONCE(!sd->shared);
 		sds = sd->shared;
 	}
 
@@ -727,6 +730,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 		if (sd_parent_degenerate(tmp, parent)) {
 			tmp->parent = parent->parent;
 
+			/* Pick reference to parent->shared. */
+			if (parent->shared) {
+				WARN_ON_ONCE(tmp->shared);
+				tmp->shared = parent->shared;
+				parent->shared = NULL;
+			}
+
 			if (parent->parent) {
 				parent->parent->child = tmp;
 				parent->parent->groups->flags = tmp->flags;
@@ -776,6 +786,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 }
 
 struct s_data {
+	struct sched_domain_shared * __percpu *sds;
 	struct sched_domain * __percpu *sd;
 	struct root_domain	*rd;
 };
@@ -783,6 +794,7 @@ struct s_data {
 enum s_alloc {
 	sa_rootdomain,
 	sa_sd,
+	sa_sd_shared,
 	sa_sd_storage,
 	sa_none,
 };
@@ -1529,6 +1541,9 @@ static void set_domain_attribute(struct sched_domain *sd,
 static void __sdt_free(const struct cpumask *cpu_map);
 static int __sdt_alloc(const struct cpumask *cpu_map);
 
+static void __sds_free(struct s_data *d, const struct cpumask *cpu_map);
+static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map);
+
 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 				 const struct cpumask *cpu_map)
 {
@@ -1540,6 +1555,9 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 	case sa_sd:
 		free_percpu(d->sd);
 		fallthrough;
+	case sa_sd_shared:
+		__sds_free(d, cpu_map);
+		fallthrough;
 	case sa_sd_storage:
 		__sdt_free(cpu_map);
 		fallthrough;
@@ -1555,9 +1573,11 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
 
 	if (__sdt_alloc(cpu_map))
 		return sa_sd_storage;
+	if (__sds_alloc(d, cpu_map))
+		return sa_sd_shared;
 	d->sd = alloc_percpu(struct sched_domain *);
 	if (!d->sd)
-		return sa_sd_storage;
+		return sa_sd_shared;
 	d->rd = alloc_rootdomain();
 	if (!d->rd)
 		return sa_sd;
@@ -1577,9 +1597,6 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
-		*per_cpu_ptr(sdd->sds, cpu) = NULL;
-
 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
 
@@ -1709,16 +1726,6 @@ sd_init(struct sched_domain_topology_level *tl,
 		sd->cache_nice_tries = 1;
 	}
 
-	/*
-	 * For all levels sharing cache; connect a sched_domain_shared
-	 * instance.
-	 */
-	if (sd->flags & SD_SHARE_LLC) {
-		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-		atomic_inc(&sd->shared->ref);
-		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-	}
-
 	sd->private = sdd;
 
 	return sd;
@@ -2295,10 +2302,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sd)
 			return -ENOMEM;
 
-		sdd->sds = alloc_percpu(struct sched_domain_shared *);
-		if (!sdd->sds)
-			return -ENOMEM;
-
 		sdd->sg = alloc_percpu(struct sched_group *);
 		if (!sdd->sg)
 			return -ENOMEM;
@@ -2309,7 +2312,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
-			struct sched_domain_shared *sds;
 			struct sched_group *sg;
 			struct sched_group_capacity *sgc;
 
@@ -2320,13 +2322,6 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
 			*per_cpu_ptr(sdd->sd, j) = sd;
 
-			sds = kzalloc_node(sizeof(struct sched_domain_shared),
-					GFP_KERNEL, cpu_to_node(j));
-			if (!sds)
-				return -ENOMEM;
-
-			*per_cpu_ptr(sdd->sds, j) = sds;
-
 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
 			if (!sg)
@@ -2368,8 +2363,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
 				kfree(*per_cpu_ptr(sdd->sd, j));
 			}
 
-			if (sdd->sds)
-				kfree(*per_cpu_ptr(sdd->sds, j));
 			if (sdd->sg)
 				kfree(*per_cpu_ptr(sdd->sg, j));
 			if (sdd->sgc)
@@ -2377,8 +2370,6 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		}
 		free_percpu(sdd->sd);
 		sdd->sd = NULL;
-		free_percpu(sdd->sds);
-		sdd->sds = NULL;
 		free_percpu(sdd->sg);
 		sdd->sg = NULL;
 		free_percpu(sdd->sgc);
@@ -2386,6 +2377,42 @@ static void __sdt_free(const struct cpumask *cpu_map)
 	}
 }
 
+static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
+{
+	int j;
+
+	d->sds = alloc_percpu(struct sched_domain_shared *);
+	if (!d->sds)
+		return -ENOMEM;
+
+	for_each_cpu(j, cpu_map) {
+		struct sched_domain_shared *sds;
+
+		sds = kzalloc_node(sizeof(struct sched_domain_shared),
+				GFP_KERNEL, cpu_to_node(j));
+		if (!sds)
+			return -ENOMEM;
+
+		*per_cpu_ptr(d->sds, j) = sds;
+	}
+
+	return 0;
+}
+
+static void __sds_free(struct s_data *d, const struct cpumask *cpu_map)
+{
+	int j;
+
+	if (!d->sds)
+		return;
+
+	for_each_cpu(j, cpu_map)
+		kfree(*per_cpu_ptr(d->sds, j));
+
+	free_percpu(d->sds);
+	d->sds = NULL;
+}
+
 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
@@ -2537,8 +2564,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		unsigned int imb_span = 1;
 
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			struct sched_domain *parent = sd->parent;
 			struct sched_domain *child = sd->child;
 
+			/* Attach sd->shared to the topmost SD_SHARE_LLC domain. */
+			if ((sd->flags & SD_SHARE_LLC) &&
+			    (!parent || !(parent->flags & SD_SHARE_LLC))) {
+				int llc_id = cpumask_first(sched_domain_span(sd));
+
+				sd->shared = *per_cpu_ptr(d.sds, llc_id);
+				atomic_set(&sd->shared->nr_busy_cpus, sd->span_weight);
+				atomic_inc(&sd->shared->ref);
+			}
+
 			if (!(sd->flags & SD_SHARE_LLC) && child &&
 			    (child->flags & SD_SHARE_LLC)) {
 				struct sched_domain __rcu *top_p;
@@ -2591,6 +2629,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		if (!cpumask_test_cpu(i, cpu_map))
 			continue;
 
+		if (atomic_read(&(*per_cpu_ptr(d.sds, i))->ref))
+			*per_cpu_ptr(d.sds, i) = NULL;
+
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			claim_allocations(i, sd);
 			init_sched_groups_capacity(i, sd);
-- 
2.34.1