Message-ID: <ba4dbdf8-bc37-493d-b2e0-2efb00ea3e19@amd.com>
Date: Fri, 11 Jul 2025 11:20:30 +0530
From: K Prateek Nayak <kprateek.nayak@....com>
To: Li Chen <me@...ux.beauty>, Thomas Gleixner <tglx@...utronix.de>,
 Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
 Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org,
 "H . Peter Anvin" <hpa@...or.com>,
 "Rafael J . Wysocki" <rafael.j.wysocki@...el.com>,
 Peter Zijlstra <peterz@...radead.org>, Sohil Mehta <sohil.mehta@...el.com>,
 Brian Gerst <brgerst@...il.com>,
 Patryk Wlazlyn <patryk.wlazlyn@...ux.intel.com>,
 linux-kernel@...r.kernel.org, Madhavan Srinivasan <maddy@...ux.ibm.com>,
 Michael Ellerman <mpe@...erman.id.au>, Nicholas Piggin <npiggin@...il.com>,
 Christophe Leroy <christophe.leroy@...roup.eu>,
 Heiko Carstens <hca@...ux.ibm.com>, Vasily Gorbik <gor@...ux.ibm.com>,
 Alexander Gordeev <agordeev@...ux.ibm.com>,
 Christian Borntraeger <borntraeger@...ux.ibm.com>,
 Sven Schnelle <svens@...ux.ibm.com>, Juri Lelli <juri.lelli@...hat.com>,
 Vincent Guittot <vincent.guittot@...aro.org>,
 Dietmar Eggemann <dietmar.eggemann@....com>,
 Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>,
 Mel Gorman <mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>,
 Thomas Weißschuh <thomas.weissschuh@...utronix.de>,
 Li Chen <chenl311@...natelecom.cn>, Huacai Chen <chenhuacai@...nel.org>,
 Bibo Mao <maobibo@...ngson.cn>, Tobias Huschle <huschle@...ux.ibm.com>,
 Mete Durlu <meted@...ux.ibm.com>, Joel Granados <joel.granados@...nel.org>,
 Guo Weikang <guoweikang.kernel@...il.com>,
 Swapnil Sapkal <swapnil.sapkal@....com>, linuxppc-dev@...ts.ozlabs.org,
 linux-s390@...r.kernel.org
Subject: Re: [PATCH v5 1/4] smpboot: introduce SDTL_INIT() helper to tidy
 sched topology setup

On 7/10/2025 4:27 PM, Li Chen wrote:
>  	/*
>  	 * .. and append 'j' levels of NUMA goodness.
>  	 */
>  	for (j = 1; j < nr_levels; i++, j++) {
> -		tl[i] = (struct sched_domain_topology_level){
> -			.mask = sd_numa_mask,
> -			.sd_flags = cpu_numa_flags,
> -			.flags = SDTL_OVERLAP,
> -			.numa_level = j,
> -			SD_INIT_NAME(NUMA)
> -		};
> +		tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
> +		tl[i].numa_level = j;
> +		tl[i].flags = SDTL_OVERLAP;

Tangential discussion: I was looking at this and wondered why we need
"tl->flags" at all when there is already the sd_flags() callback and we
could simply add SD_OVERLAP to cpu_numa_flags().
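
For concreteness, that alternative would be roughly the below
(untested) against the callback in include/linux/sched/topology.h:

 static inline int cpu_numa_flags(void)
 {
-	return SD_NUMA;
+	/* Overlap is implied for all NUMA levels above NODE */
+	return SD_NUMA | SD_OVERLAP;
 }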

I think "tl->flags" was needed when the idea of overlap domains was
added in commit e3589f6c81e4 ("sched: Allow for overlapping sched_domain
spans") when it depended on "FORCE_SD_OVERLAP" sched_feat() which
allowed toggling this off but that was done away with in commit
af85596c74de ("sched/topology: Remove FORCE_SD_OVERLAP") so perhaps we
can get rid of it now?

Relying on SD_NUMA should be enough currently: the NUMA levels are the
only ones that set SDTL_OVERLAP, so SD_NUMA and SD_OVERLAP end up set
on exactly the same domains. Peter, Valentin, what do you think of
something like below?

(Build and boot tested on top of this series on tip:sched/core)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index b04a5d04dee9..42839cfa2778 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -153,14 +153,6 @@ SD_FLAG(SD_ASYM_PACKING, SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_PREFER_SIBLING, SDF_NEEDS_GROUPS)
 
-/*
- * sched_groups of this level overlap
- *
- * SHARED_PARENT: Set for all NUMA levels above NODE.
- * NEEDS_GROUPS: Overlaps can only exist with more than one group.
- */
-SD_FLAG(SD_OVERLAP, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
-
 /*
  * Cross-node balancing
  *
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 0d5daaa277b7..5263746b63e8 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -175,8 +175,6 @@ bool cpus_share_resources(int this_cpu, int that_cpu);
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
 
-#define SDTL_OVERLAP	0x01
-
 struct sd_data {
 	struct sched_domain *__percpu *sd;
 	struct sched_domain_shared *__percpu *sds;
@@ -187,7 +185,6 @@ struct sd_data {
 struct sched_domain_topology_level {
 	sched_domain_mask_f mask;
 	sched_domain_flags_f sd_flags;
-	int		    flags;
 	int		    numa_level;
 	struct sd_data      data;
 	char                *name;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 20a845697c1d..b9b4bbbf0af6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9926,9 +9926,9 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 	min_capacity = ULONG_MAX;
 	max_capacity = 0;
 
-	if (child->flags & SD_OVERLAP) {
+	if (child->flags & SD_NUMA) {
 		/*
-		 * SD_OVERLAP domains cannot assume that child groups
+		 * SD_NUMA domains cannot assume that child groups
 		 * span the current group.
 		 */
 
@@ -9941,7 +9941,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 		}
 	} else  {
 		/*
-		 * !SD_OVERLAP domains can assume that child groups
+		 * !SD_NUMA domains can assume that child groups
 		 * span the current group.
 		 */
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d01f5a49f2e7..977e133bb8a4 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -89,7 +89,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!(sd->flags & SD_OVERLAP) &&
+		if (!(sd->flags & SD_NUMA) &&
 		    cpumask_intersects(groupmask, sched_group_span(group))) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: repeated CPUs\n");
@@ -102,7 +102,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 				group->sgc->id,
 				cpumask_pr_args(sched_group_span(group)));
 
-		if ((sd->flags & SD_OVERLAP) &&
+		if ((sd->flags & SD_NUMA) &&
 		    !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
 			printk(KERN_CONT " mask=%*pbl",
 				cpumask_pr_args(group_balance_mask(group)));
@@ -1344,7 +1344,7 @@ void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio)
 		 * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu"
 		 * which is shared by all the overlapping groups.
 		 */
-		WARN_ON_ONCE(sd->flags & SD_OVERLAP);
+		WARN_ON_ONCE(sd->flags & SD_NUMA);
 
 		sg = sd->groups;
 		if (cpu != sg->asym_prefer_cpu) {
@@ -2016,7 +2016,6 @@ void sched_init_numa(int offline_node)
 	for (j = 1; j < nr_levels; i++, j++) {
 		tl[i] = SDTL_INIT(sd_numa_mask, cpu_numa_flags, NUMA);
 		tl[i].numa_level = j;
-		tl[i].flags = SDTL_OVERLAP;
 	}
 
 	sched_domain_topology_saved = sched_domain_topology;
@@ -2327,7 +2326,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
 			if (sdd->sd) {
 				sd = *per_cpu_ptr(sdd->sd, j);
-				if (sd && (sd->flags & SD_OVERLAP))
+				if (sd && (sd->flags & SD_NUMA))
 					free_sched_groups(sd->groups, 0);
 				kfree(*per_cpu_ptr(sdd->sd, j));
 			}
@@ -2393,9 +2392,13 @@ static bool topology_span_sane(const struct cpumask *cpu_map)
 	id_seen = sched_domains_tmpmask2;
 
 	for_each_sd_topology(tl) {
+		int tl_common_flags = 0;
+
+		if (tl->sd_flags)
+			tl_common_flags = (*tl->sd_flags)();
 
 		/* NUMA levels are allowed to overlap */
-		if (tl->flags & SDTL_OVERLAP)
+		if (tl_common_flags & SD_NUMA)
 			continue;
 
 		cpumask_clear(covered);
@@ -2466,8 +2469,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
 			if (tl == sched_domain_topology)
 				*per_cpu_ptr(d.sd, i) = sd;
-			if (tl->flags & SDTL_OVERLAP)
-				sd->flags |= SD_OVERLAP;
 			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
 				break;
 		}
@@ -2480,7 +2481,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
-			if (sd->flags & SD_OVERLAP) {
+			if (sd->flags & SD_NUMA) {
 				if (build_overlap_sched_groups(sd, i))
 					goto error;
 			} else {
---

We can also keep SD_OVERLAP and only remove SDTL_OVERLAP and tl->flags
if that is preferred, or keep both if you see future !NUMA use cases
for overlapping domains.
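
If the SD_OVERLAP name is worth keeping for readability at its call
sites, one more (hypothetical, untested) option on top of the diff
above would be a plain alias, since all overlapping domains are NUMA
domains today:

	/* hypothetical: keep the name, alias it to SD_NUMA */
	#define SD_OVERLAP	SD_NUMA

though spelling SD_NUMA directly, as done above, is arguably clearer.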

>  	}
>  
>  	sched_domain_topology_saved = sched_domain_topology;

-- 
Thanks and Regards,
Prateek

