Message-ID: <20250904041516.3046-9-kprateek.nayak@amd.com>
Date: Thu, 4 Sep 2025 04:15:04 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
	Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
	<tglx@...utronix.de>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
	<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, K Prateek Nayak
	<kprateek.nayak@....com>, "Gautham R. Shenoy" <gautham.shenoy@....com>,
	Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 08/19] sched/topology: Introduce fallback sd->shared assignment

Going forward, tying nohz balancing to "sd->shared" will require each
CPU's hierarchy to have at least one "sd->shared" object tracking its
idle status.

If the lowest domain of the hierarchy after degeneration does not have
the SD_SHARE_LLC flag set, assign a per-node fallback shared object to
that lowest domain on CONFIG_NO_HZ_COMMON kernels. !CONFIG_NO_HZ_COMMON
kernels always keep the tick enabled on idle CPUs and do not require
nohz idle tracking.
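
As a simplified sketch of the cpu_attach_domain() change below (with
assign_fallback_sds() inlined), the fallback assignment amounts to:

  /* After the degeneration pass in cpu_attach_domain() */
  if (sd && !(sd->flags & SD_SHARE_LLC)) {
          /* Lowest remaining domain has no LLC-wide shared object */
          sd->shared = d->fallback_nohz_sds[cpu_to_node(cpu)];
          atomic_inc(&sd->shared->ref);
  }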

An example scenario where the fallback shared object is used is as
follows. Consider a cpuset with 17 CPUs, where 16 CPUs are from the same
LLC and a single CPU is from another LLC:

  CPU0:
  domain0: MC {0-15}
    groups: {0} {1} ... {15}
    domain1: PKG {0-16}
      groups: {0-15} {16}
  ...
  CPU15:
  domain0: MC {0-15}
    groups: {15} {0} {1} ... {14}
    domain1: PKG {0-16}
      groups: {0-15} {16}
  CPU16:
  # MC is degenerated since {16} is the only CPU in the domain
  domain0: PKG {0-15}
    groups: {16} {0-15}
  # Assign sd[PKG]->shared = fallback_nohz_sds[cpu_to_node(16)]

If the lowest domain is an SD_OVERLAP domain, "sd->shared" is only
shared by the CPUs on the same node and not by the entire domain. This
is acceptable since the fallback shared object only keeps track of the
CPU's idle status, unlike sd_llc_shared which also tracks
"has_idle_cores" and "nr_idle_scan".

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 kernel/sched/topology.c | 123 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 117 insertions(+), 6 deletions(-)

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 0b0257937a97..d71c60d99313 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -455,6 +455,96 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
 static void free_pd(struct perf_domain *pd) { }
 #endif /* !(CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */
 
+struct s_data {
+#ifdef CONFIG_NO_HZ_COMMON
+	struct sched_domain_shared **fallback_nohz_sds;
+#endif
+	struct sched_domain_shared * __percpu *sds;
+	struct sched_domain * __percpu *sd;
+	struct root_domain	*rd;
+};
+
+#ifdef CONFIG_NO_HZ_COMMON
+
+static int __fallback_sds_alloc(struct s_data *d, unsigned long *visited_nodes)
+{
+	int j;
+
+	d->fallback_nohz_sds = kcalloc(nr_node_ids,
+			sizeof(*d->fallback_nohz_sds), GFP_KERNEL);
+	if (!d->fallback_nohz_sds)
+		return -ENOMEM;
+
+	/*
+	 * Allocate a fallback sd->shared object
+	 * for each node covered by the cpu_map.
+	 */
+	for_each_set_bit(j, visited_nodes, nr_node_ids) {
+		struct sched_domain_shared *sds;
+
+		sds = kzalloc_node(sizeof(struct sched_domain_shared),
+				GFP_KERNEL, j);
+		if (!sds)
+			return -ENOMEM;
+
+		d->fallback_nohz_sds[j] = sds;
+	}
+
+	return 0;
+}
+
+static void __fallback_sds_free(struct s_data *d)
+{
+	int j;
+
+	if (!d->fallback_nohz_sds)
+		return;
+
+	for (j = 0; j < nr_node_ids; ++j)
+		kfree(d->fallback_nohz_sds[j]);
+
+	kfree(d->fallback_nohz_sds);
+	d->fallback_nohz_sds = NULL;
+}
+
+static void assign_fallback_sds(struct s_data *d, struct sched_domain *sd, int cpu)
+{
+	struct sched_domain_shared *sds;
+
+	sds = d->fallback_nohz_sds[cpu_to_node(cpu)];
+	sd->shared = sds;
+	atomic_inc(&sd->shared->ref);
+}
+
+static void claim_fallback_sds(struct s_data *d)
+{
+	int j;
+
+	/*
+	 * Claim allocations for the fallback shared objects
+	 * if they were assigned during cpu_attach_domain().
+	 */
+	for (j = 0; j < nr_node_ids; ++j) {
+		struct sched_domain_shared *sds = d->fallback_nohz_sds[j];
+
+		if (sds && atomic_read(&sds->ref))
+			d->fallback_nohz_sds[j] = NULL;
+	}
+}
+
+#else /* !CONFIG_NO_HZ_COMMON */
+
+static inline int __fallback_sds_alloc(struct s_data *d, unsigned long *visited_nodes)
+{
+	return 0;
+}
+
+static inline void __fallback_sds_free(struct s_data *d) { }
+static inline void assign_fallback_sds(struct s_data *d, struct sched_domain *sd, int cpu) { }
+static inline void claim_fallback_sds(struct s_data *d) { }
+
+#endif /* CONFIG_NO_HZ_COMMON */
+
 static void free_rootdomain(struct rcu_head *rcu)
 {
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
@@ -716,12 +806,6 @@ static void update_top_cache_domain(int cpu)
 	rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd);
 }
 
-struct s_data {
-	struct sched_domain_shared * __percpu *sds;
-	struct sched_domain * __percpu *sd;
-	struct root_domain	*rd;
-};
-
 /*
  * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
@@ -790,6 +874,14 @@ cpu_attach_domain(struct s_data *d, int cpu)
 		}
 	}
 
+	/*
+	 * Ensure there is at least one domain in the
+	 * hierarchy with sd->shared attached so that
+	 * the CPU participates in nohz balancing.
+	 */
+	if (sd && !(sd->flags & SD_SHARE_LLC))
+		assign_fallback_sds(d, sd, cpu);
+
 	sched_domain_debug(sd, cpu);
 
 	tmp = rq->sd;
@@ -2387,12 +2479,19 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
 static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
 {
+	unsigned long *visited_nodes;
 	int j;
 
+	visited_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
+	if (!visited_nodes)
+		return -ENOMEM;
+
 	d->sds = alloc_percpu(struct sched_domain_shared *);
 	if (!d->sds)
 		return -ENOMEM;
 
+	bitmap_zero(visited_nodes, nr_node_ids);
+
 	for_each_cpu(j, cpu_map) {
 		struct sched_domain_shared *sds;
 
@@ -2401,9 +2500,13 @@ static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
 		if (!sds)
 			return -ENOMEM;
 
+		bitmap_set(visited_nodes, cpu_to_node(j), 1);
 		*per_cpu_ptr(d->sds, j) = sds;
 	}
 
+	if (__fallback_sds_alloc(d, visited_nodes))
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -2417,6 +2520,8 @@ static void __sds_free(struct s_data *d, const struct cpumask *cpu_map)
 	for_each_cpu(j, cpu_map)
 		kfree(*per_cpu_ptr(d->sds, j));
 
+	__fallback_sds_free(d);
+
 	free_percpu(d->sds);
 	d->sds = NULL;
 }
@@ -2655,6 +2760,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		if (lowest_flag_domain(i, SD_CLUSTER))
 			has_cluster = true;
 	}
+
+	/*
+	 * Claim allocations for the fallback shared objects
+	 * if they were assigned during cpu_attach_domain().
+	 */
+	claim_fallback_sds(&d);
 	rcu_read_unlock();
 
 	if (has_asym)
-- 
2.34.1

