Message-ID: <20250904041516.3046-12-kprateek.nayak@amd.com>
Date: Thu, 4 Sep 2025 04:15:07 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
	Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
	<tglx@...utronix.de>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
	<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, K Prateek Nayak
	<kprateek.nayak@....com>, "Gautham R. Shenoy" <gautham.shenoy@....com>,
	Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 11/19] sched/topology: Introduce "nohz_shared_list" to keep track of sd->shared

Chain together all the "sd_nohz->shared" objects in an RCU-protected
"nohz_shared_list" to get a full view of the system for nohz idle
balancing. Since __sds_nohz_idle_alloc() is now also responsible for
initializing the list_head, rename it to __sds_nohz_idle_alloc_init()
to reflect that.

List modifications are protected by "nohz_shared_list_lock", and
freeing of an sd->shared that is on the list is delayed by one RCU
grace period to ensure that any references to the shared object still
held by RCU readers have been dropped.
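
Concretely, the update side (condensed from the code below) follows
the standard RCU list pattern: list_add_rcu()/list_del_rcu() run under
"nohz_shared_list_lock", and the actual kfree() is deferred via
call_rcu():

	/* registration, with IRQs already disabled by the caller */
	guard(raw_spinlock)(&nohz_shared_list_lock);
	list_add_rcu(&sds->nohz_list_node, &nohz_shared_list);

	/* teardown: unlink now, free only after a grace period */
	scoped_guard(raw_spinlock_irqsave, &nohz_shared_list_lock)
		list_del_rcu(&sds->nohz_list_node);
	call_rcu(&sds->rcu, destroy_sched_domain_shared_rcu);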

Users of "nohz_shared_list" will be added in subsequent patches where
all the current users of "nohz.idle_cpus" will be converted to use the
distributed tracking.
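
For illustration only (the actual conversions come later in the
series), a reader-side walk of the list is expected to follow the
usual RCU list idiom, along the lines of:

	struct sched_domain_shared *sds;

	rcu_read_lock();
	list_for_each_entry_rcu(sds, &nohz_shared_list, nohz_list_node) {
		/* e.g. consult sds->nr_idle_cpus or sds->idle_cpus_mask */
	}
	rcu_read_unlock();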

Reviewed-by: Gautham R. Shenoy <gautham.shenoy@....com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 include/linux/sched/topology.h | 12 ++++---
 kernel/sched/sched.h           |  1 +
 kernel/sched/topology.c        | 62 +++++++++++++++++++++++++++++++---
 3 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6db3448e2f00..8400961c1c61 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -64,13 +64,15 @@ extern int sched_domain_level_max;
 struct sched_group;
 
 struct sched_domain_shared {
-	atomic_t	ref;
+	atomic_t		ref;
 #ifdef CONFIG_NO_HZ_COMMON
-	atomic_t	nr_idle_cpus;
-	struct cpumask	*idle_cpus_mask;
+	atomic_t		nr_idle_cpus;
+	struct cpumask		*idle_cpus_mask;
+	struct list_head	nohz_list_node;
+	struct rcu_head		rcu;
 #endif
-	int		has_idle_cores;
-	int		nr_idle_scan;
+	int			has_idle_cores;
+	int			nr_idle_scan;
 };
 
 struct sched_domain {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 35ffb3926334..9cffcfbef1ae 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3098,6 +3098,7 @@ extern void cfs_bandwidth_usage_dec(void);
 #define nohz_flags(cpu)		(&cpu_rq(cpu)->nohz_flags)
 
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_nohz);
+extern struct list_head nohz_shared_list;
 
 extern void nohz_balance_exit_idle(struct rq *rq);
 #else /* !CONFIG_NO_HZ_COMMON: */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c2832445c578..86e33ed07254 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -468,8 +468,12 @@ struct s_data {
 
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_nohz);
 
-static int __sds_nohz_idle_alloc(struct sched_domain_shared *sds, int node)
+static DEFINE_RAW_SPINLOCK(nohz_shared_list_lock);
+LIST_HEAD(nohz_shared_list);
+
+static int __sds_nohz_idle_alloc_init(struct sched_domain_shared *sds, int node)
 {
+	sds->nohz_list_node = (struct list_head)LIST_HEAD_INIT(sds->nohz_list_node);
 	sds->idle_cpus_mask = kzalloc_node(cpumask_size(), GFP_KERNEL, node);
 
 	if (!sds->idle_cpus_mask)
@@ -483,6 +487,7 @@ static void __sds_nohz_idle_free(struct sched_domain_shared *sds)
 	if (!sds)
 		return;
 
+	WARN_ON_ONCE(!list_empty(&sds->nohz_list_node));
 	kfree(sds->idle_cpus_mask);
 }
 
@@ -509,7 +514,7 @@ static int __fallback_sds_alloc(struct s_data *d, unsigned long *visited_nodes)
 
 		d->fallback_nohz_sds[j] = sds;
 
-		if (__sds_nohz_idle_alloc(sds, j))
+		if (__sds_nohz_idle_alloc_init(sds, j))
 			return -ENOMEM;
 	}
 
@@ -560,6 +565,7 @@ static void claim_fallback_sds(struct s_data *d)
 static void update_nohz_domain(int cpu)
 {
 	struct sched_domain *sd = highest_flag_domain(cpu, SD_SHARE_LLC);
+	struct sched_domain_shared *sds = NULL;
 
 	/*
 	 * If sd_llc doesn't exist, use the lowest sd for nohz idle
@@ -570,13 +576,52 @@ static void update_nohz_domain(int cpu)
 	if (!sd)
 		sd = rcu_dereference(cpu_rq(cpu)->sd);
 
-	WARN_ON_ONCE(sd && !sd->shared);
+	if (sd)
+		sds = sd->shared;
+
+	if (sds && list_empty(&sds->nohz_list_node)) {
+		/*
+		 * IRQs should be disabled by the caller since they
+		 * hold the rq_lock.
+		 */
+		lockdep_assert_irqs_disabled();
+
+		guard(raw_spinlock)(&nohz_shared_list_lock);
+		list_add_rcu(&sds->nohz_list_node, &nohz_shared_list);
+	}
+
+	WARN_ON_ONCE(sd && !sds);
 	rcu_assign_pointer(per_cpu(sd_nohz, cpu), sd);
 }
 
+static void destroy_sched_domain_shared_rcu(struct rcu_head *rcu)
+{
+	struct sched_domain_shared *sds = container_of(rcu, struct sched_domain_shared, rcu);
+
+	kfree(sds->idle_cpus_mask);
+	kfree(sds);
+}
+
+/*
+ * If sd->shared is on the rcu protected nohz_shared_list,
+ * remove it from the list and wait one grace period before
+ * freeing.
+ */
+static int sds_delayed_free(struct sched_domain_shared *sds)
+{
+	if (list_empty(&sds->nohz_list_node))
+		return 0;
+
+	scoped_guard(raw_spinlock_irqsave, &nohz_shared_list_lock)
+		list_del_rcu(&sds->nohz_list_node);
+
+	call_rcu(&sds->rcu, destroy_sched_domain_shared_rcu);
+	return 1;
+}
+
 #else /* !CONFIG_NO_HZ_COMMON */
 
-static int __sds_nohz_idle_alloc(struct sched_domain_shared *sds, int node)
+static int __sds_nohz_idle_alloc_init(struct sched_domain_shared *sds, int node)
 {
 	return 0;
 }
@@ -592,6 +637,7 @@ static inline void __fallback_sds_free(struct s_data *d) { }
 static inline void assign_fallback_sds(struct s_data *d, struct sched_domain *sd, int cpu) { }
 static inline void claim_fallback_sds(struct s_data *d) { }
 static inline void update_nohz_domain(int cpu) { }
+static inline int sds_delayed_free(struct sched_domain_shared *sds) { return 0; }
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
@@ -771,9 +817,15 @@ static void destroy_sched_domain(struct sched_domain *sd)
 	free_sched_groups(sd->groups, 1);
 
 	if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+		if (sds_delayed_free(sd->shared)) {
+			sd->shared = NULL;
+			goto out;
+		}
+
 		__sds_nohz_idle_free(sd->shared);
 		kfree(sd->shared);
 	}
+out:
 	kfree(sd);
 }
 
@@ -2557,7 +2609,7 @@ static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
 		bitmap_set(visited_nodes, cpu_to_node(j), 1);
 		*per_cpu_ptr(d->sds, j) = sds;
 
-		if (__sds_nohz_idle_alloc(sds, cpu_to_node(j)))
+		if (__sds_nohz_idle_alloc_init(sds, cpu_to_node(j)))
 			return -ENOMEM;
 	}
 
-- 
2.34.1

