Message-ID: <20250904041516.3046-12-kprateek.nayak@amd.com>
Date: Thu, 4 Sep 2025 04:15:07 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
<tglx@...utronix.de>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, K Prateek Nayak
<kprateek.nayak@....com>, "Gautham R. Shenoy" <gautham.shenoy@....com>,
Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 11/19] sched/topology: Introduce "nohz_shared_list" to keep track of sd->shared
Chain together all the "sd_nohz->shared" objects in an RCU-protected
"nohz_shared_list" to get a full view of the system for nohz idle
balancing. Since __sds_nohz_idle_alloc() is now also responsible for
initializing the list_head, rename it to __sds_nohz_idle_alloc_init()
to fit the bill.
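As an aside on why the init matters: an initialized but unlinked
list_head points at itself, so list_empty() later doubles as a cheap
"already on nohz_shared_list?" test without a separate flag. Below is
a minimal user-space sketch of that property (illustration only, with
a stripped-down re-implementation of the kernel's list primitives):

  #include <assert.h>
  #include <stdio.h>

  /* Stripped-down versions of the kernel's list primitives. */
  struct list_head { struct list_head *next, *prev; };

  #define LIST_HEAD_INIT(name) { &(name), &(name) }

  static int list_empty(const struct list_head *head)
  {
  	return head->next == head;
  }

  static void list_add(struct list_head *new, struct list_head *head)
  {
  	new->next = head->next;
  	new->prev = head;
  	head->next->prev = new;
  	head->next = new;
  }

  int main(void)
  {
  	struct list_head list = LIST_HEAD_INIT(list);
  	struct list_head node = LIST_HEAD_INIT(node);

  	/* Initialized but unlinked: the node points at itself. */
  	assert(list_empty(&node));

  	list_add(&node, &list);

  	/* Once linked, list_empty(&node) is false: the cheap
  	 * "already enqueued?" check update_nohz_domain() relies on. */
  	assert(!list_empty(&node));
  	printf("list_empty() doubles as an on-list check\n");
  	return 0;
  }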
List modifications are protected by the "nohz_shared_list_lock", and
freeing of an sd->shared object that is on the list is delayed by one
RCU grace period to ensure that all untracked references to the shared
object taken under RCU have been dropped.
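To spell out the reasoning: list_del_rcu() only unlinks the node, and
a reader that started walking the list before the unlink may still
hold a pointer to it, so freeing immediately would be a use-after-free.
call_rcu() defers the kfree() past that window. A kernel-style sketch
of the two orderings (the "BROKEN" variant is illustrative; the
"CORRECT" one mirrors sds_delayed_free() below):

  /* BROKEN: a reader inside rcu_read_lock() that already found
   * sds can still dereference it after list_del_rcu() returns. */
  list_del_rcu(&sds->nohz_list_node);
  kfree(sds);				/* use-after-free window */

  /* CORRECT (what this patch does): defer the free by one grace
   * period so every pre-existing reader has left its critical
   * section before the memory is reused. */
  list_del_rcu(&sds->nohz_list_node);
  call_rcu(&sds->rcu, destroy_sched_domain_shared_rcu);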
Users of the "nohz_shared_list" will be added in subsequent patches,
where all the current users of "nohz.idle_cpus" will be converted to
use the distributed tracking.
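As a preview, a hypothetical consumer (not part of this patch) would
walk the list locklessly under rcu_read_lock():

  /* Hypothetical: scan every tracked sd->shared for idle CPUs. The
   * list may be mutated concurrently; RCU guarantees a safe walk
   * without taking nohz_shared_list_lock. */
  static void nohz_scan_shared(void)
  {
  	struct sched_domain_shared *sds;

  	rcu_read_lock();
  	list_for_each_entry_rcu(sds, &nohz_shared_list, nohz_list_node) {
  		if (atomic_read(&sds->nr_idle_cpus)) {
  			/* ... e.g. kick balancing on sds->idle_cpus_mask ... */
  		}
  	}
  	rcu_read_unlock();
  }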
Reviewed-by: Gautham R. Shenoy <gautham.shenoy@....com>
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
include/linux/sched/topology.h | 12 ++++---
kernel/sched/sched.h | 1 +
kernel/sched/topology.c | 62 +++++++++++++++++++++++++++++++---
3 files changed, 65 insertions(+), 10 deletions(-)
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 6db3448e2f00..8400961c1c61 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -64,13 +64,15 @@ extern int sched_domain_level_max;
struct sched_group;
struct sched_domain_shared {
- atomic_t ref;
+ atomic_t ref;
#ifdef CONFIG_NO_HZ_COMMON
- atomic_t nr_idle_cpus;
- struct cpumask *idle_cpus_mask;
+ atomic_t nr_idle_cpus;
+ struct cpumask *idle_cpus_mask;
+ struct list_head nohz_list_node;
+ struct rcu_head rcu;
#endif
- int has_idle_cores;
- int nr_idle_scan;
+ int has_idle_cores;
+ int nr_idle_scan;
};
struct sched_domain {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 35ffb3926334..9cffcfbef1ae 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3098,6 +3098,7 @@ extern void cfs_bandwidth_usage_dec(void);
#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_nohz);
+extern struct list_head nohz_shared_list;
extern void nohz_balance_exit_idle(struct rq *rq);
#else /* !CONFIG_NO_HZ_COMMON: */
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index c2832445c578..86e33ed07254 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -468,8 +468,12 @@ struct s_data {
DEFINE_PER_CPU(struct sched_domain __rcu *, sd_nohz);
-static int __sds_nohz_idle_alloc(struct sched_domain_shared *sds, int node)
+static DEFINE_RAW_SPINLOCK(nohz_shared_list_lock);
+LIST_HEAD(nohz_shared_list);
+
+static int __sds_nohz_idle_alloc_init(struct sched_domain_shared *sds, int node)
{
+ INIT_LIST_HEAD(&sds->nohz_list_node);
sds->idle_cpus_mask = kzalloc_node(cpumask_size(), GFP_KERNEL, node);
if (!sds->idle_cpus_mask)
@@ -483,6 +487,7 @@ static void __sds_nohz_idle_free(struct sched_domain_shared *sds)
if (!sds)
return;
+ WARN_ON_ONCE(!list_empty(&sds->nohz_list_node));
kfree(sds->idle_cpus_mask);
}
@@ -509,7 +514,7 @@ static int __fallback_sds_alloc(struct s_data *d, unsigned long *visited_nodes)
d->fallback_nohz_sds[j] = sds;
- if (__sds_nohz_idle_alloc(sds, j))
+ if (__sds_nohz_idle_alloc_init(sds, j))
return -ENOMEM;
}
@@ -560,6 +565,7 @@ static void claim_fallback_sds(struct s_data *d)
static void update_nohz_domain(int cpu)
{
struct sched_domain *sd = highest_flag_domain(cpu, SD_SHARE_LLC);
+ struct sched_domain_shared *sds = NULL;
/*
* If sd_llc doesn't exist, use the lowest sd for nohz idle
@@ -570,13 +576,52 @@ static void update_nohz_domain(int cpu)
if (!sd)
sd = rcu_dereference(cpu_rq(cpu)->sd);
- WARN_ON_ONCE(sd && !sd->shared);
+ if (sd)
+ sds = sd->shared;
+
+ if (sds && list_empty(&sds->nohz_list_node)) {
+ /*
+ * IRQs should be disabled by the caller since it
+ * holds the rq_lock.
+ */
+ lockdep_assert_irqs_disabled();
+
+ guard(raw_spinlock)(&nohz_shared_list_lock);
+ list_add_rcu(&sds->nohz_list_node, &nohz_shared_list);
+ }
+
+ WARN_ON_ONCE(sd && !sds);
rcu_assign_pointer(per_cpu(sd_nohz, cpu), sd);
}
+static void destroy_sched_domain_shared_rcu(struct rcu_head *rcu)
+{
+ struct sched_domain_shared *sds = container_of(rcu, struct sched_domain_shared, rcu);
+
+ kfree(sds->idle_cpus_mask);
+ kfree(sds);
+}
+
+/*
+ * If sd->shared is on the RCU-protected nohz_shared_list,
+ * remove it from the list and wait one grace period before
+ * freeing it.
+ */
+static int sds_delayed_free(struct sched_domain_shared *sds)
+{
+ if (list_empty(&sds->nohz_list_node))
+ return 0;
+
+ scoped_guard(raw_spinlock_irqsave, &nohz_shared_list_lock)
+ list_del_rcu(&sds->nohz_list_node);
+
+ call_rcu(&sds->rcu, destroy_sched_domain_shared_rcu);
+ return 1;
+}
+
#else /* !CONFIG_NO_HZ_COMMON */
-static int __sds_nohz_idle_alloc(struct sched_domain_shared *sds, int node)
+static int __sds_nohz_idle_alloc_init(struct sched_domain_shared *sds, int node)
{
return 0;
}
@@ -592,6 +637,7 @@ static inline void __fallback_sds_free(struct s_data *d) { }
static inline void assign_fallback_sds(struct s_data *d, struct sched_domain *sd, int cpu) { }
static inline void claim_fallback_sds(struct s_data *d) { }
static inline void update_nohz_domain(int cpu) { }
+static inline int sds_delayed_free(struct sched_domain_shared *sds) { return 0; }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -771,9 +817,15 @@ static void destroy_sched_domain(struct sched_domain *sd)
free_sched_groups(sd->groups, 1);
if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) {
+ if (sds_delayed_free(sd->shared)) {
+ sd->shared = NULL;
+ goto out;
+ }
+
__sds_nohz_idle_free(sd->shared);
kfree(sd->shared);
}
+out:
kfree(sd);
}
@@ -2557,7 +2609,7 @@ static int __sds_alloc(struct s_data *d, const struct cpumask *cpu_map)
bitmap_set(visited_nodes, cpu_to_node(j), 1);
*per_cpu_ptr(d->sds, j) = sds;
- if (__sds_nohz_idle_alloc(sds, cpu_to_node(j)))
+ if (__sds_nohz_idle_alloc_init(sds, cpu_to_node(j)))
return -ENOMEM;
}
--
2.34.1