Message-ID: <20250904041516.3046-19-kprateek.nayak@amd.com>
Date: Thu, 4 Sep 2025 04:15:14 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
<vincent.guittot@...aro.org>, Anna-Maria Behnsen <anna-maria@...utronix.de>,
Frederic Weisbecker <frederic@...nel.org>, Thomas Gleixner
<tglx@...utronix.de>, <linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, K Prateek Nayak
<kprateek.nayak@....com>, "Gautham R. Shenoy" <gautham.shenoy@....com>,
Swapnil Sapkal <swapnil.sapkal@....com>
Subject: [RFC PATCH 18/19] sched/fair: Optimize global "nohz.nr_cpus" tracking
Optimize "nohz.nr_cpus" by tracking number of "sd_nohz->shared" with
non-zero "nr_idle_cpus" count via "nohz.nr_doms" and only updating at
the boundary of "sd_nohz->shared->nr_idle_cpus" going from 0 -> 1 and
back from 1 -> 0.
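
The boundary-update pattern, in a minimal sketch (illustrative only:
the counter name and helper functions below are simplified stand-ins
for the hunks in this patch):

  static atomic_t nr_doms;	/* domains with >= 1 nohz-idle CPU */

  static void domain_cpu_goes_idle(atomic_t *nr_idle_cpus)
  {
  	/* 0 -> 1: the domain starts contributing to nohz balancing */
  	if (!atomic_fetch_inc(nr_idle_cpus))
  		atomic_inc(&nr_doms);
  }

  static void domain_cpu_goes_busy(atomic_t *nr_idle_cpus)
  {
  	/* 1 -> 0: the domain no longer has any nohz-idle CPU */
  	if (!atomic_dec_return(nr_idle_cpus))
  		atomic_dec(&nr_doms);
  }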

This also introduces a chance of double accounting when a nohz idle
entry, or the tick, races with hotplug or cpuset, as described in
__nohz_exit_idle_tracking(). __nohz_exit_idle_tracking(), called when
the sched_domain_shared nodes tracking idle CPUs are freed, corrects
any such double accounting, which could otherwise trigger nohz idle
balancing even when all CPUs have the tick enabled.
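
At domain teardown, the correction reduces to the following sketch
(mirroring the __nohz_exit_idle_tracking() hunk and its
sds_delayed_free() call site below):

  /* Called while freeing a sched_domain_shared node */
  if (atomic_read(&sds->nr_idle_cpus)) {
  	/* Stop readers from walking the stale idle mask */
  	atomic_set(&sds->nr_idle_cpus, 0);
  	/* Drop the node's stale contribution to the global count */
  	atomic_dec(&nohz.nr_doms);
  }
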
Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 1 +
kernel/sched/topology.c | 1 +
3 files changed, 58 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5b693bd0fab4..d65acf7ea12e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7169,7 +7169,7 @@ static DEFINE_PER_CPU(cpumask_var_t, should_we_balance_tmpmask);
#ifdef CONFIG_NO_HZ_COMMON
static struct {
- atomic_t nr_cpus;
+ atomic_t nr_doms;
int has_blocked; /* Idle CPUS has blocked load */
int needs_update; /* Newly idle CPUs need their next_balance collated */
unsigned long next_balance; /* in jiffy units */
@@ -12408,7 +12408,7 @@ static void nohz_balancer_kick(struct rq *rq)
* None are in tickless mode and hence no need for NOHZ idle load
* balancing:
*/
- if (likely(!atomic_read(&nohz.nr_cpus)))
+ if (likely(!atomic_read(&nohz.nr_doms)))
return;
if (READ_ONCE(nohz.has_blocked) &&
@@ -12505,7 +12505,8 @@ static void set_cpu_sd_state_busy(int cpu)
return;
cpumask_clear_cpu(cpu, sd->shared->idle_cpus_mask);
- atomic_dec(&sd->shared->nr_idle_cpus);
+ if (!atomic_dec_return(&sd->shared->nr_idle_cpus))
+ atomic_dec(&nohz.nr_doms);
}
void nohz_balance_exit_idle(struct rq *rq)
@@ -12516,7 +12517,6 @@ void nohz_balance_exit_idle(struct rq *rq)
return;
WRITE_ONCE(rq->nohz_tick_stopped, 0);
- atomic_dec(&nohz.nr_cpus);
set_cpu_sd_state_busy(rq->cpu);
}
@@ -12535,7 +12535,58 @@ static void set_cpu_sd_state_idle(int cpu)
return;
cpumask_set_cpu(cpu, sd->shared->idle_cpus_mask);
- atomic_inc(&sd->shared->nr_idle_cpus);
+ if (!atomic_fetch_inc(&sd->shared->nr_idle_cpus))
+ atomic_inc(&nohz.nr_doms);
+}
+
+/*
+ * Correct nohz.nr_doms if sd_nohz->shared was found to have non-zero
+ * nr_idle_cpus when it is freed. No local references to sds remain at
+ * this point and the only remaining reference, via "nohz_shared_list",
+ * will be dropped after the grace period.
+ */
+void __nohz_exit_idle_tracking(struct sched_domain_shared *sds)
+{
+
+ /*
+ * It is possible for an idle entry to race with a sched domain rebuild like:
+ *
+ * CPU0 (hotplug) CPU1 (nohz idle)
+ *
+ * rq->offline(CPU1)
+ * set_cpu_sd_state_busy()
+ * rq->sd = sdd; # Processes IPI, re-enters nohz idle
+ * ... # For old sd_nohz
+ * ... atomic_fetch_inc(&sd_nohz->shared->nr_idle_cpus);
+ * ... atomic_inc(&nohz.nr_doms); # XXX: Accounted once
+ * update_top_cache_domains()
+ * rq->online(CPU1)
+ * # rq->nohz_tick_stopped is true
+ * set_cpu_sd_state_idle()
+ * # For new sd_nohz
+ * atomic_fetch_inc(&sd_nohz->shared->nr_idle_cpus);
+ * atomic_inc(&nohz.nr_doms); # XXX: Accounted twice
+ * ...
+ *
+ * "nohz.nr_doms" is used as an entry criteria in nohz_balancer_kick()
+ * and this double accounting can lead to wasted idle balancing
+ * triggers. Use this path to correct the accounting:
+ *
+ * # In sds_delayed_free()
+ * __nohz_exit_idle_tracking(sds)
+ * # sd->shared->nr_idle_cpus is != 0
+ * atomic_dec(&nohz.nr_doms); # XXX: Fixes nohz.nr_doms
+ */
+ if (atomic_read(&sds->nr_idle_cpus)) {
+ /*
+ * Reset the "nr_idle_cpus" indicator to prevent
+ * existing readers from traversing the idle mask
+ * to reduce chances of traversing the same CPU
+ * twice.
+ */
+ atomic_set(&sds->nr_idle_cpus, 0);
+ atomic_dec(&nohz.nr_doms);
+ }
}
static void cpu_sd_exit_nohz_balance(struct rq *rq)
@@ -12587,8 +12638,6 @@ void nohz_balance_enter_idle(int cpu)
WRITE_ONCE(rq->nohz_tick_stopped, 1);
- atomic_inc(&nohz.nr_cpus);
-
set_cpu_sd_state_idle(cpu);
/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9cffcfbef1ae..fcf4503caada 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3100,6 +3100,7 @@ extern void cfs_bandwidth_usage_dec(void);
DECLARE_PER_CPU(struct sched_domain __rcu *, sd_nohz);
extern struct list_head nohz_shared_list;
+extern void __nohz_exit_idle_tracking(struct sched_domain_shared *sds);
extern void nohz_balance_exit_idle(struct rq *rq);
#else /* !CONFIG_NO_HZ_COMMON: */
static inline void nohz_balance_exit_idle(struct rq *rq) { }
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 86e33ed07254..ee9eed8470ba 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -615,6 +615,7 @@ static int sds_delayed_free(struct sched_domain_shared *sds)
scoped_guard(raw_spinlock_irqsave, &nohz_shared_list_lock)
list_del_rcu(&sds->nohz_list_node);
+ __nohz_exit_idle_tracking(sds);
call_rcu(&sds->rcu, destroy_sched_domain_shared_rcu);
return 1;
}
--
2.34.1