Message-ID: <20250316102916.10614-3-kprateek.nayak@amd.com>
Date: Sun, 16 Mar 2025 10:29:12 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Chen Yu <yu.c.chen@...el.com>,
	<linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
	<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, David Vernet
	<void@...ifault.com>, "Gautham R. Shenoy" <gautham.shenoy@....com>, "Swapnil
 Sapkal" <swapnil.sapkal@....com>, Shrikanth Hegde <sshegde@...ux.ibm.com>, "K
 Prateek Nayak" <kprateek.nayak@....com>
Subject: [RFC PATCH 11/08] sched/fair: Move from "last_update" to stats versioning

The combination of "stats_lock" and the jiffy-based "last_update" is
not scalable for newidle balance. Move to a versioning-based scheme
instead, where the version lets readers detect inconsistent data
without taking a lock, and lets writers use the same field both for
locking and for indicating stats freshness.

Additional semantics have been added for writers to update stale
stats once the time elapsed since the last update crosses the 50us
threshold.

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
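For reviewers: below is a minimal user-space sketch of the versioning
protocol described in the changelog, using C11 atomics. It is
illustrative only and intentionally simplified (no interrupt handling
and no retry path); the names demo_prop, demo_stats and now_ns() are
made up for the example and do not exist in the kernel.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define STALE_NS	(50 * 1000LL)	/* 50us freshness threshold */
#define LOCKED		INT64_MIN	/* a writer owns the stats */

struct demo_stats {
	long load;
	long util;
};

struct demo_prop {
	_Atomic int64_t version;	/* 0: stale, <0: locked, >0: timestamp */
	struct demo_stats stats;
};

static int64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

/* Writer: publish new stats if the cached copy is stale or too old. */
static void demo_cache(struct demo_prop *p, const struct demo_stats *s)
{
	int64_t v = atomic_load_explicit(&p->version, memory_order_acquire);
	int64_t t = now_ns();

	if (v < 0)				/* another writer owns it */
		return;
	if (v > 0 && t - v <= STALE_NS)		/* still fresh */
		return;

	/* Take ownership; a failed cmpxchg means the race was lost. */
	if (!atomic_compare_exchange_strong_explicit(&p->version, &v, LOCKED,
						     memory_order_acquire,
						     memory_order_relaxed))
		return;

	p->stats = *s;
	/* Release pairs with the reader's acquire load of the version. */
	atomic_store_explicit(&p->version, t, memory_order_release);
}

/* Reader: copy the stats and confirm the version did not move. */
static bool demo_retrieve(struct demo_prop *p, struct demo_stats *out)
{
	int64_t v = atomic_load_explicit(&p->version, memory_order_acquire);

	if (v <= 0)				/* stale or being updated */
		return false;

	*out = p->stats;
	/* Analogue of smp_rmb(): order the copy before the re-check. */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&p->version, memory_order_relaxed) == v;
}

demo_cache() roughly corresponds to cache_sd_stats() below and
demo_retrieve() to retrieve_cached_stats(); the kernel side additionally
disables interrupts and retries the cmpxchg once when it observes the
version going back to 0.
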
 kernel/sched/fair.c     | 83 +++++++++++++++++++++++++++--------------
 kernel/sched/sched.h    | 22 ++++++++++-
 kernel/sched/topology.c |  3 +-
 3 files changed, 77 insertions(+), 31 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d09f900a3107..6c486e194a9d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10275,11 +10275,13 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 	return check_cpu_capacity(rq, sd);
 }
 
-static inline void cache_sd_stats(struct sched_domain *sd, struct sg_lb_stats *sd_stats)
+static inline void cache_sd_stats(struct lb_env *env, struct sg_lb_stats *sd_stats)
 {
-	struct sched_domain_shared *sd_share = sd->shared;
-	unsigned long current_jiffy = jiffies;
+	struct sched_domain_shared *sd_share = env->sd->shared;
 	struct sg_lb_stats_prop *lb_prop;
+	int cpu, retry_limit = 3;
+	u64 time, lock;
+	long version;
 
 	if (!sd_share)
 		return;
@@ -10288,23 +10290,52 @@ static inline void cache_sd_stats(struct sched_domain *sd, struct sg_lb_stats *s
 	if (!lb_prop)
 		return;
 
-	/* Concurrent load balancing instance already updated the stats */
-	if (READ_ONCE(lb_prop->last_update) == current_jiffy)
+	version = atomic_long_read_acquire(&lb_prop->version);
+	if (version < 0) /* Raced with a concurrent update. */
 		return;
 
-	scoped_guard(raw_spinlock_irqsave_try, &lb_prop->stats_lock) {
-		if (READ_ONCE(lb_prop->last_update) == current_jiffy)
-			break;
+	guard(irqsave)(); /* Minimize interruptions. */
+
+	cpu = smp_processor_id();
+	time = sched_clock_cpu(cpu);
 
-		lb_prop->sg_stats = *sd_stats;
+	/* Version is still fresh, no need to be rude yet. */
+	if (version > 0 && (s64)(time - version) <= 50 * NSEC_PER_USEC)
+		return;
 
+retry:
+	/*
+	 * Try to grab the stats for update. If the cmpxchg fails,
+	 * a concurrent writer succeeded in grabbing the stats before
+	 * this load balancing instance did. The acquire ordering
+	 * also pairs against readers checking the version after
+	 * reading the stats to ensure consistent state.
+	 */
+	lock = atomic_long_cmpxchg_acquire(&lb_prop->version, version, LLONG_MIN);
+
+	/* Someone else grabbed the version. */
+	if (lock != version) {
 		/*
-		 *  Pairs against readers checking the last_update
-		 *  before reading the cached stats.
+		 * The version is up for grabs! Try again with version = 0.
+		 * If this CPU grabs it next time around, lock == version == 0
+		 * and this branch is skipped. If it cannot grab the version,
+		 * lock != 0 and we return from here, thus ensuring at most
+		 * a single retry.
 		 */
-		smp_wmb();
-		WRITE_ONCE(lb_prop->last_update, current_jiffy);
+		if (!lock) {
+			version = 0;
+			goto retry;
+		}
+		return;
 	}
+
+	lb_prop->sg_stats = *sd_stats;
+
+	/*
+	 * Pairs against readers checking the version
+	 * before reading the stats.
+	 */
+	atomic_long_set_release(&lb_prop->version, time);
 }
 
 static inline int can_retrieve_stats(struct sched_domain *sd, enum cpu_idle_type idle)
@@ -10346,8 +10377,8 @@ static inline int can_retrieve_stats(struct sched_domain *sd, enum cpu_idle_type
 static inline int retrieve_cached_stats(struct sched_group *group, struct sg_lb_stats *sg_stats)
 {
 	struct sched_domain_shared *sg_share = group->shared;
-	unsigned long current_jiffy = jiffies;
 	struct sg_lb_stats_prop *lb_prop;
+	long version;
 
 	if (!sg_share)
 		return 0;
@@ -10356,24 +10387,22 @@ static inline int retrieve_cached_stats(struct sched_group *group, struct sg_lb_
 	if (!lb_prop)
 		return 0;
 
-	/* Stale stats */
-	if (READ_ONCE(lb_prop->last_update) != current_jiffy)
-		return 0;
-
 	/*
-	 * Pairs against the update to sgs_prop->last_update to
-	 * prevent readers from seeing an inconsistent value of
-	 * the propagated stats from a concurrent update.
+	 * Pairs with writer atomically updating version after
+	 * writing the stats.
 	 */
-	smp_rmb();
+	version = atomic_long_read_acquire(&lb_prop->version);
+	if (version <= 0) /* Stats are stale or being updated. */
+		return 0;
+
 	*sg_stats = lb_prop->sg_stats;
 
 	/*
-	 * If stats were read in the same interval, it cannot
-	 * read an inconsistent state since stats are only
-	 * updated once per jiffy.
+	 * Pairs with writer atomically invalidating a version
+	 * before updating the stats.
 	 */
-	return time_before_eq(jiffies, current_jiffy);
+	smp_rmb();
+	return atomic_long_read(&lb_prop->version) == version;
 }
 
 /**
@@ -11155,7 +11184,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 		sg_overutilized = sd_stats.overutilized;
 		sum_util = sd_stats.group_util;
 
-		cache_sd_stats(env->sd, &sd_stats);
+		cache_sd_stats(env, &sd_stats);
 	}
 
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 391c4180eeb3..64f7e013fd59 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2176,8 +2176,26 @@ struct sg_lb_stats {
  * sched_domain load balancing statistics up the hierarchy.
  */
 struct sg_lb_stats_prop {
-	raw_spinlock_t          stats_lock;	/* Lock for updating the cached stats */
-	unsigned long		last_update;	/* Time when stats was last updated (jiffies) */
+	/*
+	 * Stats version has the following semantics:
+	 *
+	 * When 0, the stats are considered stale. A writer can lock
+	 * the stats by atomically changing the version to LLONG_MIN.
+	 * Once the stats are written, the version is atomically
+	 * updated to the value returned by sched_clock_cpu().
+	 *
+	 * If the reader finds a positive value for version, the stats
+	 * are considered to be fresh and the reader will copy them for
+	 * load balancing. The version seen before and after the read
+	 * is compared to ensure the stats copied are consistent.
+	 *
+	 * Since invalidations under uncertain circumstances can take a
+	 * long time, a rude writer can always attempt to take over the
+	 * stats by atomically updating the version to LLONG_MIN if it
+	 * finds a large difference between a valid version and the
+	 * value returned by sched_clock_cpu().
+	 */
+	atomic_long_t		version;
 	struct sg_lb_stats	sg_stats;	/* Cached sched_group stats */
 };
 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index aeb55f66e8d6..2e72ef8d8d8e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2304,8 +2304,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 			if (!sg_stats)
 				return -ENOMEM;
 
-			raw_spin_lock_init(&sg_stats->stats_lock);
-			sg_stats->last_update = 0;
+			atomic_long_set(&sg_stats->version, 0);
 			sds->private = (void *)sg_stats;
 
 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
-- 
2.43.0

