Message-ID: <20250313093746.6760-9-kprateek.nayak@amd.com>
Date: Thu, 13 Mar 2025 09:37:46 +0000
From: K Prateek Nayak <kprateek.nayak@....com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>,
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot
	<vincent.guittot@...aro.org>, Chen Yu <yu.c.chen@...el.com>,
	<linux-kernel@...r.kernel.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt
	<rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman
	<mgorman@...e.de>, Valentin Schneider <vschneid@...hat.com>, David Vernet
	<void@...ifault.com>, "Gautham R. Shenoy" <gautham.shenoy@....com>, "Swapnil
 Sapkal" <swapnil.sapkal@....com>, Shrikanth Hegde <sshegde@...ux.ibm.com>, "K
 Prateek Nayak" <kprateek.nayak@....com>
Subject: [RFC PATCH 8/8] sched/fair: Update stats for sched_domain using the sched_group stats

Aggregate the individual sched_group stats to compute the stats for the
entire sched_domain. Cache them in sd->shared, which is also what
sg->shared points to for the sched_group that represents sd in its
parent domain. This ensures that the stats are readily available to the
higher domains if load balancing continues.
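
For context, the read side could look like the following minimal sketch
(illustrative only, not part of this patch: the helper name
sg_stats_from_cache() is made up; the actual consumer in this series is
retrieve_cached_stats()). A parent domain reaches the child domain's
cached stats through sg->shared and only trusts them if they were
published in the current jiffy; the smp_rmb() pairs with the smp_wmb()
in cache_sd_stats() below:

  static inline bool sg_stats_from_cache(struct sched_group *sg,
                                         struct sg_lb_stats *sgs)
  {
          struct sg_lb_stats_prop *lb_prop;

          if (!sg->shared)
                  return false;

          lb_prop = (struct sg_lb_stats_prop *)sg->shared->private;
          if (!lb_prop)
                  return false;

          /* Stats are only trusted if they were published this jiffy */
          if (READ_ONCE(lb_prop->last_update) != jiffies)
                  return false;

          /* Pairs with the smp_wmb() in cache_sd_stats() */
          smp_rmb();
          *sgs = lb_prop->sg_stats;

          /*
           * Best effort: discard the copy if it may have been
           * overwritten while being made (jiffy-granular check).
           */
          return READ_ONCE(lb_prop->last_update) == jiffies;
  }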

With the new infrastructure in place, the benchmark numbers are as
follows:

  ==================================================================
  Test          : hackbench
  Units         : Normalized time in seconds
  Interpretation: Lower is better
  Statistic     : AMean
  ==================================================================
  Case:           tip[pct imp](CV)      stats_prop[pct imp](CV)
   1-groups     1.00 [ -0.00](10.12)     1.09 [ -9.11](11.93)
   2-groups     1.00 [ -0.00]( 6.92)     1.00 [ -0.22]( 4.57)
   4-groups     1.00 [ -0.00]( 3.14)     0.99 [  0.83]( 1.77)
   8-groups     1.00 [ -0.00]( 1.35)     1.00 [ -0.31]( 2.24)
  16-groups     1.00 [ -0.00]( 1.32)     0.99 [  0.84]( 0.67)

  ==================================================================
  Test          : tbench
  Units         : Normalized throughput
  Interpretation: Higher is better
  Statistic     : AMean
  ==================================================================
  Clients:    tip[pct imp](CV)      stats_prop[pct imp](CV)
      1     1.00 [  0.00]( 0.43)     0.99 [ -0.87]( 1.34)
      2     1.00 [  0.00]( 0.58)     1.02 [  2.14]( 0.29)
      4     1.00 [  0.00]( 0.54)     1.01 [  1.24]( 0.82)
      8     1.00 [  0.00]( 0.49)     1.01 [  0.62]( 0.97)
     16     1.00 [  0.00]( 1.06)     1.01 [  0.94]( 0.70)
     32     1.00 [  0.00]( 1.27)     0.99 [ -1.24]( 1.38)
     64     1.00 [  0.00]( 1.54)     1.00 [ -0.43]( 0.36)
    128     1.00 [  0.00]( 0.38)     1.00 [ -0.01]( 1.22)
    256     1.00 [  0.00]( 1.85)     1.02 [  1.58]( 0.90)
    512     1.00 [  0.00]( 0.31)     1.01 [  0.76]( 1.19)
   1024     1.00 [  0.00]( 0.19)     1.00 [  0.44]( 0.35)

  ==================================================================
  Test          : stream-10
  Units         : Normalized Bandwidth, MB/s
  Interpretation: Higher is better
  Statistic     : HMean
  ==================================================================
  Test:       tip[pct imp](CV)      stats_prop[pct imp](CV)
   Copy     1.00 [  0.00](11.31)     1.02 [  1.69]( 6.44)
  Scale     1.00 [  0.00]( 6.62)     1.01 [  0.80]( 5.37)
    Add     1.00 [  0.00]( 7.06)     1.02 [  1.54]( 6.72)
  Triad     1.00 [  0.00]( 8.91)     1.01 [  1.36]( 6.73)

  ==================================================================
  Test          : stream-100
  Units         : Normalized Bandwidth, MB/s
  Interpretation: Higher is better
  Statistic     : HMean
  ==================================================================
  Test:       tip[pct imp](CV)      stats_prop[pct imp](CV)
   Copy     1.00 [  0.00]( 2.01)     0.98 [ -1.55]( 2.15)
  Scale     1.00 [  0.00]( 1.49)     1.00 [  0.23]( 0.58)
    Add     1.00 [  0.00]( 2.67)     1.01 [  0.65]( 1.95)
  Triad     1.00 [  0.00]( 2.19)     1.01 [  0.61]( 1.37)

  ==================================================================
  Test          : netperf
  Units         : Normalized Throughput
  Interpretation: Higher is better
  Statistic     : AMean
  ==================================================================
     Clients:       tip[pct imp](CV)      stats_prop[pct imp](CV)
    1-clients     1.00 [  0.00]( 1.43)     1.00 [  0.17]( 0.32)
    2-clients     1.00 [  0.00]( 1.02)     1.01 [  1.00]( 0.44)
    4-clients     1.00 [  0.00]( 0.83)     1.01 [  0.62]( 0.36)
    8-clients     1.00 [  0.00]( 0.73)     1.00 [ -0.11]( 0.65)
   16-clients     1.00 [  0.00]( 0.97)     1.00 [  0.49]( 0.77)
   32-clients     1.00 [  0.00]( 0.88)     1.00 [  0.30]( 0.94)
   64-clients     1.00 [  0.00]( 1.49)     1.00 [  0.36]( 1.57)
  128-clients     1.00 [  0.00]( 1.05)     1.00 [  0.14]( 1.46)
  256-clients     1.00 [  0.00]( 3.85)     1.00 [ -0.04]( 4.85)
  512-clients     1.00 [  0.00](59.63)     1.00 [ -0.02](62.28)

  ==================================================================
  Test          : schbench
  Units         : Normalized 99th percentile latency in us
  Interpretation: Lower is better
  Statistic     : Median
  ==================================================================
  #workers:     tip[pct imp](CV)      stats_prop[pct imp](CV)
    1         1.00 [ -0.00]( 6.67)     0.76 [ 24.44](35.80)
    2         1.00 [ -0.00](10.18)     0.87 [ 13.04](10.38)
    4         1.00 [ -0.00]( 4.49)     1.04 [ -4.26]( 3.14)
    8         1.00 [ -0.00]( 6.68)     0.98 [  1.89]( 8.07)
   16         1.00 [ -0.00]( 1.87)     1.03 [ -3.28]( 5.21)
   32         1.00 [ -0.00]( 4.01)     0.98 [  2.20]( 1.31)
   64         1.00 [ -0.00]( 3.21)     1.00 [ -0.00]( 3.23)
  128         1.00 [ -0.00](44.13)     1.06 [ -6.43](113.66)
  256         1.00 [ -0.00](14.46)     1.04 [ -3.52]( 8.43)
  512         1.00 [ -0.00]( 1.95)     1.02 [ -1.80]( 1.14)

  ==================================================================
  Test          : new-schbench-requests-per-second
  Units         : Normalized Requests per second
  Interpretation: Higher is better
  Statistic     : Median
  ==================================================================
  #workers:      tip[pct imp](CV)      stats_prop[pct imp](CV)
    1          1.00 [  0.00]( 0.46)     1.00 [  0.00]( 0.55)
    2          1.00 [  0.00]( 0.15)     0.99 [ -0.88]( 0.26)
    4          1.00 [  0.00]( 0.15)     0.99 [ -0.59]( 0.15)
    8          1.00 [  0.00]( 0.15)     0.99 [ -0.88]( 0.26)
   16          1.00 [  0.00]( 0.00)     1.00 [ -0.29]( 0.15)
   32          1.00 [  0.00]( 3.40)     1.07 [  6.59]( 0.16)
   64          1.00 [  0.00]( 7.09)     1.00 [ -0.38]( 0.96)
  128          1.00 [  0.00]( 0.00)     1.00 [  0.00]( 0.20)
  256          1.00 [  0.00]( 1.12)     1.00 [ -0.30]( 1.50)
  512          1.00 [  0.00]( 0.22)     1.05 [  4.86]( 0.71)

  ==================================================================
  Test          : new-schbench-wakeup-latency
  Units         : Normalized 99th percentile latency in us
  Interpretation: Lower is better
  Statistic     : Median
  ==================================================================
  #workers:     tip[pct imp](CV)      stats_prop[pct imp](CV)
    1         1.00 [ -0.00](19.72)     0.85 [ 15.38](16.75)
    2         1.00 [ -0.00](15.96)     1.00 [ -0.00]( 0.00)
    4         1.00 [ -0.00]( 3.87)     1.00 [ -0.00]( 4.08)
    8         1.00 [ -0.00]( 8.15)     1.00 [ -0.00](11.71)
   16         1.00 [ -0.00]( 3.87)     0.92 [  7.69]( 4.19)
   32         1.00 [ -0.00](12.99)     0.73 [ 26.67]( 0.00)
   64         1.00 [ -0.00]( 6.20)     1.12 [-12.50]( 9.94)
  128         1.00 [ -0.00]( 0.96)     0.98 [  1.55]( 0.95)
  256         1.00 [ -0.00]( 2.76)     0.99 [  1.45]( 1.38)
  512         1.00 [ -0.00]( 0.20)     1.20 [-20.42]( 0.00)

  ==================================================================
  Test          : new-schbench-request-latency
  Units         : Normalized 99th percentile latency in us
  Interpretation: Lower is better
  Statistic     : Median
  ==================================================================
  #workers:     tip[pct imp](CV)      stats_prop[pct imp](CV)
    1         1.00 [ -0.00]( 1.07)     1.02 [ -2.08]( 0.13)
    2         1.00 [ -0.00]( 0.14)     1.04 [ -3.97]( 0.13)
    4         1.00 [ -0.00]( 1.39)     1.03 [ -3.15]( 0.13)
    8         1.00 [ -0.00]( 0.36)     1.03 [ -3.16]( 0.00)
   16         1.00 [ -0.00]( 1.18)     1.02 [ -1.59]( 0.75)
   32         1.00 [ -0.00]( 8.42)     0.81 [ 19.08]( 0.25)
   64         1.00 [ -0.00]( 4.85)     1.01 [ -1.10]( 2.58)
  128         1.00 [ -0.00]( 0.28)     1.00 [ -0.21]( 0.38)
  256         1.00 [ -0.00](10.52)     0.95 [  4.74]( 6.94)
  512         1.00 [ -0.00]( 0.69)     1.09 [ -8.99]( 0.27)

  ==================================================================
  Test          : Various longer running benchmarks
  Units         : %diff in throughput reported
  Interpretation: Higher is better
  Statistic     : Median
  ==================================================================
  Benchmarks:                 %diff

  ycsb-cassandra             -0.54%
  ycsb-mongodb                0.09%

  deathstarbench-1x          -0.30%
  deathstarbench-2x           2.38%
  deathstarbench-3x           0.58%
  deathstarbench-6x           0.62%

  hammerdb+mysql 16VU         0.76%
  hammerdb+mysql 64VU         0.74%

* The tail latencies reported by schbench increase, possibly due to the
  synchronization in load balancing across multiple domains; however,
  this remains to be investigated.

Signed-off-by: K Prateek Nayak <kprateek.nayak@....com>
---
 kernel/sched/fair.c | 99 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 92 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3b402f294f0b..212bee3e9f35 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10275,6 +10275,38 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 	return check_cpu_capacity(rq, sd);
 }
 
+static inline void cache_sd_stats(struct sched_domain *sd, struct sg_lb_stats *sd_stats)
+{
+	struct sched_domain_shared *sd_share = sd->shared;
+	unsigned long current_jiffy = jiffies;
+	struct sg_lb_stats_prop *lb_prop;
+
+	if (!sd_share)
+		return;
+
+	lb_prop = (struct sg_lb_stats_prop *)sd_share->private;
+	if (!lb_prop)
+		return;
+
+	/* Concurrent load balancing instance already updated the stats */
+	if (READ_ONCE(lb_prop->last_update) == current_jiffy)
+		return;
+
+	scoped_guard(raw_spinlock_irqsave_try, &lb_prop->stats_lock) {
+		if (READ_ONCE(lb_prop->last_update) == current_jiffy)
+			break;
+
+		lb_prop->sg_stats = *sd_stats;
+
+	/*
+	 * Pairs with readers checking last_update before
+	 * reading the cached stats.
+	 */
+		smp_wmb();
+		WRITE_ONCE(lb_prop->last_update, current_jiffy);
+	}
+}
+
 static inline int can_retrieve_stats(struct sched_domain *sd, enum cpu_idle_type idle)
 {
 	/*
@@ -10344,6 +10376,35 @@ static inline int retrieve_cached_stats(struct sched_group *group, struct sg_lb_
 	return time_before_eq(jiffies, current_jiffy);
 }
 
+/**
+ * aggregate_sd_stats - Compute the sched_domain's stats from its group stats.
+ * @env: The load balancing environment.
+ * @sd_stats: Variable to hold the aggregated statistics for the sched_domain.
+ * @sg_stats: Group statistics that were computed or retrieved.
+ */
+static inline void aggregate_sd_stats(struct lb_env *env,
+				      struct sg_lb_stats *sd_stats,
+				      struct sg_lb_stats *sg_stats)
+{
+	sd_stats->group_load += sg_stats->group_load;
+	sd_stats->group_util += sg_stats->group_util;
+	sd_stats->group_runnable += sg_stats->group_runnable;
+	sd_stats->sum_h_nr_running += sg_stats->sum_h_nr_running;
+	sd_stats->sum_nr_running += sg_stats->sum_nr_running;
+	sd_stats->idle_cpus += sg_stats->idle_cpus;
+	sd_stats->group_capacity += sg_stats->group_capacity;
+	sd_stats->group_weight += sg_stats->group_weight;
+	sd_stats->overloaded |= sg_stats->overloaded;
+	sd_stats->overutilized |= sg_stats->overutilized;
+
+#ifdef CONFIG_NUMA_BALANCING
+	if (env->sd->flags & SD_NUMA) {
+		sd_stats->nr_numa_running += sg_stats->nr_numa_running;
+		sd_stats->nr_preferred_running += sg_stats->nr_preferred_running;
+	}
+#endif
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -11041,9 +11102,18 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 {
 	struct sched_group *sg = env->sd->groups;
 	struct sg_lb_stats *local = &sds->local_stat;
-	struct sg_lb_stats tmp_sgs;
-	unsigned long sum_util = 0;
 	bool sg_overloaded = 0, sg_overutilized = 0;
+	struct sg_lb_stats tmp_sgs, sd_stats = {};
+	unsigned long sum_util = 0;
+	bool should_prop = false;
+
+	/*
+	 * If a parent domain exists and the cached stats can be retrieved when
+	 * load balancing there, aggregate the statistics at current domain
+	 * to be retrieved when load balancing at parent.
+	 */
+	if (env->sd->parent && can_retrieve_stats(env->sd->parent, env->idle))
+		should_prop = true;
 
 	do {
 		struct sg_lb_stats *sgs = &tmp_sgs;
@@ -11061,21 +11131,36 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 
 		update_sg_lb_stats(env, sds, sg, sgs);
 
+		if (should_prop)
+			aggregate_sd_stats(env, &sd_stats, sgs);
+
 		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
 			sds->busiest_stat = *sgs;
 		}
 
 		/* Now, start updating sd_lb_stats */
-		sds->total_load += sgs->group_load;
-		sds->total_capacity += sgs->group_capacity;
-		sg_overloaded |= sgs->overloaded;
-		sg_overutilized |= sgs->overutilized;
+		if (!should_prop) {
+			sds->total_load += sgs->group_load;
+			sds->total_capacity += sgs->group_capacity;
+			sg_overloaded |= sgs->overloaded;
+			sg_overutilized |= sgs->overutilized;
+			sum_util += sgs->group_util;
+		}
 
-		sum_util += sgs->group_util;
 		sg = sg->next;
 	} while (sg != env->sd->groups);
 
+	if (should_prop) {
+		sds->total_load = sd_stats.group_load;
+		sds->total_capacity = sd_stats.group_capacity;
+		sg_overloaded = sd_stats.overloaded;
+		sg_overutilized = sd_stats.overutilized;
+		sum_util = sd_stats.group_util;
+
+		cache_sd_stats(env->sd, &sd_stats);
+	}
+
 	/*
 	 * Indicate that the child domain of the busiest group prefers tasks
 	 * go to a child's sibling domains first. NB the flags of a sched group
-- 
2.43.0

