Message-ID: <a5d5ce5e-9f98-4c0d-a4ed-5e4a8a6f7b86@linux.ibm.com>
Date: Thu, 16 Oct 2025 19:33:34 +0530
From: Shrikanth Hegde <sshegde@...ux.ibm.com>
To: Peter Zijlstra <peterz@...radead.org>
Cc: Tim Chen <tim.c.chen@...ux.intel.com>, Ingo Molnar <mingo@...nel.org>,
Chen Yu <yu.c.chen@...el.com>, Doug Nelson <doug.nelson@...el.com>,
Mohini Narkhede <mohini.narkhede@...el.com>,
linux-kernel@...r.kernel.org,
Vincent Guittot <vincent.guittot@...aro.org>,
K Prateek Nayak <kprateek.nayak@....com>
Subject: Re: [RESEND PATCH] sched/fair: Skip sched_balance_running cmpxchg
when balance is not due
On 10/14/25 3:12 PM, Peter Zijlstra wrote:
> On Tue, Oct 14, 2025 at 03:03:41PM +0530, Shrikanth Hegde wrote:
>
>>> @@ -11758,6 +11775,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
>>> goto out_balanced;
>>> }
>>> + if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
>>> + if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
>>> + goto out_balanced;
>>
>> Maybe goto out instead of out_balanced ?
>
> That would be inconsistent with the !should_we_balance() goto
> out_balanced right above this, no?
>
Hi Peter.
I collected probe-point numbers for this patch, similar to the ones in the thread below; the patch itself is also
quite similar to what was suggested there a while ago.
https://lore.kernel.org/all/41e11090-a100-48a7-a0dd-c989772822d7@linux.ibm.com/
This is a 480-CPU system with 6 NUMA nodes (a different system than last time).
tl;dr
- The number of times sched_balance_running is taken is way lower after the swb check (e.g. at 25% load:
  442 hits at rq_L35 vs 510,551 at domains_L34, where the old code would have tried it), which is great.
- The number of times the cmpxchg fails to set it is very low after swb (e.g. at 50% load, only 6 of 926
  attempts fail), so out_balanced vs out may not make a significant difference.
- The patch is at the end. It is this patch + the redo handling + the ref variable instrumentation (ignore that part).
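To make the probe labels easier to map, here is a minimal userspace sketch of the try-lock pattern the patch uses on
sched_balance_running (my paraphrase using C11 atomics, not kernel code; the function name and the serialize_domain
parameter are made up for illustration):

#include <stdatomic.h>
#include <stdbool.h>

/* Stand-in for the kernel's sched_balance_running atomic_t flag. */
static atomic_int sched_balance_running;

/* Roughly what sched_balance_rq() does once should_we_balance() has passed. */
static bool serialized_balance_attempt(bool serialize_domain)
{
	bool need_unlock = false;
	int expected = 0;

	if (serialize_domain) {
		/* Mirrors atomic_cmpxchg_acquire(): only one CPU wins the 0 -> 1 transition. */
		if (!atomic_compare_exchange_strong_explicit(&sched_balance_running,
							     &expected, 1,
							     memory_order_acquire,
							     memory_order_relaxed))
			return false;	/* "fails to set": skip this pass (probe rq_L36) */
		need_unlock = true;	/* "taken": we now own the flag (probe rq_L39) */
	}

	/* ... the actual load-balancing pass runs here ... */

	if (need_unlock)		/* mirrors atomic_set_release() at redo/out (probes rq_L168/rq_L288) */
		atomic_store_explicit(&sched_balance_running, 0, memory_order_release);

	return true;
}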
--- detailed log ---
++++++++++++ probe points +++++++++++++++
(I added a ref("crap") variable so I could put a probe exactly where I want; the probe setup is sketched after the
probe list below.)
0 static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
...
20 max_cost += sd->max_newidle_lb_cost;
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
* actively.
*/
if (!continue_balancing) {
if (need_decay)
continue;
break;
}
33 if (sd->flags & SD_SERIALIZE)
34 ref = ref + 5;
<sched_balance_rq@...me/shrikanth/sched_tip/kernel/sched/fair.c:0>
0 static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
...
int need_unlock = false;
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
25 schedstat_inc(sd->lb_count[idle]);
...
34 if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
35 if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) {
36 ref = ref+1;
37 goto out_balanced;
}
39 ref = ref + 2;
40 need_unlock = true;
...
env.loop_break = SCHED_NR_MIGRATE_BREAK;
167 if (need_unlock) {
168 ref = ref+3;
169 atomic_set_release(&sched_balance_running, 0);
}
goto redo;
...
out:
287 if (need_unlock) {
288 ref = ref +4;
289 atomic_set_release(&sched_balance_running, 0);
}
return ld_moved;
probe:sched_balance_domains_L34 (on sched_balance_domains:34@...nel/sched/fair.c)
probe:sched_balance_rq_L168 (on sched_balance_rq:168@...nel/sched/fair.c)
probe:sched_balance_rq_L21 (on sched_balance_rq+312@...nel/sched/fair.c)
probe:sched_balance_rq_L288 (on sched_balance_rq+312@...nel/sched/fair.c)
probe:sched_balance_rq_L35 (on sched_balance_rq+312@...nel/sched/fair.c)
probe:sched_balance_rq_L36 (on sched_balance_rq+312@...nel/sched/fair.c)
probe:sched_balance_rq_L39 (on sched_balance_rq+312@...nel/sched/fair.c)
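The function listings above are perf probe --line style output and the probe points were added at those relative line
numbers; assuming kernel debuginfo is available, the setup was presumably along these lines (my reconstruction of the
commands, not copied from the actual run):

  # List a function with relative line numbers to pick probe spots
  perf probe -L sched_balance_rq

  # Add probes at the relative lines of interest
  # (perf names them like sched_balance_rq_L35, as in the list above)
  perf probe -a 'sched_balance_domains:34'
  perf probe -a 'sched_balance_rq:35'
  perf probe -a 'sched_balance_rq:36'
  perf probe -a 'sched_balance_rq:39'
  perf probe -a 'sched_balance_rq:168'
  perf probe -a 'sched_balance_rq:288'

  # Then count how often each fires, as in the runs below
  perf stat -a -e 'probe:*' -- sleep 10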
+++++++++++ Data on various load points ++++++++++++++++++++++++
--- idle ---
perf stat -a -e probe:* sleep 10
6,123 probe:sched_balance_domains_L34
10,378 probe:sched_balance_rq_L21
79 probe:sched_balance_rq_L35
17 probe:sched_balance_rq_L36
62 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
62 probe:sched_balance_rq_L288
--- 25% load ---
perf stat -a -e probe:* stress-ng --cpu=480 -l 25 -t 10
510,551 probe:sched_balance_domains_L34
303,892 probe:sched_balance_rq_L21
442 probe:sched_balance_rq_L35
3 probe:sched_balance_rq_L36
439 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
439 probe:sched_balance_rq_L288
--- 50% load ---
248,969 probe:sched_balance_domains_L34
187,864 probe:sched_balance_rq_L21
926 probe:sched_balance_rq_L35
6 probe:sched_balance_rq_L36
920 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
920 probe:sched_balance_rq_L288
--- 75% load ---
110,294 probe:sched_balance_domains_L34
71,568 probe:sched_balance_rq_L21
861 probe:sched_balance_rq_L35
6 probe:sched_balance_rq_L36
855 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
855 probe:sched_balance_rq_L288
--- 100% load ---
85,960 probe:sched_balance_domains_L34
48,169 probe:sched_balance_rq_L21
71 probe:sched_balance_rq_L35
4 probe:sched_balance_rq_L36
67 probe:sched_balance_rq_L39
0 probe:sched_balance_rq_L168
67 probe:sched_balance_rq_L288
++++++++++++++++++ patch ++++++++++++++++++++++++++++++++++++
(ignore ref crap)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index cee1793e8277..832104705500 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11722,10 +11722,29 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd
}
}
+
+/*
+ * This flag serializes load-balancing passes over large domains
+ * (above the NODE topology level) - only one load-balancing instance
+ * may run at a time, to reduce overhead on very large systems with
+ * lots of CPUs and large NUMA distances.
+ *
+ * - Note that load-balancing passes triggered while another one
+ * is executing are skipped and not re-tried.
+ *
+ * - Also note that this does not serialize rebalance_domains()
+ * execution, as non-SD_SERIALIZE domains will still be
+ * load-balanced in parallel.
+ */
+static atomic_t sched_balance_running = ATOMIC_INIT(0);
+
/*
* Check this_cpu to ensure it is balanced within domain. Attempt to move
* tasks if there is an imbalance.
*/
+
+int ref = 0;
+
static int sched_balance_rq(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
@@ -11747,10 +11766,12 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
+ int need_unlock = false;
cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
schedstat_inc(sd->lb_count[idle]);
+ ref = 1;
redo:
if (!should_we_balance(&env)) {
@@ -11758,6 +11779,15 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
goto out_balanced;
}
+ if (idle != CPU_NEWLY_IDLE && (sd->flags & SD_SERIALIZE)) {
+ if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1)) {
+ ref = ref+1;
+ goto out_balanced;
+ }
+ ref = ref + 2;
+ need_unlock = true;
+ }
+
group = sched_balance_find_src_group(&env);
if (!group) {
schedstat_inc(sd->lb_nobusyg[idle]);
@@ -11882,6 +11912,10 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = SCHED_NR_MIGRATE_BREAK;
+ if (need_unlock) {
+ ref = ref+3;
+ atomic_set_release(&sched_balance_running, 0);
+ }
goto redo;
}
goto out_all_pinned;
@@ -11998,6 +12032,11 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
out:
+ if (need_unlock) {
+ ref = ref +4;
+ atomic_set_release(&sched_balance_running, 0);
+ }
+
return ld_moved;
}
@@ -12122,21 +12161,6 @@ static int active_load_balance_cpu_stop(void *data)
return 0;
}
-/*
- * This flag serializes load-balancing passes over large domains
- * (above the NODE topology level) - only one load-balancing instance
- * may run at a time, to reduce overhead on very large systems with
- * lots of CPUs and large NUMA distances.
- *
- * - Note that load-balancing passes triggered while another one
- * is executing are skipped and not re-tried.
- *
- * - Also note that this does not serialize rebalance_domains()
- * execution, as non-SD_SERIALIZE domains will still be
- * load-balanced in parallel.
- */
-static atomic_t sched_balance_running = ATOMIC_INIT(0);
-
/*
* Scale the max sched_balance_rq interval with the number of CPUs in the system.
* This trades load-balance latency on larger machines for less cross talk.
@@ -12192,7 +12216,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
- int need_serialize, need_decay = 0;
+ int need_decay = 0;
u64 max_cost = 0;
rcu_read_lock();
@@ -12215,14 +12239,10 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
break;
}
- interval = get_sd_balance_interval(sd, busy);
-
- need_serialize = sd->flags & SD_SERIALIZE;
- if (need_serialize) {
- if (atomic_cmpxchg_acquire(&sched_balance_running, 0, 1))
- goto out;
- }
+ if (sd->flags & SD_SERIALIZE)
+ ref = ref + 5;
+ interval = get_sd_balance_interval(sd, busy);
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (sched_balance_rq(cpu, rq, sd, idle, &continue_balancing)) {
/*
@@ -12236,9 +12256,7 @@ static void sched_balance_domains(struct rq *rq, enum cpu_idle_type idle)
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, busy);
}
- if (need_serialize)
- atomic_set_release(&sched_balance_running, 0);
-out:
+
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;