linux-kernel - [PATCH] Reduce rq lock contention in load

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <ef54b130-e727-5ed1-1a4e-1e3c0a713b98@bytedance.com>
Date:   Thu, 24 Nov 2022 17:07:46 +0800
From:   chenying <chenying.kernel@...edance.com>
To:     mingo@...hat.com, Peter Zijlstra <peterz@...radead.org>,
        Juri Lelli <juri.lelli@...hat.com>,
        Vincent Guittot <vincent.guittot@...aro.org>,
        Dietmar Eggemann <dietmar.eggemann@....com>,
        Steven Rostedt <rostedt@...dmis.org>,
        Benjamin Segall <bsegall@...gle.com>
Cc:     linux-kernel <linux-kernel@...r.kernel.org>,
        Abel Wu <wuyun.abel@...edance.com>
Subject: [PATCH] Reduce rq lock contention in load_balance()

From: chenying <chenying.kernel@...edance.com>

When doing newidle load balancing, we may have lock contention on rq->lock
while finding the same busiest rq on multiple cpus. However, it is often
the case that after the first load balancing, the busiest-rq may not be the
busiest anymore. This may lead to pointless waits for locks. For this case,
we want to use trylock to reduce rq lock contention in load_balance().

We add rq->lb_lock for the load balancing path and use trylock to
acquire the busiest rq's lb_lock. If that fails, we clear the busiest
rq's cpu from load_balance_mask and goto refind.

The test results show that this patch reduces rq lock contention by
~35% with no noticeable change in scheduling latency.

unpatched:
lock_stat version 0.4
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                               class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
.............................................................................................................................................................................................................................

                                &rq->lock:         24906          25996           0.08          27.77       43122.87           1.66        1216316        6601547           0.05          41.59    10224267.38           1.55
                                ---------
                                &rq->lock           1210          [<000000000fe88813>] scheduler_tick+0x4f/0xf0
                                &rq->lock           1885          [<00000000de367e3c>] _nohz_idle_balance+0x116/0x250
                                &rq->lock          15111          [<00000000daf6fa95>] update_blocked_averages+0x30/0x6f0
                                &rq->lock           1156          [<00000000d5c71b0e>] __schedule+0xa9/0x800
                                ---------
                                &rq->lock          15542          [<00000000daf6fa95>] update_blocked_averages+0x30/0x6f0
                                &rq->lock            733          [<000000000fe88813>] scheduler_tick+0x4f/0xf0
                                &rq->lock           3066          [<000000000bc2ee47>] try_to_wake_up+0x206/0x710
                                &rq->lock           1272          [<00000000d5c71b0e>] __schedule+0xa9/0x800

patched:
lock_stat version 0.4
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                               class name    con-bounces    contentions   waittime-min   waittime-max waittime-total   waittime-avg    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total   holdtime-avg
.............................................................................................................................................................................................................................

                                &rq->lock:         16174          17105           0.07          33.13       31154.45           1.82        1162817        6602803           0.05          64.68    10141979.28           1.54
                                ---------
                                &rq->lock          11665          [<00000000ce27c902>] update_blocked_averages+0x30/0x700
                                &rq->lock           1457          [<00000000a6302c24>] try_to_wake_up+0x206/0x710
                                &rq->lock           1159          [<000000009f2bc605>] __schedule+0xa9/0x810
                                &rq->lock           1411          [<00000000aa0a6e31>] _nohz_idle_balance+0x116/0x250
                                ---------
                                &rq->lock           3032          [<00000000a6302c24>] try_to_wake_up+0x206/0x710
                                &rq->lock            248          [<000000008bd7e827>] load_balance+0x571/0xe80
                                &rq->lock          11502          [<00000000ce27c902>] update_blocked_averages+0x30/0x700
                                &rq->lock           1253          [<000000009f2bc605>] __schedule+0xa9/0x810

unpatched:
  # ./runqlat 60 1

     usecs               : count     distribution
          0 -> 1          : 1172     |                                        |
          2 -> 3          : 210063   |************************                |
          4 -> 7          : 337576   |****************************************|
          8 -> 15         : 24555    |**                                      |
         16 -> 31         : 13598    |*                                       |
         32 -> 63         : 779      |                                        |
         64 -> 127        : 230      |                                        |
        128 -> 255        : 83       |                                        |
        256 -> 511        : 54       |                                        |
        512 -> 1023       : 62       |                                        |
       1024 -> 2047       : 123      |                                        |
       2048 -> 4095       : 283      |                                        |
       4096 -> 8191       : 1362     |                                        |
       8192 -> 16383      : 2775     |                                        |
      16384 -> 32767      : 52352    |******                                  |
      32768 -> 65535      : 14       |                                        |
      65536 -> 131071     : 140      |                                        |

  patched:
  # ./runqlat 60 1

      usecs               : count     distribution
          0 -> 1          : 1091     |                                        |
          2 -> 3          : 205259   |***********************                 |
          4 -> 7          : 351620   |****************************************|
          8 -> 15         : 27812    |***                                     |
         16 -> 31         : 13971    |*                                       |
         32 -> 63         : 727      |                                        |
         64 -> 127        : 198      |                                        |
        128 -> 255        : 103      |                                        |
        256 -> 511        : 61       |                                        |
        512 -> 1023       : 45       |                                        |
       1024 -> 2047       : 108      |                                        |
       2048 -> 4095       : 271      |                                        |
       4096 -> 8191       : 1342     |                                        |
       8192 -> 16383      : 2732     |                                        |
      16384 -> 32767      : 49367    |*****                                   |
      32768 -> 65535      : 8        |                                        |
      65536 -> 131071     : 183      |                                        |

test script:

	#!/bin/bash

	mkdir /sys/fs/cgroup/cpuset/test1
	echo 12,14,16,18,20,22 > /sys/fs/cgroup/cpuset/test1/cpuset.cpus
	echo 0 > /sys/fs/cgroup/cpuset/test1/cpuset.mems

	mkdir /sys/fs/cgroup/cpuset/test2
	echo 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46 > /sys/fs/cgroup/cpuset/test2/cpuset.cpus
	echo 0 > /sys/fs/cgroup/cpuset/test2/cpuset.mems

	cgexec -g cpuset:test1 sysbench --test=cpu --cpu-max-prime=200000 run --num-threads=24 --rate=100 --time=6000 &
	cgexec -g cpuset:test2 sysbench --test=cpu --cpu-max-prime=200000 run --num-threads=96 --rate=100 --time=6000 &

Suggested-by: Abel Wu <wuyun.abel@...edance.com>
Signed-off-by: chenying <chenying.kernel@...edance.com>
---
  kernel/sched/core.c  |  1 +
  kernel/sched/fair.c  | 12 ++++++++++++
  kernel/sched/sched.h |  1 +
  3 files changed, 14 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index daff72f00385..d41f1a9c7d5f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9697,6 +9697,7 @@ void __init sched_init(void)

  		rq = cpu_rq(i);
  		raw_spin_lock_init(&rq->__lock);
+		raw_spin_lock_init(&rq->lb_lock);
  		rq->nr_running = 0;
  		rq->calc_load_active = 0;
  		rq->calc_load_update = jiffies + LOAD_FREQ;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c..d92c42671b99 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10295,6 +10295,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
  		goto out_balanced;
  	}

+refind:
  	busiest = find_busiest_queue(&env, group);
  	if (!busiest) {
  		schedstat_inc(sd->lb_nobusyq[idle]);
@@ -10303,6 +10304,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,

  	WARN_ON_ONCE(busiest == env.dst_rq);

+	if (!raw_spin_trylock(&busiest->lb_lock)) {
+		__cpumask_clear_cpu(cpu_of(busiest), cpus);
+		if (cpumask_intersects(sched_group_span(group), cpus))
+			goto refind;
+
+		goto out_balanced;
+	}
+
  	schedstat_add(sd->lb_imbalance[idle], env.imbalance);

  	env.src_cpu = busiest->cpu;
@@ -10403,6 +10412,8 @@ static int load_balance(int this_cpu, struct rq *this_rq,

  		/* All tasks on this runqueue were pinned by CPU affinity */
  		if (unlikely(env.flags & LBF_ALL_PINNED)) {
+			raw_spin_unlock(&busiest->lb_lock);
+
  			__cpumask_clear_cpu(cpu_of(busiest), cpus);
  			/*
  			 * Attempting to continue load balancing at the current
@@ -10420,6 +10431,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
  			goto out_all_pinned;
  		}
  	}
+	raw_spin_unlock(&busiest->lb_lock);

  	if (!ld_moved) {
  		schedstat_inc(sd->lb_failed[idle]);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4a20046e586..384690bda8c3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -954,6 +954,7 @@ struct balance_callback {
  struct rq {
  	/* runqueue lock: */
  	raw_spinlock_t		__lock;
+	raw_spinlock_t          lb_lock;

  	/*
  	 * nr_running and cpu_load should be in the same cacheline because
-- 
2.11.0