[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <11222824-9a23-0766-70f3-709ab2fc6cc0@bytedance.com>
Date: Tue, 13 Dec 2022 11:13:24 +0800
From: chenying <chenying.kernel@...edance.com>
To: Abel Wu <wuyun.abel@...edance.com>, mingo@...hat.com,
Peter Zijlstra <peterz@...radead.org>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Benjamin Segall <bsegall@...gle.com>
Cc: linux-kernel <linux-kernel@...r.kernel.org>
Subject: [PATCH] sched: Reduce rq lock contention in load_balance()
From: chenying <chenying.kernel@...edance.com>
When doing newidle load balancing, we may have lock contention on rq->lock
when multiple cpus select the same busiest rq. However, it is often the
case that after the first load balance completes, that rq is no longer the
busiest, so the remaining cpus end up waiting on its lock pointlessly.
Add rq->balancing to quickly check whether the busiest rq has already been
selected by load_balance() on another cpu. If it has, clear that rq's cpu
from load_balance_mask and then goto refind.
The test results show that this patch reduces rq->lock contention by ~30%
with no scheduling latency degradation.
unpatched:
lock_stat version 0.4
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg
acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
holdtime-avg
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
&rq->lock: 25532 26471
0.09 22.86 42250.81 1.60
1232063 6586225 0.05 40.54 10280028.19
1.56
---------
&rq->lock 1310
[<0000000081600630>] __schedule+0xa9/0x800
&rq->lock 1430
[<00000000754f510d>] try_to_wake_up+0x206/0x710
&rq->lock 15426
[<0000000020af4cb5>] update_blocked_averages+0x30/0x6f0
&rq->lock 1449
[<00000000dc949053>] _nohz_idle_balance+0x116/0x250
---------
&rq->lock 3329
[<00000000754f510d>] try_to_wake_up+0x206/0x710
&rq->lock 1241
[<0000000081600630>] __schedule+0xa9/0x800
&rq->lock 15480
[<0000000020af4cb5>] update_blocked_averages+0x30/0x6f0
&rq->lock 5333
[<000000004969102f>] load_balance+0x3b7/0xe40
patched:
lock_stat version 0.4
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
class name con-bounces contentions
waittime-min waittime-max waittime-total waittime-avg
acq-bounces acquisitions holdtime-min holdtime-max holdtime-total
holdtime-avg
.............................................................................................................................................................................................................................
&rq->lock: 17497 18300
0.09 23.15 32152.22 1.76
1137409 6484176 0.05 40.19 10125220.60
1.56
---------
&rq->lock 12298
[<000000004314e22f>] update_blocked_averages+0x30/0x6f0
&rq->lock 1005
[<000000005b222b90>] __schedule+0xa9/0x800
&rq->lock 1271
[<00000000c7a66a89>] try_to_wake_up+0x206/0x710
&rq->lock 1380
[<00000000eac23b6b>] load_balance+0x560/0xe70
---------
&rq->lock 2962
[<00000000c7a66a89>] try_to_wake_up+0x206/0x710
&rq->lock 11844
[<000000004314e22f>] update_blocked_averages+0x30/0x6f0
&rq->lock 592
[<0000000032421516>] scheduler_tick+0x4f/0xf0
&rq->lock 1243
[<000000005b222b90>] __schedule+0xa9/0x800
unpatched:
# ./runqlat 60 1
usecs : count distribution
0 -> 1 : 1172 |
|
2 -> 3 : 210063 |************************
|
4 -> 7 : 337576
|****************************************|
8 -> 15 : 24555 |**
|
16 -> 31 : 13598 |*
|
32 -> 63 : 779 |
|
64 -> 127 : 230 |
|
128 -> 255 : 83 |
|
256 -> 511 : 54 |
|
512 -> 1023 : 62 |
|
1024 -> 2047 : 123 |
|
2048 -> 4095 : 283 |
|
4096 -> 8191 : 1362 |
|
8192 -> 16383 : 2775 |
|
16384 -> 32767 : 52352 |******
|
32768 -> 65535 : 14 |
|
65536 -> 131071 : 140 |
|
patched:
# ./runqlat 60 1
usecs : count distribution
0 -> 1 : 1091 |
|
2 -> 3 : 205259 |***********************
|
4 -> 7 : 351620
|****************************************|
8 -> 15 : 27812 |***
|
16 -> 31 : 13971 |*
|
32 -> 63 : 727 |
|
64 -> 127 : 198 |
|
128 -> 255 : 103 |
|
256 -> 511 : 61 |
|
512 -> 1023 : 45 |
|
1024 -> 2047 : 108 |
|
2048 -> 4095 : 271 |
|
4096 -> 8191 : 1342 |
|
8192 -> 16383 : 2732 |
|
16384 -> 32767 : 49367 |*****
|
32768 -> 65535 : 8 |
|
65536 -> 131071 : 183 |
|
Below is the script to run the sysbench workload:
#!/bin/bash
# Reproduction workload: create two overlapping cpuset cgroups (test1 is a
# subset of test2's cpus) and drive both with rate-limited sysbench CPU
# workloads to provoke load-balancing and rq->lock contention.
# NOTE(review): original archive text wrapped several commands across lines
# without continuations, breaking the script; rejoined here.

mkdir /sys/fs/cgroup/cpuset/test1
echo 12,14,16,18,20,22 > /sys/fs/cgroup/cpuset/test1/cpuset.cpus
echo 0 > /sys/fs/cgroup/cpuset/test1/cpuset.mems

mkdir /sys/fs/cgroup/cpuset/test2
echo 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46 > /sys/fs/cgroup/cpuset/test2/cpuset.cpus
echo 0 > /sys/fs/cgroup/cpuset/test2/cpuset.mems

# Run both workloads in the background; each is capped at 100 events/sec
# for 6000 seconds.
cgexec -g cpuset:test1 sysbench --test=cpu --cpu-max-prime=200000 run --num-threads=24 --rate=100 --time=6000 &
cgexec -g cpuset:test2 sysbench --test=cpu --cpu-max-prime=200000 run --num-threads=96 --rate=100 --time=6000 &
Suggested-by: Abel Wu <wuyun.abel@...edance.com>
Signed-off-by: chenying <chenying.kernel@...edance.com>
---
kernel/sched/core.c | 1 +
kernel/sched/fair.c | 11 +++++++++++
kernel/sched/sched.h | 1 +
3 files changed, 13 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index daff72f00385..ca4fa84c8751 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9737,6 +9737,7 @@ void __init sched_init(void)
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig =
SCHED_CAPACITY_SCALE;
rq->balance_callback = &balance_push_callback;
+ rq->balancing = false;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e4a0b8bd941c..aeb4fa9ac93a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10295,6 +10295,7 @@ static int load_balance(int this_cpu, struct rq
*this_rq,
goto out_balanced;
}
+refind:
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd->lb_nobusyq[idle]);
@@ -10303,6 +10304,14 @@ static int load_balance(int this_cpu, struct rq
*this_rq,
WARN_ON_ONCE(busiest == env.dst_rq);
+ if (READ_ONCE(busiest->balancing)) {
+ __cpumask_clear_cpu(cpu_of(busiest), cpus);
+ if (cpumask_intersects(sched_group_span(group), cpus))
+ goto refind;
+
+ goto out_balanced;
+ }
+
schedstat_add(sd->lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
@@ -10323,6 +10332,7 @@ static int load_balance(int this_cpu, struct rq
*this_rq,
more_balance:
rq_lock_irqsave(busiest, &rf);
update_rq_clock(busiest);
+ WRITE_ONCE(busiest->balancing, true);
/*
* cur_ld_moved - load moved in current iteration
@@ -10338,6 +10348,7 @@ static int load_balance(int this_cpu, struct rq
*this_rq,
* See task_rq_lock() family for the details.
*/
+ WRITE_ONCE(busiest->balancing, false);
rq_unlock(busiest, &rf);
if (cur_ld_moved) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4a20046e586..6730808a0ab8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1044,6 +1044,7 @@ struct rq {
struct balance_callback *balance_callback;
+ unsigned char balancing;
unsigned char nohz_idle_balance;
unsigned char idle_balance;
--
2.11.0
Powered by blists - more mailing lists