linux-kernel - [PATCH v3 0/2] sched: Minor changes for rd->overload access

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240325054505.201995-1-sshegde@linux.ibm.com>
Date: Mon, 25 Mar 2024 11:15:03 +0530
From: Shrikanth Hegde <sshegde@...ux.ibm.com>
To: mingo@...nel.org, peterz@...radead.org, vincent.guittot@...aro.org
Cc: sshegde@...ux.ibm.com, dietmar.eggemann@....com, qyousef@...alina.io,
        linux-kernel@...r.kernel.org, vschneid@...hat.com
Subject: [PATCH v3 0/2] sched: Minor changes for rd->overload access

v2 -> v3:
- Wrapped check for value change inside ser_rd_overload_status
  as suggested by Qais.
- Added reviewed-by tags.

v1 -> v2:
- dropped Fixes tag.
- Added one of the perf probes in the changelog for reference.
- Added reviewed-by tags.

tl;dr
When running workloads in large systems, it was observed that access to
rd->overload was taking time. It would be better to check the value
before updating since value changes less often. Patch 1/2 does that.
With patch updates happen only if necessary. CPU Bus traffic reduced a
bit. No significant gains in workload performance.

Qais Suggested that it would be better to use the helper functions to
access the rd->overload instead. Patch 2/2 does that.

*These patches depend on below to be applied first*
https://lore.kernel.org/all/20240307085725.444486-1-sshegde@linux.ibm.com/


-----------------------------------------------------------------------
Detailed Perf annotation and probes stat
-----------------------------------------------------------------------
=======
6.8-rc5
=======
320 CPU system, SMT8
  NUMA node(s):          4
  NUMA node0 CPU(s):     0-79
  NUMA node1 CPU(s):     80-159
  NUMA node6 CPU(s):     160-239
  NUMA node7 CPU(s):     240-319

Perf annoate while running "schbench -t 320 -i 30 -r 30"
       │     if (!READ_ONCE(this_rq->rd->overload) ||
 18.05 │       ld       r9,2752(r31)
       │     sd = rcu_dereference_check_sched_domain(this_rq->sd);
  6.97 │       ld       r30,2760(r31)


Added some dummy codes so the probes can be put at required places.
perf probe -L update_sd_lb_stats
     46         if (env->sd->flags & SD_NUMA)
     47                 env->fbq_type = fbq_classify_group(&sds->busiest_stat);

     49         if (!env->sd->parent) {
                        /* update overload indicator if we are at root domain */
     51                 WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);

perf -probe -L sched_balance_newidle
                rcu_read_lock();
     38         sd = rcu_dereference_check_sched_domain(this_rq->sd);

                if (!READ_ONCE(this_rq->rd->overload) ||
                    (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {

perf probe -L add_nr_running
         #ifdef CONFIG_SMP
     11         if (prev_nr < 2 && rq->nr_running >= 2) {
     12                 if (!READ_ONCE(rq->rd->overload)) {
     13                         a = a +10;
                                WRITE_ONCE(rq->rd->overload, 1);
                        }

probe hits when running different workload.
idle
320 probe:add_nr_running_L12
260 probe:add_nr_running_L13
1K probe:sched_balance_newidle_L38
596 probe:update_sd_lb_stats_L51

/hackbench 10 process 100000 loops
130K probe:add_nr_running_L12
93 probe:add_nr_running_L13
1M probe:sched_balance_newidle_L38
109K probe:update_sd_lb_stats_L51

/schbench -t 320 -i 30 -r 30
3K probe:add_nr_running_L12
436 probe:add_nr_running_L13
125K probe:sched_balance_newidle_L38
33K probe:update_sd_lb_stats_L51

Modified stress-ng --wait
3K probe:add_nr_running_L12
1K probe:add_nr_running_L13
6M probe:sched_balance_newidle_L38
11K probe:update_sd_lb_stats_L51

stress-ng --cpu=400 -l 20
833 probe:add_nr_running_L12
280 probe:add_nr_running_L13
2K probe:sched_balance_newidle_L38
1K probe:update_sd_lb_stats_L51

stress-ng --cpu=400 -l 100
730 probe:add_nr_running_L12
0 probe:add_nr_running_L13
0 probe:sched_balance_newidle_L38
0 probe:update_sd_lb_stats_L51

stress-ng --cpu=800 -l 50
2K probe:add_nr_running_L12
0 probe:add_nr_running_L13
2K probe:sched_balance_newidle_L38
946 probe:update_sd_lb_stats_L51

stress-ng --cpu=800 -l 100
361 probe:add_nr_running_L12
0 probe:add_nr_running_L13
0 probe:sched_balance_newidle_L38
0 probe:update_sd_lb_stats_L51

L13 numbers are quite less compared to L12. This indicates that it might
not change often.

------------------------------------------------------------------------------
==========
With Patch:
==========
Perf annoate while running "schbench -t 320 -i 30 -r 30"
       │     if (!READ_ONCE(this_rq->rd->overload) ||
       │       ld       r9,2752(r31)
       │     sd = rcu_dereference_check_sched_domain(this_rq->sd);
       │       ld       r30,2760(r31)
       │     if (!READ_ONCE(this_rq->rd->overload) ||
       │       lwz      r9,536(r9)
       │       cmpwi    r9,0
       │     ↓ beq      2b4
       │100:   mflr     r0
       │       cmpdi    r30,0
  0.38 │       std      r0,240(r1)
  1.56 │     ↓ beq      120


perf probe -L update_sd_lb_stats
     49         if (!env->sd->parent) {
     50                 int a;
                        /* update overload indicator if we are at root domain */
                        if ( READ_ONCE(env->dst_rq->rd->overload) != sg_status & SG_OVERLOAD) {
     53                         a= a+10;
                                WRITE_ONCE(env->dst_rq->rd->overload, sg_status & SG_OVERLOAD);
                        }

perf probe -L sched_balance_newidle
     38         sd = rcu_dereference_check_sched_domain(this_rq->sd);

                if (!READ_ONCE(this_rq->rd->overload) ||
                    (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {

perf probe -L add_nr_running
		#ifdef CONFIG_SMP
     11         if (prev_nr < 2 && rq->nr_running >= 2) {
     12                 if (!READ_ONCE(rq->rd->overload)) {
     13                         a = a +10;
                                WRITE_ONCE(rq->rd->overload, 1);
                        }

perf probes when running different workloads. How many times actual
value changes in update_sd_lb_stats is indicated as L53/L50*100.
idle
818 probe:sched_balance_newidle_L38
262 probe:update_sd_lb_stats_L53	<-- 86%
321 probe:add_nr_running_L12
261 probe:add_nr_running_L13
304 probe:update_sd_lb_stats_L50

/hackbench 10 process 100000 loops
1M probe:sched_balance_newidle_L38
139 probe:update_sd_lb_stats_L53	<-- 0.25%
129K probe:add_nr_running_L12
74 probe:add_nr_running_L13
54K probe:update_sd_lb_stats_L50

/schbench -t 320 -i 30 -r 30
101K probe:sched_balance_newidle_L38
2K probe:update_sd_lb_stats_L53		<-- 9.09%
5K probe:add_nr_running_L12
1K probe:add_nr_running_L13
22K probe:update_sd_lb_stats_L50

Modified stress-ng --wait
6M probe:sched_balance_newidle_L38
2K probe:update_sd_lb_stats_L53		<-- 25%
4K probe:add_nr_running_L12
2K probe:add_nr_running_L13
8K probe:update_sd_lb_stats_L50

stress-ng --cpu=400 -l 20
2K probe:sched_balance_newidle_L38
286 probe:update_sd_lb_stats_L53	<-- 36.11%
746 probe:add_nr_running_L12
256 probe:add_nr_running_L13
792 probe:update_sd_lb_stats_L50

stress-ng --cpu=400 -l 100
2 probe:sched_balance_newidle_L38
0 probe:update_sd_lb_stats_L53		<-- NA
923 probe:add_nr_running_L12
0 probe:add_nr_running_L13
0 probe:update_sd_lb_stats_L50

stress-ng --cpu=800 -l 50
2K probe:sched_balance_newidle_L38
0 probe:update_sd_lb_stats_L53		<-- 0%
2K probe:add_nr_running_L12
0 probe:add_nr_running_L13
429 probe:update_sd_lb_stats_L50

stress-ng --cpu=800 -l 100
0 probe:sched_balance_newidle_L38
0 probe:update_sd_lb_stats_L53		<-- NA
424 probe:add_nr_running_L12
0 probe:add_nr_running_L13
1 probe:update_sd_lb_stats_L50

This indicates that likely that value changes less often. So adding a
read before update would help in generic workloads.
-------------------------------------------------------------------------------

Shrikanth Hegde (2):
  sched/fair: Check rd->overload value before update
  sched/fair: Use helper functions to access rd->overload

 kernel/sched/fair.c  |  4 ++--
 kernel/sched/sched.h | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

--
2.39.3