Message-ID: <20230327053955.GA570404@ziqianlu-desk2>
Date: Mon, 27 Mar 2023 13:39:55 +0800
From: Aaron Lu <aaron.lu@...el.com>
To: Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>
CC: Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Daniel Bristot de Oliveira <bristot@...hat.com>,
"Valentin Schneider" <vschneid@...hat.com>,
Tim Chen <tim.c.chen@...el.com>,
"Nitin Tekchandani" <nitin.tekchandani@...el.com>,
Waiman Long <longman@...hat.com>,
Yu Chen <yu.c.chen@...el.com>, <linux-kernel@...r.kernel.org>
Subject: [RFC PATCH] sched/fair: Make tg->load_avg per node
When using sysbench to benchmark Postgres in a single docker instance
with sysbench's nr_threads set to nr_cpu, it is observed that
update_cfs_group() and update_load_avg() at times show noticeable
overhead on the cpus of one node of a 2-socket/112-core/224-cpu Intel
Sapphire Rapids system:
10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
7.84% 7.43% [kernel.vmlinux] [k] update_load_avg
The cpus of the other node normally see a lower cycle percentage:
4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
4.02% 3.40% [kernel.vmlinux] [k] update_load_avg
perf annotate shows the cycles are mostly spent accessing tg->load_avg,
with update_load_avg() being the write side and update_cfs_group() the
read side.
The reason only the cpus of one node see the bigger overhead is:
task_group is allocated on demand from a slab, so the tg lands on
whichever node happens to do the allocation. Accessing tg->load_avg is
then cheaper for cpus on that node and more expensive for cpus on the
remote node.
Tim Chen told me that PeterZ once mentioned a way to solve a similar
problem by making a counter per node, so this patch does the same for
tg->load_avg.
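The idea is to turn one globally contended atomic into per-node
atomics: the write side only updates its local node's counter, and the
read side sums over all nodes. A minimal user-space sketch of the
pattern (illustrative only; NR_NODES and all names here are made up for
the sketch, the real change is in the diff below):

    /* Per-node counter sketch: writers stay node-local, readers sum. */
    #include <stdatomic.h>

    #define NR_NODES 4	/* assumed fixed for this sketch */

    struct node_counter {
    	/*
    	 * Each node's counter sits on its own cache line so writers
    	 * on different nodes do not false-share.
    	 */
    	_Alignas(64) atomic_long val;
    };

    static struct node_counter counters[NR_NODES];

    /* Write side: update only the caller's node, cheap and local. */
    static void counter_add(int node, long delta)
    {
    	atomic_fetch_add_explicit(&counters[node].val, delta,
    				  memory_order_relaxed);
    }

    /* Read side: sum all nodes; slightly stale but contention-free. */
    static long counter_read(void)
    {
    	long sum = 0;

    	for (int i = 0; i < NR_NODES; i++)
    		sum += atomic_load_explicit(&counters[i].val,
    					    memory_order_relaxed);
    	return sum;
    }

The trade-off is that a read now touches one cacheline per node instead
of a single shared one, which is why the pattern helps most when the
write side is the contended one.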
After this change, the worst numbers I saw during a 5-minute run on
both nodes are:
2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group
Another observation of this workload: it has a lot of wakeup-time task
migrations, and that is why update_load_avg() and update_cfs_group()
show noticeable cost. When this workload runs in N instances (N >= 2)
with sysbench's nr_threads set to nr_cpu/N, wakeup-time task migrations
drop greatly and so does the overhead of the two functions above. It's
not yet clear to me why running multiple instances reduces task
migrations in the wakeup path.
Reported-by: Nitin Tekchandani <nitin.tekchandani@...el.com>
Signed-off-by: Aaron Lu <aaron.lu@...el.com>
---
kernel/sched/core.c | 33 ++++++++++++++++++++++++++-------
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 5 +++--
kernel/sched/sched.h | 32 ++++++++++++++++++++++++--------
4 files changed, 54 insertions(+), 18 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a4918a1faa9..531d465038d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9759,9 +9759,6 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
-
-/* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
#endif
void __init sched_init(void)
@@ -9820,8 +9817,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */
#ifdef CONFIG_CGROUP_SCHED
- task_group_cache = KMEM_CACHE(task_group, 0);
-
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
@@ -10219,7 +10214,15 @@ static void sched_free_group(struct task_group *tg)
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
 	autogroup_free(tg);
-	kmem_cache_free(task_group_cache, tg);
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	{
+		int i;
+
+		for_each_node(i)
+			kfree(tg->node_info[i]);
+	}
+#endif
+	kfree(tg);
 }
 
 static void sched_free_group_rcu(struct rcu_head *rcu)
@@ -10241,11 +10235,27 @@ static void sched_unregister_group(struct task_group *tg)
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
+ size_t size = sizeof(struct task_group);
+ int __maybe_unused i, nodes;
struct task_group *tg;
- tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	nodes = nr_node_ids;
+ size += nodes * sizeof(void *);
+ tg = kzalloc(size, GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ for_each_node(i) {
+ tg->node_info[i] = kzalloc_node(sizeof(struct tg_node_info), GFP_KERNEL, i);
+ if (!tg->node_info[i])
+			goto err;
+ }
+#else
+ tg = kzalloc(size, GFP_KERNEL);
if (!tg)
return ERR_PTR(-ENOMEM);
+#endif
if (!alloc_fair_sched_group(tg, parent))
goto err;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..2f20728aa093 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -645,7 +645,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
- atomic_long_read(&cfs_rq->tg->load_avg));
+ tg_load_avg(cfs_rq->tg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..68ac015fab6a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3439,7 +3439,7 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
- tg_weight = atomic_long_read(&tg->load_avg);
+ tg_weight = tg_load_avg(tg);
/* Ensure tg_weight >= load */
tg_weight -= cfs_rq->tg_load_avg_contrib;
@@ -3608,6 +3608,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ int node = cpu_to_node(cfs_rq->rq->cpu);
/*
* No need to update load_avg for root_task_group as it is not used.
@@ -3616,7 +3617,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
return;
if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
- atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ atomic_long_add(delta, &cfs_rq->tg->node_info[node]->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..11a1aed4e8f0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -365,6 +365,14 @@ struct cfs_bandwidth {
#endif
};
+struct tg_node_info {
+ /*
+ * load_avg can be heavily contended at clock tick and task
+ * enqueue/dequeue time, so put it in its own cacheline.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
+};
+
/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -379,14 +387,6 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;
-#ifdef CONFIG_SMP
- /*
- * load_avg can be heavily contended at clock tick time, so put
- * it in its own cacheline separated from the fields above which
- * will also be accessed at each tick.
- */
- atomic_long_t load_avg ____cacheline_aligned;
-#endif
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -418,8 +418,24 @@ struct task_group {
struct uclamp_se uclamp[UCLAMP_CNT];
#endif
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ struct tg_node_info *node_info[];
+#endif
};
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+static inline long tg_load_avg(struct task_group *tg)
+{
+ long load_avg = 0;
+ int i;
+
+ for_each_node(i)
+ load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
+
+ return load_avg;
+}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
base-commit: c9c3395d5e3dcc6daee66c6908354d47bf98cb0c
--
2.39.2