Message-ID: <ab045bd69263d5e4e71a9aa36703071a267d2a36.1753076363.git.pan.deng@intel.com>
Date: Mon, 21 Jul 2025 14:10:25 +0800
From: Pan Deng <pan.deng@...el.com>
To: peterz@...radead.org,
mingo@...nel.org
Cc: linux-kernel@...r.kernel.org,
tianyou.li@...el.com,
tim.c.chen@...ux.intel.com,
yu.c.chen@...el.com,
pan.deng@...el.com
Subject: [PATCH v2 3/4] sched/rt: Split root_domain->rto_count to per-NUMA-node counters
When running a multi-instance FFmpeg workload on an HCC (high core count)
system, significant contention is observed on the root_domain `rto_count`
and `overloaded` fields.
The SUT is a 2-socket machine with 240 physical cores and 480 logical
CPUs. 60 FFmpeg instances are launched, each pinned to 4 physical cores
(8 logical CPUs) for transcoding tasks. Sub-threads use RT priority 99
with FIFO scheduling. FPS is used as the score.
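For reference, each instance can be launched roughly like this
(illustrative only: the CPU lists and ffmpeg arguments below are
assumptions, not part of the measurement setup described here):

  # hypothetical pinning of one instance: 4 physical cores,
  # i.e. 8 logical CPUs (SMT siblings) on the 2-socket SUT
  taskset -c 0-3,240-243 chrt --fifo 99 ffmpeg -i in.mp4 -c:v libx264 out.mp4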
The perf c2c tool reveals, for root_domain cache line 1:
- `rto_count` (offset 0x4) is frequently loaded and stored
- `overloaded` (offset 0x28) is heavily loaded
- cycles per load range from ~2.8K to ~44K
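Both hot offsets land in the first 64-byte cache line of root_domain.
Assuming cpumask_var_t is a plain pointer (CONFIG_CPUMASK_OFFSTACK=n) on
x86_64, the layout works out roughly as in this sketch, with the field at
0x28 being `overloaded` in the profiled tree:

  struct root_domain {
  	atomic_t	refcount;	/* 0x00              */
  	atomic_t	rto_count;	/* 0x04 <- hot       */
  	struct rcu_head	rcu;		/* 0x08, 16 bytes    */
  	cpumask_var_t	span;		/* 0x18              */
  	cpumask_var_t	online;		/* 0x20              */
  	/* ... 0x28 ('overloaded') <- hot, same line   */
  };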
A separate patch rearranges root_domain to place `overloaded` on a
different cache line, but that alone is insufficient to resolve the
contention on `rto_count`. As a complement, this patch splits
`rto_count` into per-NUMA-node counters to reduce the contention.
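Conceptually, each node gets its own counter, allocated from its own
memory: writers only bounce the local node's cache line, and readers sum
the per-node counters starting from the local node. A minimal sketch of
the scheme (names match the diff below):

  /* writer side, e.g. rt_set_overload(): touch only the local node */
  atomic_inc(rq->rd->rto_counts[cpu_to_node(rq->cpu)]);

  /* reader side, e.g. rt_overloaded(): callers only distinguish
   * 0, 1, or >1, so stop summing early */
  int count = 0;
  for (int i = 0; i < nr_node_ids; i++) {
  	int nid = (numa_node_id() + i) % nr_node_ids;
  	count += atomic_read(rq->rd->rto_counts[nid]);
  	if (count > 1)
  		break;
  }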
With this change:
- FPS improves by ~4%
- Kernel cycles% drops from ~20% to ~18.6%
- The cache line no longer appears in the perf c2c report
- The stress-ng cyclic benchmark improves by ~50.4%; command:
  stress-ng/stress-ng --cyclic $(nproc) --cyclic-policy fifo \
  --timeout 30 --minimize --metrics
- rt-tests/pi_stress improves by ~5.1%; command:
  rt-tests/pi_stress -D 30 -g $(($(nproc) / 2))
Appendix:
1. Perf c2c report of root_domain cache line 1:
 -------  -------  -------  ------  ------  -------  ------------------------
     Rmt      Lcl    Store    Data    Load    Total  Symbol
   Hitm%    Hitm%  L1 Hit%  offset  cycles  records
 -------  -------  -------  ------  ------  -------  ------------------------
     231       43       48  0xff14d42c400e3800
 -------  -------  -------  ------  ------  -------  ------------------------
  22.51%   18.60%    0.00%     0x4    5041      247  pull_rt_task
   5.63%    2.33%   45.83%     0x4    6995      315  dequeue_pushable_task
   3.90%    4.65%   54.17%     0x4    6587      370  enqueue_pushable_task
   0.43%    0.00%    0.00%     0x4   17111        4  enqueue_pushable_task
   0.43%    0.00%    0.00%     0x4   44062        4  dequeue_pushable_task
  32.03%   27.91%    0.00%    0x28    6393      285  enqueue_task_rt
  16.45%   27.91%    0.00%    0x28    5534      139  sched_balance_newidle
  14.72%   18.60%    0.00%    0x28    5287      110  dequeue_task_rt
   3.46%    0.00%    0.00%    0x28    2820       25  enqueue_task_fair
   0.43%    0.00%    0.00%    0x28     220        3  enqueue_task_stop
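(A report like the above can be collected roughly as follows; the exact
options are an assumption, not the authors' recorded command:)

  perf c2c record -a -- sleep 30
  perf c2c report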
Signed-off-by: Pan Deng <pan.deng@...el.com>
Reviewed-by: Tianyou Li <tianyou.li@...el.com>
Reviewed-by: Chen Yu <yu.c.chen@...el.com>
---
V1 -> V2: Fixed non-CONFIG_SMP build issue
---
 kernel/sched/rt.c       | 56 ++++++++++++++++++++++++++++++++++++++---
 kernel/sched/sched.h    |  9 ++++++-
 kernel/sched/topology.c |  7 ++++++
 3 files changed, 68 insertions(+), 4 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e40422c37033..cbcfd3aa3439 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -337,9 +337,58 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 	return rq->online && rq->rt.highest_prio.curr > prev->prio;
 }
 
+int rto_counts_init(atomic_tp **rto_counts)
+{
+	int i;
+	atomic_tp *counts = kcalloc(nr_node_ids, sizeof(atomic_tp), GFP_KERNEL);
+
+	if (!counts)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		counts[i] = kzalloc_node(sizeof(atomic_t), GFP_KERNEL, i);
+
+		if (!counts[i])
+			goto cleanup;
+	}
+
+	*rto_counts = counts;
+	return 0;
+
+cleanup:
+	while (i--)
+		kfree(counts[i]);
+
+	kfree(counts);
+	return -ENOMEM;
+}
+
+void rto_counts_cleanup(atomic_tp *rto_counts)
+{
+	for (int i = 0; i < nr_node_ids; i++)
+		kfree(rto_counts[i]);
+
+	kfree(rto_counts);
+}
+
 static inline int rt_overloaded(struct rq *rq)
 {
-	return atomic_read(&rq->rd->rto_count);
+	int count = 0;
+	int cur_node, nid;
+
+	cur_node = numa_node_id();
+
+	for (int i = 0; i < nr_node_ids; i++) {
+		nid = (cur_node + i) % nr_node_ids;
+		count += atomic_read(rq->rd->rto_counts[nid]);
+
+		/* Callers only check for 0 or 1, so return once count > 1 */
+		if (count > 1)
+			return count;
+	}
+
+	return count;
 }
 
 static inline void rt_set_overload(struct rq *rq)
@@ -358,7 +407,7 @@ static inline void rt_set_overload(struct rq *rq)
 	 * Matched by the barrier in pull_rt_task().
 	 */
 	smp_wmb();
-	atomic_inc(&rq->rd->rto_count);
+	atomic_inc(rq->rd->rto_counts[cpu_to_node(rq->cpu)]);
 }
 
 static inline void rt_clear_overload(struct rq *rq)
@@ -367,7 +416,7 @@ static inline void rt_clear_overload(struct rq *rq)
 		return;
 
 	/* the order here really doesn't matter */
-	atomic_dec(&rq->rd->rto_count);
+	atomic_dec(rq->rd->rto_counts[cpu_to_node(rq->cpu)]);
 	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
@@ -443,6 +492,7 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 static inline void rt_queue_push_tasks(struct rq *rq)
 {
 }
+
 #endif /* CONFIG_SMP */
 
 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index bc67806911f2..13fc3ac3381b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -953,6 +953,8 @@ struct perf_domain {
 	struct rcu_head rcu;
 };
 
+typedef atomic_t *atomic_tp;
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -963,12 +965,15 @@ struct perf_domain {
  */
 struct root_domain {
 	atomic_t		refcount;
-	atomic_t		rto_count;
 	struct rcu_head		rcu;
 	cpumask_var_t		span;
 	cpumask_var_t		online;
 	atomic_t		dlo_count;
+
+	/* rto_count per node */
+	atomic_tp		*rto_counts;
+
 	struct dl_bw		dl_bw;
 	struct cpudl		cpudl;
@@ -1030,6 +1035,8 @@ extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 extern void sched_get_rd(struct root_domain *rd);
 extern void sched_put_rd(struct root_domain *rd);
+extern int rto_counts_init(atomic_tp **rto_counts);
+extern void rto_counts_cleanup(atomic_tp *rto_counts);
 
 static inline int get_rd_overloaded(struct root_domain *rd)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b958fe48e020..166dc8177a44 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -457,6 +457,7 @@ static void free_rootdomain(struct rcu_head *rcu)
 {
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 
+	rto_counts_cleanup(rd->rto_counts);
 	cpupri_cleanup(&rd->cpupri);
 	cpudl_cleanup(&rd->cpudl);
 	free_cpumask_var(rd->dlo_mask);
@@ -549,8 +550,14 @@ static int init_rootdomain(struct root_domain *rd)
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
 
+
+	if (rto_counts_init(&rd->rto_counts) != 0)
+		goto free_cpupri;
+
 	return 0;
 
+free_cpupri:
+	cpupri_cleanup(&rd->cpupri);
 free_cpudl:
 	cpudl_cleanup(&rd->cpudl);
 free_rto_mask:
--
2.43.5