Message-ID: <20250602180544.3626909-3-zecheng@google.com>
Date: Mon,  2 Jun 2025 18:05:42 +0000
From: Zecheng Li <zecheng@...gle.com>
To: Ingo Molnar <mingo@...hat.com>, Peter Zijlstra <peterz@...radead.org>, 
	Juri Lelli <juri.lelli@...hat.com>, Vincent Guittot <vincent.guittot@...aro.org>
Cc: Dietmar Eggemann <dietmar.eggemann@....com>, Steven Rostedt <rostedt@...dmis.org>, 
	Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, 
	Valentin Schneider <vschneid@...hat.com>, Xu Liu <xliuprof@...gle.com>, 
	Blake Jones <blakejones@...gle.com>, Josh Don <joshdon@...gle.com>, 
	Madadi Vineeth Reddy <vineethr@...ux.ibm.com>, linux-kernel@...r.kernel.org, 
	Zecheng Li <zecheng@...gle.com>
Subject: [RFC PATCH v2 2/3] sched/fair: Reorder struct cfs_rq

Hot fields are moved to the head of the struct, for a total of 128
bytes, which corresponds to two cache lines on x86. With all related
CONFIG options enabled, the patch also moves fields originally located
around the 4th and 5th cache lines to provide better locality when
executing the CFS bandwidth control functions. Because holes in the
struct are removed, its size shrinks by one cache line on an x86
system.
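
The resulting layout can be inspected with pahole, for example
(illustrative invocation, assuming a vmlinux built with DWARF debug
info):

  $ pahole -C cfs_rq vmlinux

which prints the per-member offsets, holes and cacheline boundaries,
so the one-cacheline size reduction is directly visible.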

The following changes are proposed:

- Move `curr`, `rq`, `tg`, `throttle_count`, and `runtime_enabled` to
the first cache line, as they are frequently accessed (and mostly
read). They are either pointers to closely related structs (`rq`,
`tg`) or fields checked as conditions (`curr`, `throttle_count` and
`runtime_enabled`).

- `propagate` and `idle`, two frequently read fields, were previously
placed on separate cache lines. Group them in cache line 2, together
with the remaining fields from the old cache line 1, to fill the hole.

- `on_list` is often accessed together with the `throttled_clock*`
fields in the `tg_unthrottle_up()` and `tg_throttle_down()` functions.
Move the less frequently accessed `runtime_remaining` and
`throttled_pelt_idle` out of this group so that `on_list` and the
throttle-related fields can be grouped together. This group is aligned
to a 64-byte boundary only when the target architecture uses a 64-byte
cache line size.

- Use the `__cacheline_group_*` macros to delineate the logically
grouped fields for cache alignment, with compile-time checks added in
`cfs_rq_struct_check()` (a standalone sketch of the underlying idea
follows below).
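
For reference, the idea behind the group markers and the compile-time
checks can be illustrated with a small standalone sketch (plain
userspace C with hypothetical names, not the actual kernel macros):
zero-size marker members delimit the group, and offsetof()-based
assertions verify that each listed member lies between the markers.

/*
 * Standalone illustration (userspace C, hypothetical names): zero-size
 * marker members delimit a field group, and offsetof()-based
 * compile-time assertions check that each member lies inside it.
 */
#include <assert.h>
#include <stddef.h>

struct example {
	char before;
	char grp_begin[0];	/* marker: start of the group */
	long a;
	long b;
	char grp_end[0];	/* marker: end of the group */
	char after;
};

#define ASSERT_IN_GROUP(type, member)					\
	static_assert(offsetof(type, member) >= offsetof(type, grp_begin) && \
		      offsetof(type, member) + sizeof(((type *)0)->member) <= \
		      offsetof(type, grp_end),				\
		      #member " must sit inside the marked group")

ASSERT_IN_GROUP(struct example, a);
ASSERT_IN_GROUP(struct example, b);

int main(void)
{
	return 0;
}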

Signed-off-by: Zecheng Li <zecheng@...gle.com>
---
 kernel/sched/core.c  | 61 ++++++++++++++++++++++++++++++++-
 kernel/sched/sched.h | 81 ++++++++++++++++++++++++++++++--------------
 2 files changed, 115 insertions(+), 27 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c81cf642dba0..ba89cd4f2fac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8524,6 +8524,8 @@ LIST_HEAD(task_groups);
 static struct kmem_cache *task_group_cache __ro_after_init;
 #endif
 
+static void __init cfs_rq_struct_check(void);
+
 void __init sched_init(void)
 {
 	unsigned long ptr = 0;
@@ -8540,7 +8542,7 @@ void __init sched_init(void)
 	BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
 	BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
 #endif
-
+	cfs_rq_struct_check();
 	wait_bit_init();
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -10746,3 +10748,60 @@ void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
 		set_next_task(rq, ctx->p);
 }
 #endif	/* CONFIG_SCHED_CLASS_EXT */
+
+static void __init cfs_rq_struct_check(void)
+{
+	/*
+	 * The first two cache lines are hot and mostly read
+	 * except load.inv_weight
+	 */
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, load);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, nr_queued);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, h_nr_queued);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, h_nr_runnable);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, h_nr_idle);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, curr);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, rq);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, tg);
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, throttle_count);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, runtime_enabled);
+#endif
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, idle);
+
+#ifdef CONFIG_SMP
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, propagate);
+#endif
+#endif
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, avg_vruntime);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, avg_load);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, min_vruntime);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, tasks_timeline);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, hot, next);
+
+	/*
+	 * This cache line groups hot fields of the throttling functions.
+	 * This group is enabled when CFS_BANDWIDTH is configured.
+	 */
+#ifdef CONFIG_CFS_BANDWIDTH
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, throttle, throttled);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, throttle, on_list);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, throttle,
+				      leaf_cfs_rq_list);
+
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, throttle, throttled_clock);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, throttle,
+				      throttled_clock_pelt);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, throttle,
+				      throttled_clock_pelt_time);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, throttle,
+				      throttled_clock_self);
+	CACHELINE_ASSERT_GROUP_MEMBER(struct cfs_rq, throttle,
+				      throttled_clock_self_time);
+#endif
+#endif
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47972f34ea70..b0a6c70c01ea 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -641,31 +641,55 @@ struct balance_callback {
 	void (*func)(struct rq *rq);
 };
 
+/**
+ * The `throttle` cache group is designed to group 64 bytes into a cache
+ * line, which benefits architectures with a 64-byte cache line size. To
+ * prevent performance degradation on other architectures, let's
+ * conditionally align it when the target system utilizes a 64-byte
+ * cache line.
+ */
+#define THROTTLE_GROUP_ALIGN_COND 64
+
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
+	/* The first cache line group is hot and mostly read */
+	__cacheline_group_begin(hot);
 	struct load_weight	load;
 	unsigned int		nr_queued;
 	unsigned int		h_nr_queued;       /* SCHED_{NORMAL,BATCH,IDLE} */
 	unsigned int		h_nr_runnable;     /* SCHED_{NORMAL,BATCH,IDLE} */
 	unsigned int		h_nr_idle; /* SCHED_IDLE */
+	/*
+	 * 'curr' points to currently running entity on this cfs_rq.
+	 * It is set to NULL otherwise (i.e when none are currently running).
+	 */
+	struct sched_entity	*curr;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */
+	struct task_group	*tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_CFS_BANDWIDTH
+	int			throttle_count;
+	int			runtime_enabled;
+#endif
+	/* Locally cached copy of our task_group's idle value */
+	int			idle;
+
+#ifdef CONFIG_SMP
+	long			propagate;
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
 
 	s64			avg_vruntime;
 	u64			avg_load;
 
 	u64			min_vruntime;
-#ifdef CONFIG_SCHED_CORE
-	unsigned int		forceidle_seq;
-	u64			min_vruntime_fi;
-#endif
 
 	struct rb_root_cached	tasks_timeline;
 
-	/*
-	 * 'curr' points to currently running entity on this cfs_rq.
-	 * It is set to NULL otherwise (i.e when none are currently running).
-	 */
-	struct sched_entity	*curr;
 	struct sched_entity	*next;
+	__cacheline_group_end(hot);
 
 #ifdef CONFIG_SMP
 	/*
@@ -686,7 +710,6 @@ struct cfs_rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	u64			last_update_tg_load_avg;
 	unsigned long		tg_load_avg_contrib;
-	long			propagate;
 	long			prop_runnable_sum;
 
 	/*
@@ -702,8 +725,21 @@ struct cfs_rq {
 #endif /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-	struct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached */
-
+#ifdef CONFIG_CFS_BANDWIDTH
+	s64			runtime_remaining;
+	u64			throttled_pelt_idle;
+#ifndef CONFIG_64BIT
+	u64                     throttled_pelt_idle_copy;
+#endif
+	/*
+	 * This cache line groups hot fields of the throttling functions.
+	 * This group is enabled when CFS_BANDWIDTH is configured.
+	 * Alignment is enforced only when the target architecture
+	 * utilizes a 64-byte cache line size.
+	 */
+	__cacheline_group_begin_aligned_cond(throttle, THROTTLE_GROUP_ALIGN_COND);
+	int			throttled;
+#endif /* CONFIG_CFS_BANDWIDTH */
 	/*
 	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
 	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
@@ -714,30 +750,23 @@ struct cfs_rq {
 	 */
 	int			on_list;
 	struct list_head	leaf_cfs_rq_list;
-	struct task_group	*tg;	/* group that "owns" this runqueue */
-
-	/* Locally cached copy of our task_group's idle value */
-	int			idle;
-
 #ifdef CONFIG_CFS_BANDWIDTH
-	int			runtime_enabled;
-	s64			runtime_remaining;
-
-	u64			throttled_pelt_idle;
-#ifndef CONFIG_64BIT
-	u64                     throttled_pelt_idle_copy;
-#endif
 	u64			throttled_clock;
 	u64			throttled_clock_pelt;
 	u64			throttled_clock_pelt_time;
 	u64			throttled_clock_self;
 	u64			throttled_clock_self_time;
-	int			throttled;
-	int			throttle_count;
+	__cacheline_group_end_aligned_cond(throttle, THROTTLE_GROUP_ALIGN_COND);
+
 	struct list_head	throttled_list;
 	struct list_head	throttled_csd_list;
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
+
+#ifdef CONFIG_SCHED_CORE
+	unsigned int		forceidle_seq;
+	u64			min_vruntime_fi;
+#endif
 };
 
 #ifdef CONFIG_SCHED_CLASS_EXT
-- 
2.49.0
