Message-ID: <20251028112901.GL3245006@noisy.programming.kicks-ass.net>
Date: Tue, 28 Oct 2025 12:29:01 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Blake Jones <blakejones@...gle.com>
Cc: Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
	Vincent Guittot <vincent.guittot@...aro.org>,
	Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
	Josh Don <joshdon@...gle.com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Dietmar Eggemann <dietmar.eggemann@....com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
	Valentin Schneider <vschneid@...hat.com>,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3] sched: reorder some fields in struct rq

On Tue, Oct 28, 2025 at 01:03:48AM -0700, Blake Jones wrote:

> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e7718f12bc55..e9f71a09a5d8 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1114,28 +1114,47 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
>   * acquire operations must be ordered by ascending &runqueue.
>   */
>  struct rq {
> -	/* runqueue lock: */
> -	raw_spinlock_t		__lock;
> -
> -	/* Per class runqueue modification mask; bits in class order. */
> -	unsigned int		queue_mask;

queue_mask is modified right along with nr_running -- on every
enqueue/dequeue. It seems weird to move it away from nr_running.
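
Something like this userspace toy shows what the move costs (field
subset and sizes are made up, 64-byte lines assumed): the two counters
that are written together no longer share a line, so every
enqueue/dequeue dirties two lines instead of one.

	#include <stddef.h>
	#include <stdio.h>

	struct toy_rq {
		/* hot pick/enqueue line */
		unsigned int	nr_running;
		unsigned int	ttwu_pending;
		unsigned long	cpu_capacity;

		/* stand-in for the intervening read-mostly fields */
		char		other[120];

		/* written on every enqueue/dequeue, like nr_running */
		unsigned int	queue_mask;
	};

	int main(void)
	{
		printf("nr_running: line %zu, queue_mask: line %zu\n",
		       offsetof(struct toy_rq, nr_running) / 64,
		       offsetof(struct toy_rq, queue_mask) / 64);
		return 0;
	}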

> +	/*
> +	 * The following members are loaded together from pick_next_task(),
> +	 * and should be on an isolated cache line to avoid cache pollution.
> +	 * They are loaded much more often than they are stored.
> +	 */
>  	unsigned int		nr_running;
>  #ifdef CONFIG_NUMA_BALANCING
>  	unsigned int		nr_numa_running;
>  	unsigned int		nr_preferred_running;
> -	unsigned int		numa_migrate_on;
>  #endif
> +	unsigned int		ttwu_pending;
> +	unsigned long		cpu_capacity;
> +#ifdef CONFIG_SCHED_PROXY_EXEC
> +	struct task_struct __rcu	*donor;  /* Scheduling context */
> +	struct task_struct __rcu	*curr;   /* Execution context */
> +#else
> +	union {
> +		struct task_struct __rcu *donor; /* Scheduler context */
> +		struct task_struct __rcu *curr;  /* Execution context */
> +	};
> +#endif
> +	struct task_struct	*idle;
> +	/* padding left here deliberately */
> +
> +	/*
> +	 * The next cacheline holds the (hot) runqueue lock, as well as
> +	 * some other less performance-critical fields.
> +	 */
> +	u64			nr_switches	____cacheline_aligned;
> +
> +	/* runqueue lock: */
> +	raw_spinlock_t		__lock;

Does it not make more sense to put the lock in the same cacheline as the
clock fields?
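
As a layout sketch (not a patch): the clock fields are only written
with the rq lock held -- update_rq_clock() asserts as much -- so they
could live on the line the lock already dirties, e.g.:

	/* hot: lock plus clock data, all written under the lock */
	raw_spinlock_t		__lock ____cacheline_aligned;
	unsigned int		clock_update_flags;
	u64			clock;
	u64			clock_task;
	u64			clock_pelt;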

> +
>  #ifdef CONFIG_NO_HZ_COMMON
> -	unsigned long		last_blocked_load_update_tick;
> -	unsigned int		has_blocked_load;
> -	call_single_data_t	nohz_csd;
>  	unsigned int		nohz_tick_stopped;
>  	atomic_t		nohz_flags;
> +	unsigned int		has_blocked_load;
> +	unsigned long		last_blocked_load_update_tick;
> +	call_single_data_t	nohz_csd;
>  #endif /* CONFIG_NO_HZ_COMMON */
>  
> -	unsigned int		ttwu_pending;
> -	u64			nr_switches;
> -
>  #ifdef CONFIG_UCLAMP_TASK
>  	/* Utilization clamp values based on CPU's RUNNABLE tasks */
>  	struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;
> @@ -1158,6 +1177,9 @@ struct rq {
>  	struct list_head	*tmp_alone_branch;
>  #endif /* CONFIG_FAIR_GROUP_SCHED */
>  
> +#ifdef CONFIG_NUMA_BALANCING
> +	unsigned int		numa_migrate_on;
> +#endif
>  	/*
>  	 * This is part of a global counter where only the total sum
>  	 * over all CPUs matters. A task can increase this counter on
> @@ -1166,36 +1188,33 @@ struct rq {
>  	 */
>  	unsigned long 		nr_uninterruptible;
>  
> -#ifdef CONFIG_SCHED_PROXY_EXEC
> -	struct task_struct __rcu	*donor;  /* Scheduling context */
> -	struct task_struct __rcu	*curr;   /* Execution context */
> -#else
> -	union {
> -		struct task_struct __rcu *donor; /* Scheduler context */
> -		struct task_struct __rcu *curr;  /* Execution context */
> -	};
> -#endif
>  	struct sched_dl_entity	*dl_server;
> -	struct task_struct	*idle;
>  	struct task_struct	*stop;
>  	unsigned long		next_balance;
>  	struct mm_struct	*prev_mm;
>  
> -	unsigned int		clock_update_flags;
> -	u64			clock;
> -	/* Ensure that all clocks are in the same cache line */
> +	/* Per class runqueue modification mask; bits in class order. */
> +	unsigned int		queue_mask;
> +
> +	atomic_t		nr_iowait;

nr_iowait has cross-CPU updates and would be subject to false sharing.
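
To see that effect in isolation, a quick userspace sketch (nothing
rq-specific about it; 64-byte lines and made-up counters): two threads
hammering adjacent counters vs. the same counters padded onto separate
lines.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>
	#include <time.h>

	#define ITERS 100000000UL

	/* both counters in one line, like nr_iowait next to hot fields */
	static struct {
		atomic_long a;
		atomic_long b;
	} __attribute__((aligned(64))) same_line;

	/* the same counters forced onto separate lines */
	static struct {
		atomic_long a __attribute__((aligned(64)));
		atomic_long b __attribute__((aligned(64)));
	} separate;

	static void *bump(void *p)
	{
		atomic_long *v = p;
		unsigned long i;

		for (i = 0; i < ITERS; i++)
			atomic_fetch_add_explicit(v, 1, memory_order_relaxed);
		return NULL;
	}

	static double run_pair(atomic_long *x, atomic_long *y)
	{
		struct timespec t0, t1;
		pthread_t a, b;

		clock_gettime(CLOCK_MONOTONIC, &t0);
		pthread_create(&a, NULL, bump, x);
		pthread_create(&b, NULL, bump, y);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		clock_gettime(CLOCK_MONOTONIC, &t1);

		return (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	}

	int main(void)
	{
		printf("same line:      %.2fs\n",
		       run_pair(&same_line.a, &same_line.b));
		printf("separate lines: %.2fs\n",
		       run_pair(&separate.a, &separate.b));
		return 0;
	}

Built with gcc -O2 -pthread and the two threads landing on different
CPUs, the same-line pair should come out noticeably slower; perf c2c
is the usual way to spot the equivalent bouncing on the real struct.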

> +
> +	/*
> +	 * The following fields of clock data are frequently referenced
> +	 * and updated together, and should go on their own cache line.
> +	 */
>  	u64			clock_task ____cacheline_aligned;
>  	u64			clock_pelt;
> +	u64			clock;
>  	unsigned long		lost_idle_time;
> +	unsigned int		clock_update_flags;
>  	u64			clock_pelt_idle;
>  	u64			clock_idle;
> +
>  #ifndef CONFIG_64BIT
>  	u64			clock_pelt_idle_copy;
>  	u64			clock_idle_copy;
>  #endif
>  
> -	atomic_t		nr_iowait;
> -
>  	u64 last_seen_need_resched_ns;
>  	int ticks_without_resched;

Does the thing want a __no_randomize_layout?
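
(Assuming the concern is a CONFIG_RANDSTRUCT build reordering the
fields, that would just be a tag on the closing brace -- sketch:

	struct rq {
		...
	} __no_randomize_layout;

which keeps the declared order and hence the cacheline placement.)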
