Message-ID: <20251028112901.GL3245006@noisy.programming.kicks-ass.net>
Date: Tue, 28 Oct 2025 12:29:01 +0100
From: Peter Zijlstra <peterz@...radead.org>
To: Blake Jones <blakejones@...gle.com>
Cc: Ingo Molnar <mingo@...hat.com>, Juri Lelli <juri.lelli@...hat.com>,
Vincent Guittot <vincent.guittot@...aro.org>,
Madadi Vineeth Reddy <vineethr@...ux.ibm.com>,
Josh Don <joshdon@...gle.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Dietmar Eggemann <dietmar.eggemann@....com>,
Steven Rostedt <rostedt@...dmis.org>,
Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>,
Valentin Schneider <vschneid@...hat.com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v3] sched: reorder some fields in struct rq

On Tue, Oct 28, 2025 at 01:03:48AM -0700, Blake Jones wrote:
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index e7718f12bc55..e9f71a09a5d8 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1114,28 +1114,47 @@ DECLARE_STATIC_KEY_FALSE(sched_uclamp_used);
> * acquire operations must be ordered by ascending &runqueue.
> */
> struct rq {
> - /* runqueue lock: */
> - raw_spinlock_t __lock;
> -
> - /* Per class runqueue modification mask; bits in class order. */
> - unsigned int queue_mask;
queue_mask is modified right along with nr_running -- on every
enqueue/dequeue. It seems weird to move it away from nr_running.
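For illustration (simplified sketch, not the actual core.c / class
layering; sched_class_bit() is a made-up helper for the class bit):

	static void enqueue_task_sketch(struct rq *rq, struct task_struct *p, int flags)
	{
		/* the class enqueue ends up calling add_nr_running(), i.e. rq->nr_running++ */
		p->sched_class->enqueue_task(rq, p, flags);

		/* hypothetical helper: record which class modified its queue */
		rq->queue_mask |= sched_class_bit(p->sched_class);
	}

Both words get written on the exact same path, so splitting them over two
cache lines dirties a second line for nothing.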
> + /*
> + * The following members are loaded together from pick_next_task(),
> + * and should be on an isolated cache line to avoid cache pollution.
> + * They are loaded much more often than they are stored.
> + */
> unsigned int nr_running;
> #ifdef CONFIG_NUMA_BALANCING
> unsigned int nr_numa_running;
> unsigned int nr_preferred_running;
> - unsigned int numa_migrate_on;
> #endif
> + unsigned int ttwu_pending;
> + unsigned long cpu_capacity;
> +#ifdef CONFIG_SCHED_PROXY_EXEC
> + struct task_struct __rcu *donor; /* Scheduling context */
> + struct task_struct __rcu *curr; /* Execution context */
> +#else
> + union {
> + struct task_struct __rcu *donor; /* Scheduler context */
> + struct task_struct __rcu *curr; /* Execution context */
> + };
> +#endif
> + struct task_struct *idle;
> + /* padding left here deliberately */
> +
> + /*
> + * The next cacheline holds the (hot) runqueue lock, as well as
> + * some other less performance-critical fields.
> + */
> + u64 nr_switches ____cacheline_aligned;
> +
> + /* runqueue lock: */
> + raw_spinlock_t __lock;
Does it not make more sense to put the lock in the same cacheline as the
clock fields?
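Something like the below (just a sketch of the idea, field set trimmed);
the clock is only ever updated with the rq lock held, so the two tend to
be touched together anyway:

	/* sketch: the lock shares a line with the clock words it serializes */
	raw_spinlock_t		__lock ____cacheline_aligned;
	unsigned int		clock_update_flags;
	u64			clock;
	u64			clock_task;
	u64			clock_pelt;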
> +
> #ifdef CONFIG_NO_HZ_COMMON
> - unsigned long last_blocked_load_update_tick;
> - unsigned int has_blocked_load;
> - call_single_data_t nohz_csd;
> unsigned int nohz_tick_stopped;
> atomic_t nohz_flags;
> + unsigned int has_blocked_load;
> + unsigned long last_blocked_load_update_tick;
> + call_single_data_t nohz_csd;
> #endif /* CONFIG_NO_HZ_COMMON */
>
> - unsigned int ttwu_pending;
> - u64 nr_switches;
> -
> #ifdef CONFIG_UCLAMP_TASK
> /* Utilization clamp values based on CPU's RUNNABLE tasks */
> struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
> @@ -1158,6 +1177,9 @@ struct rq {
> struct list_head *tmp_alone_branch;
> #endif /* CONFIG_FAIR_GROUP_SCHED */
>
> +#ifdef CONFIG_NUMA_BALANCING
> + unsigned int numa_migrate_on;
> +#endif
> /*
> * This is part of a global counter where only the total sum
> * over all CPUs matters. A task can increase this counter on
> @@ -1166,36 +1188,33 @@ struct rq {
> */
> unsigned long nr_uninterruptible;
>
> -#ifdef CONFIG_SCHED_PROXY_EXEC
> - struct task_struct __rcu *donor; /* Scheduling context */
> - struct task_struct __rcu *curr; /* Execution context */
> -#else
> - union {
> - struct task_struct __rcu *donor; /* Scheduler context */
> - struct task_struct __rcu *curr; /* Execution context */
> - };
> -#endif
> struct sched_dl_entity *dl_server;
> - struct task_struct *idle;
> struct task_struct *stop;
> unsigned long next_balance;
> struct mm_struct *prev_mm;
>
> - unsigned int clock_update_flags;
> - u64 clock;
> - /* Ensure that all clocks are in the same cache line */
> + /* Per class runqueue modification mask; bits in class order. */
> + unsigned int queue_mask;
> +
> + atomic_t nr_iowait;
nr_iowait has cross-CPU updates and would be subject to false sharing.
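The decrement happens from whatever CPU does the wakeup; roughly the
try_to_wake_up() pattern (from memory, not the exact current code):

	if (p->in_iowait) {
		delayacct_blkio_end(p);
		/* task_rq(p) is typically remote here -> cross-CPU write */
		atomic_dec(&task_rq(p)->nr_iowait);
	}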
> +
> + /*
> + * The following fields of clock data are frequently referenced
> + * and updated together, and should go on their own cache line.
> + */
> u64 clock_task ____cacheline_aligned;
> u64 clock_pelt;
> + u64 clock;
> unsigned long lost_idle_time;
> + unsigned int clock_update_flags;
> u64 clock_pelt_idle;
> u64 clock_idle;
> +
> #ifndef CONFIG_64BIT
> u64 clock_pelt_idle_copy;
> u64 clock_idle_copy;
> #endif
>
> - atomic_t nr_iowait;
> -
> u64 last_seen_need_resched_ns;
> int ticks_without_resched;
Does the thing want a __no_randomize_layout?
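That is, something like (untested), so RANDSTRUCT can't undo the careful
placement:

	struct rq {
		/* carefully ordered fields ... */
	} __no_randomize_layout;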