Message-ID: <Z62q4Zuh7ry9tH3L@gpd3>
Date: Thu, 13 Feb 2025 09:18:41 +0100
From: Andrea Righi <arighi@...dia.com>
To: Tejun Heo <tj@...nel.org>
Cc: Peter Zijlstra <peterz@...radead.org>,
	David Vernet <void@...ifault.com>,
	Changwoo Min <changwoo@...lia.com>, Neel Natu <neelnatu@...gle.com>,
	Barret Rhoden <brho@...gle.com>, linux-kernel@...r.kernel.org,
	kernel-team@...a.com, sched-ext@...a.com
Subject: Re: [PATCH sched_ext/for-6.15] sched_ext: Implement
 SCX_OPS_ALLOW_QUEUED_WAKEUP

Hi Tejun,

On Wed, Feb 12, 2025 at 01:08:31PM -1000, Tejun Heo wrote:
> A task wakeup can either be processed on the waker's CPU or bounced to the
> wakee's previous CPU using an IPI (ttwu_queue). Bouncing to the wakee's CPU
> avoids the waker's CPU locking and accessing the wakee's rq, which can be
> expensive across cache and node boundaries.
> 
> When the ttwu_queue path is taken, select_task_rq() and thus ops.select_cpu()
> are skipped. This confused some BPF schedulers: there was no good way for a
> BPF scheduler to tell whether idle CPU selection had been skipped, and
> ops.enqueue() couldn't insert tasks into foreign local DSQs. As the
> performance difference on machines with simple topologies was also minimal,
> sched_ext disabled ttwu_queue.
> 
> However, this optimization makes a noticeable difference on more complex
> topologies, and a BPF scheduler now has an easy way to tell whether
> ops.select_cpu() was skipped since 9b671793c7d9 ("sched_ext, scx_qmap: Add
> and use SCX_ENQ_CPU_SELECTED") and can insert tasks into foreign local DSQs
> since 5b26f7b920f7 ("sched_ext: Allow SCX_DSQ_LOCAL_ON for direct
> dispatches").
> 
> Implement SCX_OPS_ALLOW_QUEUED_WAKEUP, which allows a BPF scheduler to opt
> into the ttwu_queue optimization.
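
A scheduler opting in would then look something like this (sketch, names
hypothetical):

	SEC(".struct_ops.link")
	struct sched_ext_ops myscheduler_ops = {
		.enqueue	= (void *)myscheduler_enqueue,
		.flags		= SCX_OPS_ALLOW_QUEUED_WAKEUP,
		.name		= "myscheduler",
	};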

I'm wondering whether it makes sense to introduce a new SCX_OPS flag for
this, considering that we already have the TTWU_QUEUE sched feature, which
determines this behavior.
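
For example, with CONFIG_SCHED_DEBUG the existing feature can already be
flipped globally at runtime:

	echo NO_TTWU_QUEUE > /sys/kernel/debug/sched/features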

Is this in anticipation of a future scenario where we may have multiple scx
schedulers running at the same time, each potentially wanting a different
queued-wakeup behavior?

Thanks,
-Andrea

> 
> Signed-off-by: Tejun Heo <tj@...nel.org>
> Reported-by: Neel Natu <neelnatu@...gle.com>
> Reported-by: Barret Rhoden <brho@...gle.com>
> Cc: Peter Zijlstra (Intel) <peterz@...radead.org>
> ---
>  kernel/sched/core.c |    9 ++-------
>  kernel/sched/ext.c  |   30 ++++++++++++++++++++++++------
>  kernel/sched/ext.h  |   10 ++++++++++
>  3 files changed, 36 insertions(+), 13 deletions(-)
> 
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3921,13 +3921,8 @@ bool cpus_share_resources(int this_cpu,
>  
>  static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
>  {
> -	/*
> -	 * The BPF scheduler may depend on select_task_rq() being invoked during
> -	 * wakeups. In addition, @p may end up executing on a different CPU
> -	 * regardless of what happens in the wakeup path making the ttwu_queue
> -	 * optimization less meaningful. Skip if on SCX.
> -	 */
> -	if (task_on_scx(p))
> +	/* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
> +	if (!scx_allow_ttwu_queue(p))
>  		return false;
>  
>  	/*
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -96,7 +96,7 @@ enum scx_ops_flags {
>  	/*
>  	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
>  	 */
> -	SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
> +	SCX_OPS_KEEP_BUILTIN_IDLE	= 1LLU << 0,
>  
>  	/*
>  	 * By default, if there are no other task to run on the CPU, ext core
> @@ -104,7 +104,7 @@ enum scx_ops_flags {
>  	 * flag is specified, such tasks are passed to ops.enqueue() with
>  	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
>  	 */
> -	SCX_OPS_ENQ_LAST	= 1LLU << 1,
> +	SCX_OPS_ENQ_LAST		= 1LLU << 1,
>  
>  	/*
>  	 * An exiting task may schedule after PF_EXITING is set. In such cases,
> @@ -117,13 +117,13 @@ enum scx_ops_flags {
>  	 * depend on pid lookups and wants to handle these tasks directly, the
>  	 * following flag can be used.
>  	 */
> -	SCX_OPS_ENQ_EXITING	= 1LLU << 2,
> +	SCX_OPS_ENQ_EXITING		= 1LLU << 2,
>  
>  	/*
>  	 * If set, only tasks with policy set to SCHED_EXT are attached to
>  	 * sched_ext. If clear, SCHED_NORMAL tasks are also included.
>  	 */
> -	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,
> +	SCX_OPS_SWITCH_PARTIAL		= 1LLU << 3,
>  
>  	/*
>  	 * A migration disabled task can only execute on its current CPU. By
> @@ -136,7 +136,21 @@ enum scx_ops_flags {
>  	 * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
>  	 * and thus may disagree with cpumask_weight(p->cpus_ptr).
>  	 */
> -	SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
> +	SCX_OPS_ENQ_MIGRATION_DISABLED	= 1LLU << 4,
> +
> +	/*
> +	 * Queued wakeup (ttwu_queue) is an optimization during wakeups which
> +	 * bypasses ops.select_cpu() and invokes ops.enqueue() on the wakee's
> +	 * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
> +	 * transfers. As the BPF scheduler may depend on ops.select_cpu() being
> +	 * invoked during wakeups, queued wakeup is disabled by default.
> +	 *
> +	 * If this ops flag is set, queued wakeup optimization is enabled and
> +	 * the BPF scheduler must be able to handle ops.enqueue() invoked on the
> +	 * wakee's CPU without preceding ops.select_cpu() even for tasks which
> +	 * may be executed on multiple CPUs.
> +	 */
> +	SCX_OPS_ALLOW_QUEUED_WAKEUP	= 1LLU << 5,
>  
>  	/*
>  	 * CPU cgroup support flags
> @@ -147,6 +161,7 @@ enum scx_ops_flags {
>  				  SCX_OPS_ENQ_LAST |
>  				  SCX_OPS_ENQ_EXITING |
>  				  SCX_OPS_ENQ_MIGRATION_DISABLED |
> +				  SCX_OPS_ALLOW_QUEUED_WAKEUP |
>  				  SCX_OPS_SWITCH_PARTIAL |
>  				  SCX_OPS_HAS_CGROUP_WEIGHT,
>  };
> @@ -897,6 +912,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_a
>  static struct sched_ext_ops scx_ops;
>  static bool scx_warned_zero_slice;
>  
> +DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
>  static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
>  static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
>  static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
> @@ -4717,6 +4733,7 @@ static void scx_ops_disable_workfn(struc
>  	static_branch_disable(&__scx_ops_enabled);
>  	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
>  		static_branch_disable(&scx_has_op[i]);
> +	static_branch_disable(&scx_ops_allow_queued_wakeup);
>  	static_branch_disable(&scx_ops_enq_last);
>  	static_branch_disable(&scx_ops_enq_exiting);
>  	static_branch_disable(&scx_ops_enq_migration_disabled);
> @@ -5348,9 +5365,10 @@ static int scx_ops_enable(struct sched_e
>  		if (((void (**)(void))ops)[i])
>  			static_branch_enable(&scx_has_op[i]);
>  
> +	if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
> +		static_branch_enable(&scx_ops_allow_queued_wakeup);
>  	if (ops->flags & SCX_OPS_ENQ_LAST)
>  		static_branch_enable(&scx_ops_enq_last);
> -
>  	if (ops->flags & SCX_OPS_ENQ_EXITING)
>  		static_branch_enable(&scx_ops_enq_exiting);
>  	if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
> --- a/kernel/sched/ext.h
> +++ b/kernel/sched/ext.h
> @@ -8,6 +8,8 @@
>   */
>  #ifdef CONFIG_SCHED_CLASS_EXT
>  
> +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
> +
>  void scx_tick(struct rq *rq);
>  void init_scx_entity(struct sched_ext_entity *scx);
>  void scx_pre_fork(struct task_struct *p);
> @@ -34,6 +36,13 @@ static inline bool task_on_scx(const str
>  	return scx_enabled() && p->sched_class == &ext_sched_class;
>  }
>  
> +static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
> +{
> +	return !scx_enabled() ||
> +		static_branch_likely(&scx_ops_allow_queued_wakeup) ||
> +		p->sched_class != &ext_sched_class;
> +}
> +
>  #ifdef CONFIG_SCHED_CORE
>  bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
>  		   bool in_fi);
> @@ -52,6 +61,7 @@ static inline void scx_rq_activate(struc
>  static inline void scx_rq_deactivate(struct rq *rq) {}
>  static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
>  static inline bool task_on_scx(const struct task_struct *p) { return false; }
> +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
>  static inline void init_sched_ext_class(void) {}
>  
>  #endif	/* CONFIG_SCHED_CLASS_EXT */
