Message-ID: <e85f7088-7426-4140-b7bd-1ef8d414199d@igalia.com>
Date: Fri, 14 Feb 2025 16:00:36 +0900
From: Changwoo Min <changwoo@...lia.com>
To: Tejun Heo <tj@...nel.org>, Neel Natu <neelnatu@...gle.com>
Cc: Peter Zijlstra <peterz@...radead.org>, David Vernet <void@...ifault.com>,
Andrea Righi <arighi@...dia.com>, Barret Rhoden <brho@...gle.com>,
linux-kernel@...r.kernel.org, kernel-team@...a.com, sched-ext@...a.com
Subject: Re: [PATCH sched_ext/for-6.15 v2] sched_ext: Implement
SCX_OPS_ALLOW_QUEUED_WAKEUP
Hello Tejun,
Sorry for the delayed response. This makes sense to me.
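
For other scheduler authors following along, opting in is just a matter
of setting the new ops flag. A minimal sketch in the tools/sched_ext
style (the "maybe" scheduler name and its callbacks are made up for
illustration):

  /*
   * Hypothetical scheduler opting into queued wakeups.
   * SCX_OPS_DEFINE() comes from the in-tree tools/sched_ext headers.
   */
  SCX_OPS_DEFINE(maybe_ops,
                 .select_cpu    = (void *)maybe_select_cpu,
                 .enqueue       = (void *)maybe_enqueue,
                 .flags         = SCX_OPS_ALLOW_QUEUED_WAKEUP,
                 .name          = "maybe");
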
Acked-by: Changwoo Min <changwoo@...lia.com>
Regards,
Changwoo Min
On 25. 2. 14. 02:33, Tejun Heo wrote:
> From 3539c6411a7c9d6c5895f78750f93160705cd250 Mon Sep 17 00:00:00 2001
> From: Tejun Heo <tj@...nel.org>
> Date: Wed, 12 Feb 2025 13:08:31 -1000
>
> A task wakeup can either be processed on the waker's CPU or bounced to
> the wakee's previous CPU using an IPI (ttwu_queue). Bouncing to the
> wakee's CPU avoids the waker's CPU locking and accessing the wakee's
> rq, which can be expensive across cache and node boundaries.
>
> When the ttwu_queue path is taken, select_task_rq() and thus
> ops.select_cpu() may be skipped in some cases (racing against the wakee
> switching out). This confused some BPF schedulers: there wasn't a good
> way for a BPF scheduler to tell whether idle CPU selection had been
> skipped, and ops.enqueue() couldn't insert tasks into foreign local
> DSQs. As the performance difference on machines with simple topologies
> was also minimal, sched_ext disabled ttwu_queue.
>
> However, this optimization makes a noticeable difference on more
> complex topologies, and a BPF scheduler now has an easy way to tell
> whether ops.select_cpu() was skipped since 9b671793c7d9 ("sched_ext,
> scx_qmap: Add and use SCX_ENQ_CPU_SELECTED") and can insert tasks into
> foreign local DSQs since 5b26f7b920f7 ("sched_ext: Allow
> SCX_DSQ_LOCAL_ON for direct dispatches").
>
> Implement SCX_OPS_ALLOW_QUEUED_WAKEUP, which allows BPF schedulers to
> opt into the ttwu_queue optimization.
>
> v2: Update the patch description and comment re. ops.select_cpu() being
> skipped in some cases as opposed to always as per Neel.
>
> Signed-off-by: Tejun Heo <tj@...nel.org>
> Reported-by: Neel Natu <neelnatu@...gle.com>
> Reported-by: Barret Rhoden <brho@...gle.com>
> Cc: Peter Zijlstra (Intel) <peterz@...radead.org>
> Acked-by: Andrea Righi <arighi@...dia.com>
> ---
> kernel/sched/core.c | 9 ++-------
> kernel/sched/ext.c | 32 ++++++++++++++++++++++++++------
> kernel/sched/ext.h | 10 ++++++++++
> 3 files changed, 38 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index e77897a62442..618bb0a5eb1c 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3921,13 +3921,8 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
>
> static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
> {
> - /*
> - * The BPF scheduler may depend on select_task_rq() being invoked during
> - * wakeups. In addition, @p may end up executing on a different CPU
> - * regardless of what happens in the wakeup path making the ttwu_queue
> - * optimization less meaningful. Skip if on SCX.
> - */
> - if (task_on_scx(p))
> + /* See SCX_OPS_ALLOW_QUEUED_WAKEUP. */
> + if (!scx_allow_ttwu_queue(p))
> return false;
>
> /*
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 98d5f2f68f38..2e1a1e4fc304 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -96,7 +96,7 @@ enum scx_ops_flags {
> /*
> * Keep built-in idle tracking even if ops.update_idle() is implemented.
> */
> - SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
> + SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
>
> /*
> * By default, if there are no other task to run on the CPU, ext core
> @@ -104,7 +104,7 @@ enum scx_ops_flags {
> * flag is specified, such tasks are passed to ops.enqueue() with
> * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
> */
> - SCX_OPS_ENQ_LAST = 1LLU << 1,
> + SCX_OPS_ENQ_LAST = 1LLU << 1,
>
> /*
> * An exiting task may schedule after PF_EXITING is set. In such cases,
> @@ -117,13 +117,13 @@ enum scx_ops_flags {
> * depend on pid lookups and wants to handle these tasks directly, the
> * following flag can be used.
> */
> - SCX_OPS_ENQ_EXITING = 1LLU << 2,
> + SCX_OPS_ENQ_EXITING = 1LLU << 2,
>
> /*
> * If set, only tasks with policy set to SCHED_EXT are attached to
> * sched_ext. If clear, SCHED_NORMAL tasks are also included.
> */
> - SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
> + SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
>
> /*
> * A migration disabled task can only execute on its current CPU. By
> @@ -136,7 +136,23 @@ enum scx_ops_flags {
> * current CPU while p->nr_cpus_allowed keeps tracking p->user_cpus_ptr
> * and thus may disagree with cpumask_weight(p->cpus_ptr).
> */
> - SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
> + SCX_OPS_ENQ_MIGRATION_DISABLED = 1LLU << 4,
> +
> + /*
> + * Queued wakeup (ttwu_queue) is a wakeup optimization that invokes
> + * ops.enqueue() on the ops.select_cpu() selected or the wakee's
> + * previous CPU via IPI (inter-processor interrupt) to reduce cacheline
> + * transfers. When this optimization is enabled, ops.select_cpu() is
> + * skipped in some cases (when racing against the wakee switching out).
> + * As the BPF scheduler may depend on ops.select_cpu() being invoked
> + * during wakeups, queued wakeup is disabled by default.
> + *
> + * If this ops flag is set, queued wakeup optimization is enabled and
> + * the BPF scheduler must be able to handle ops.enqueue() invoked on the
> + * wakee's CPU without preceding ops.select_cpu() even for tasks which
> + * may be executed on multiple CPUs.
> + */
> + SCX_OPS_ALLOW_QUEUED_WAKEUP = 1LLU << 5,
>
> /*
> * CPU cgroup support flags
> @@ -147,6 +163,7 @@ enum scx_ops_flags {
> SCX_OPS_ENQ_LAST |
> SCX_OPS_ENQ_EXITING |
> SCX_OPS_ENQ_MIGRATION_DISABLED |
> + SCX_OPS_ALLOW_QUEUED_WAKEUP |
> SCX_OPS_SWITCH_PARTIAL |
> SCX_OPS_HAS_CGROUP_WEIGHT,
> };
> @@ -897,6 +914,7 @@ DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
> static struct sched_ext_ops scx_ops;
> static bool scx_warned_zero_slice;
>
> +DEFINE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
> static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
> static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
> static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_migration_disabled);
> @@ -4717,6 +4735,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
> static_branch_disable(&__scx_ops_enabled);
> for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
> static_branch_disable(&scx_has_op[i]);
> + static_branch_disable(&scx_ops_allow_queued_wakeup);
> static_branch_disable(&scx_ops_enq_last);
> static_branch_disable(&scx_ops_enq_exiting);
> static_branch_disable(&scx_ops_enq_migration_disabled);
> @@ -5348,9 +5367,10 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
> if (((void (**)(void))ops)[i])
> static_branch_enable(&scx_has_op[i]);
>
> + if (ops->flags & SCX_OPS_ALLOW_QUEUED_WAKEUP)
> + static_branch_enable(&scx_ops_allow_queued_wakeup);
> if (ops->flags & SCX_OPS_ENQ_LAST)
> static_branch_enable(&scx_ops_enq_last);
> -
> if (ops->flags & SCX_OPS_ENQ_EXITING)
> static_branch_enable(&scx_ops_enq_exiting);
> if (ops->flags & SCX_OPS_ENQ_MIGRATION_DISABLED)
> diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h
> index 1079b56b0f7a..1bda96b19a1b 100644
> --- a/kernel/sched/ext.h
> +++ b/kernel/sched/ext.h
> @@ -8,6 +8,8 @@
> */
> #ifdef CONFIG_SCHED_CLASS_EXT
>
> +DECLARE_STATIC_KEY_FALSE(scx_ops_allow_queued_wakeup);
> +
> void scx_tick(struct rq *rq);
> void init_scx_entity(struct sched_ext_entity *scx);
> void scx_pre_fork(struct task_struct *p);
> @@ -34,6 +36,13 @@ static inline bool task_on_scx(const struct task_struct *p)
> return scx_enabled() && p->sched_class == &ext_sched_class;
> }
>
> +static inline bool scx_allow_ttwu_queue(const struct task_struct *p)
> +{
> + return !scx_enabled() ||
> + static_branch_likely(&scx_ops_allow_queued_wakeup) ||
> + p->sched_class != &ext_sched_class;
> +}
> +
> #ifdef CONFIG_SCHED_CORE
> bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
> bool in_fi);
> @@ -52,6 +61,7 @@ static inline void scx_rq_activate(struct rq *rq) {}
> static inline void scx_rq_deactivate(struct rq *rq) {}
> static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; }
> static inline bool task_on_scx(const struct task_struct *p) { return false; }
> +static inline bool scx_allow_ttwu_queue(const struct task_struct *p) { return true; }
> static inline void init_sched_ext_class(void) {}
>
> #endif /* CONFIG_SCHED_CLASS_EXT */
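
One more note for scheduler authors: with the flag set, ops.enqueue()
must tolerate wakeups where ops.select_cpu() was skipped. A rough
sketch of how that can be handled (not part of this patch; the
maybe_enqueue callback and its fallback policy are made up, assuming
the scx_bpf_pick_idle_cpu() and scx_bpf_dsq_insert() kfuncs):

  void BPF_STRUCT_OPS(maybe_enqueue, struct task_struct *p, u64 enq_flags)
  {
          /*
           * With SCX_OPS_ALLOW_QUEUED_WAKEUP, this may run on the
           * wakee's previous CPU without a preceding ops.select_cpu().
           * SCX_ENQ_CPU_SELECTED (9b671793c7d9) tells the cases apart.
           */
          if (!(enq_flags & SCX_ENQ_CPU_SELECTED)) {
                  s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);

                  if (cpu >= 0) {
                          /*
                           * Inserting into a foreign local DSQ here is
                           * allowed since 5b26f7b920f7.
                           */
                          scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu,
                                             SCX_SLICE_DFL, enq_flags);
                          return;
                  }
          }

          /* Fall back to the current CPU's local DSQ. */
          scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
  }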