Message-ID: <Z1nYPWeJvmizCvJJ@yury-ThinkPad>
Date: Wed, 11 Dec 2024 10:21:49 -0800
From: Yury Norov <yury.norov@...il.com>
To: Andrea Righi <arighi@...dia.com>
Cc: Tejun Heo <tj@...nel.org>, David Vernet <void@...ifault.com>,
Changwoo Min <changwoo@...lia.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 3/4] sched_ext: Introduce SCX_OPS_NODE_BUILTIN_IDLE
On Mon, Dec 09, 2024 at 11:40:57AM +0100, Andrea Righi wrote:
> Add a flag (SCX_OPS_NODE_BUILTIN_IDLE) to toggle between a global flat
> idle cpumask and multiple per-node CPU masks.
>
> This allows each sched_ext scheduler to choose between the new or old
> model, preserving backward compatibility and preventing disruptions to
> existing behavior.
>
> Signed-off-by: Andrea Righi <arighi@...dia.com>
> ---
> kernel/sched/ext.c | 56 +++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 50 insertions(+), 6 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index ed2f0d13915c..d0d57323bcfc 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -122,6 +122,12 @@ enum scx_ops_flags {
> */
> SCX_OPS_SWITCH_PARTIAL = 1LLU << 3,
>
> + /*
> + * If set, enable per-node idle cpumasks. If clear, use a single global
> + * flat idle cpumask.
> + */
> + SCX_OPS_BUILTIN_IDLE_PER_NODE = 1LLU << 4,
> +
> /*
> * CPU cgroup support flags
> */
> @@ -131,6 +137,7 @@ enum scx_ops_flags {
> SCX_OPS_ENQ_LAST |
> SCX_OPS_ENQ_EXITING |
> SCX_OPS_SWITCH_PARTIAL |
> + SCX_OPS_BUILTIN_IDLE_PER_NODE |
> SCX_OPS_HAS_CGROUP_WEIGHT,
> };
>
> @@ -886,6 +893,7 @@ static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
>
> #ifdef CONFIG_SMP
> static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);
> +static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
> #endif
>
> static struct static_key_false scx_has_op[SCX_OPI_END] =
> @@ -929,18 +937,32 @@ static struct delayed_work scx_watchdog_work;
> #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
> #endif
>
> -static struct {
> +struct idle_cpumask {
> cpumask_var_t cpu;
> cpumask_var_t smt;
> -} **idle_masks CL_ALIGNED_IF_ONSTACK;
> +};
> +
> +/*
> + * cpumasks to track idle CPUs within each NUMA node.
> + *
> + * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not specified, a single flat cpumask
> + * from node 0 is used to track all idle CPUs system-wide.
> + */
> +static struct idle_cpumask **idle_masks CL_ALIGNED_IF_ONSTACK;
>
> static struct cpumask *get_idle_cpumask_node(int node)
> {
> + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> + return idle_masks[0]->cpu;
> +
> return idle_masks[node]->cpu;
> }
>
> static struct cpumask *get_idle_smtmask_node(int node)
> {
> + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> + return idle_masks[0]->smt;
> +
> return idle_masks[node]->smt;
> }
>
> @@ -3423,7 +3445,7 @@ static bool llc_numa_mismatch(void)
> * CPU belongs to a single LLC domain, and that each LLC domain is entirely
> * contained within a single NUMA node.
> */
> -static void update_selcpu_topology(void)
> +static void update_selcpu_topology(struct sched_ext_ops *ops)
> {
> bool enable_llc = false;
> unsigned int nr_cpus;
> @@ -3442,8 +3464,16 @@ static void update_selcpu_topology(void)
> rcu_read_lock();
> nr_cpus = llc_weight(cpu);
> if (nr_cpus > 0) {
> - if ((nr_cpus < num_online_cpus()) && llc_numa_mismatch())
> + if (nr_cpus < num_online_cpus())
> enable_llc = true;
> + /*
> + * No need to enable LLC optimization if the LLC domains are
> + * perfectly overlapping with the NUMA domains when per-node
> + * cpumasks are enabled.
> + */
> + if ((ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE) &&
> + !llc_numa_mismatch())
> + enable_llc = false;
> pr_debug("sched_ext: LLC=%*pb weight=%u\n",
> cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
> }
> @@ -3456,6 +3486,14 @@ static void update_selcpu_topology(void)
> static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
> else
> static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
> +
> + /*
> + * Check if we need to enable per-node cpumasks.
> + */
> + if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
> + static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
> + else
> + static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
> }
The patch that introduces the flag should come first in the series, but
should leave scx_builtin_idle_per_node unconditionally disabled.
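Something like the declaration you already add, kept off by default and
never flipped until the very end (just a sketch):

#ifdef CONFIG_SMP
/* Off by default; nothing enables it until the final patch. */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);
#endif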
The following patches should add all the machinery you need. The machinery
should be conditional on scx_builtin_idle_per_node, i.e. stay disabled for
the time being.
Doing that, you'll be able to introduce your functionality as a whole:
static struct cpumask *get_idle_cpumask_node(int node)
{
	if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
		return idle_masks[0]->cpu;

	return idle_masks[node]->cpu;
}
Much better than patching code you've just introduced, right?
The very last patch should then contain only the chunk that enables
scx_builtin_idle_per_node based on SCX_OPS_BUILTIN_IDLE_PER_NODE.
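In other words, roughly the hunk you already have in update_selcpu_topology(),
but standing on its own at the end of the series:

	/* Flip the static key only once everything else is in place. */
	if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
		static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
	else
		static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);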
This way, when your feature gets merged, from a git-bisect perspective it
will be enabled atomically by the very last patch, while those interested
in the internals will still have a nice, coherent history.
Thanks,
Yury
>
> /*
> @@ -3683,6 +3721,12 @@ static void reset_idle_masks(void)
> {
> int node;
>
> + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
> + cpumask_copy(get_idle_cpumask_node(0), cpu_online_mask);
> + cpumask_copy(get_idle_smtmask_node(0), cpu_online_mask);
> + return;
> + }
> +
> /*
> * Consider all online cpus idle. Should converge to the actual state
> * quickly.
> @@ -3740,7 +3784,7 @@ static void handle_hotplug(struct rq *rq, bool online)
> atomic_long_inc(&scx_hotplug_seq);
>
> if (scx_enabled())
> - update_selcpu_topology();
> + update_selcpu_topology(&scx_ops);
>
> if (online && SCX_HAS_OP(cpu_online))
> SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu);
> @@ -5618,7 +5662,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
>
> check_hotplug_seq(ops);
> #ifdef CONFIG_SMP
> - update_selcpu_topology();
> + update_selcpu_topology(ops);
> #endif
> cpus_read_unlock();
>
> --
> 2.47.1