[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Z0oYTbacS0lnO-jS@yury-ThinkPad>
Date: Fri, 29 Nov 2024 11:38:53 -0800
From: Yury Norov <yury.norov@...il.com>
To: Andrea Righi <arighi@...dia.com>
Cc: Tejun Heo <tj@...nel.org>, David Vernet <void@...ifault.com>,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH 2/2] sched_ext: Introduce per-NUMA idle cpumasks
On Fri, Nov 29, 2024 at 06:54:32PM +0100, Andrea Righi wrote:
> Using a single global idle mask can lead to inefficiencies and a lot of
> stress on the cache coherency protocol on large systems with multiple
> NUMA nodes, since all the CPUs can create a really intense read/write
> activity on the single global cpumask.
>
> Therefore, split the global cpumask into multiple per-NUMA node cpumasks
> to improve scalability and performance on large systems.
>
> The concept is that each cpumask will track only the idle CPUs within
> its corresponding NUMA node, treating CPUs in other NUMA nodes as busy.
> In this way concurrent access to the idle cpumask will be restricted
> within each NUMA node.
>
> [Open issue]
>
> The scx_bpf_get_idle_cpumask() and scx_bpf_get_idle_smtmask() kfuncs,
> which are supposed to return a single cpumask covering all the CPUs, have
> been changed to report only the cpumask of the current NUMA node (the
> node of the current CPU). This breaks the old behavior, so it can
> potentially introduce regressions in some scx schedulers.
>
> An alternative approach could be to construct a global cpumask
> on-the-fly, but this could add significant overhead to ops.select_cpu()
> for schedulers relying on these kfunc's. Additionally, it would be less
> reliable than accessing the actual cpumasks, as the copy could quickly
> become out of sync and not represent the actual idle state very well.
>
> Probably a better way to solve this issue is to introduce new kfuncs
> that explicitly select a specific per-NUMA cpumask, and to modify the
> scx schedulers to transition to this new API, for example:
>
> const struct cpumask *scx_bpf_get_idle_numa_cpumask(int node)
> const struct cpumask *scx_bpf_get_idle_numa_smtmask(int node)
>
> Signed-off-by: Andrea Righi <arighi@...dia.com>
> ---
> kernel/sched/ext.c | 115 +++++++++++++++++++++++++++++++--------------
> 1 file changed, 79 insertions(+), 36 deletions(-)
>
> diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
> index 508845f0c25a..c10131171dfb 100644
> --- a/kernel/sched/ext.c
> +++ b/kernel/sched/ext.c
> @@ -933,7 +933,37 @@ static struct delayed_work scx_watchdog_work;
> static struct {
> cpumask_var_t cpu;
> cpumask_var_t smt;
> -} idle_masks CL_ALIGNED_IF_ONSTACK;
> +} **idle_masks CL_ALIGNED_IF_ONSTACK;
> +
> +static struct cpumask *get_idle_cpumask(int cpu)
> +{
> + int node = cpu_to_node(cpu);
> +
> + return idle_masks[node]->cpu;
> +}
> +
> +static struct cpumask *get_idle_smtmask(int cpu)
> +{
> + int node = cpu_to_node(cpu);
> +
> + return idle_masks[node]->smt;
> +}
> +
> +static void idle_masks_init(void)
> +{
> + int node;
> +
> + idle_masks = kcalloc(num_possible_nodes(), sizeof(*idle_masks), GFP_KERNEL);
> + BUG_ON(!idle_masks);
> +
> + for_each_node_state(node, N_POSSIBLE) {
> + idle_masks[node] = kzalloc_node(sizeof(**idle_masks), GFP_KERNEL, node);
> + BUG_ON(!idle_masks[node]);
> +
> + BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->cpu, GFP_KERNEL, node));
> + BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->smt, GFP_KERNEL, node));
> + }
> +}
>
> #endif /* CONFIG_SMP */
>
> @@ -3156,39 +3186,48 @@ static bool test_and_clear_cpu_idle(int cpu)
> */
> if (sched_smt_active()) {
> const struct cpumask *smt = cpu_smt_mask(cpu);
> + struct cpumask *idle_smt = get_idle_smtmask(cpu);
>
> /*
> * If offline, @cpu is not its own sibling and
> * scx_pick_idle_cpu() can get caught in an infinite loop as
> - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
> - * is eventually cleared.
> + * @cpu is never cleared from the idle SMT mask. Ensure that
> + * @cpu is eventually cleared.
> */
> - if (cpumask_intersects(smt, idle_masks.smt))
> - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
> - else if (cpumask_test_cpu(cpu, idle_masks.smt))
> - __cpumask_clear_cpu(cpu, idle_masks.smt);
> + cpumask_andnot(idle_smt, idle_smt, smt);
> + __cpumask_clear_cpu(cpu, idle_smt);
> }
> #endif
> - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
> + return cpumask_test_and_clear_cpu(cpu, get_idle_cpumask(cpu));
> }
>
> static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
> {
> - int cpu;
> + int start = cpu_to_node(smp_processor_id());
> + int node, cpu;
>
> retry:
> if (sched_smt_active()) {
> - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
> - if (cpu < nr_cpu_ids)
> - goto found;
> + for_each_node_state_wrap(node, N_ONLINE, start) {
> + if (!cpumask_intersects(idle_masks[node]->smt, cpus_allowed))
> + continue;
> + cpu = cpumask_any_and_distribute(idle_masks[node]->smt, cpus_allowed);
> + if (cpu < nr_cpu_ids)
> + goto found;
> + }
The same consideration applies here as for v1: if idle_masks[node]->smt
and cpus_allowed are disjoint, cpumask_any_and_distribute() returns a
value >= nr_cpu_ids and we simply move on to the next iteration, so there
is no need to call cpumask_intersects() first.
>
> if (flags & SCX_PICK_IDLE_CORE)
> return -EBUSY;
> }
>
> - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
> - if (cpu >= nr_cpu_ids)
> - return -EBUSY;
> + for_each_node_state_wrap(node, N_ONLINE, start) {
> + if (!cpumask_intersects(idle_masks[node]->cpu, cpus_allowed))
> + continue;
> + cpu = cpumask_any_and_distribute(idle_masks[node]->cpu, cpus_allowed);
> + if (cpu < nr_cpu_ids)
> + goto found;
> + }
> + return -EBUSY;
>
> found:
> if (test_and_clear_cpu_idle(cpu))
> @@ -3459,9 +3498,9 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
> * piled up on it even if there is an idle core elsewhere on
> * the system.
> */
> - if (!cpumask_empty(idle_masks.cpu) &&
> - !(current->flags & PF_EXITING) &&
> - cpu_rq(cpu)->scx.local_dsq.nr == 0) {
> + if (!(current->flags & PF_EXITING) &&
> + cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
> + !cpumask_empty(get_idle_cpumask(cpu))) {
> if (cpumask_test_cpu(cpu, p->cpus_ptr))
> goto cpu_found;
> }
> @@ -3475,7 +3514,7 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
> /*
> * Keep using @prev_cpu if it's part of a fully idle core.
> */
> - if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
> + if (cpumask_test_cpu(prev_cpu, get_idle_smtmask(prev_cpu)) &&
> test_and_clear_cpu_idle(prev_cpu)) {
> cpu = prev_cpu;
> goto cpu_found;
> @@ -3618,12 +3657,18 @@ static void set_cpus_allowed_scx(struct task_struct *p,
>
> static void reset_idle_masks(void)
> {
> + int node;
> +
> /*
> * Consider all online cpus idle. Should converge to the actual state
> * quickly.
> */
> - cpumask_copy(idle_masks.cpu, cpu_online_mask);
> - cpumask_copy(idle_masks.smt, cpu_online_mask);
> + for_each_node_state(node, N_POSSIBLE) {
> + const struct cpumask *node_mask = cpumask_of_node(node);
> +
> + cpumask_and(idle_masks[node]->cpu, cpu_online_mask, node_mask);
> + cpumask_copy(idle_masks[node]->smt, idle_masks[node]->cpu);
> + }
> }
>
> void __scx_update_idle(struct rq *rq, bool idle)
> @@ -3636,14 +3681,13 @@ void __scx_update_idle(struct rq *rq, bool idle)
> return;
> }
>
> - if (idle)
> - cpumask_set_cpu(cpu, idle_masks.cpu);
> - else
> - cpumask_clear_cpu(cpu, idle_masks.cpu);
> + assign_cpu(cpu, get_idle_cpumask(cpu), idle);
>
> #ifdef CONFIG_SCHED_SMT
> if (sched_smt_active()) {
> const struct cpumask *smt = cpu_smt_mask(cpu);
> + struct cpumask *idle_cpu = get_idle_cpumask(cpu);
> + struct cpumask *idle_smt = get_idle_smtmask(cpu);
>
> if (idle) {
> /*
> @@ -3651,12 +3695,12 @@ void __scx_update_idle(struct rq *rq, bool idle)
> * it's only for optimization and self-correcting.
> */
> for_each_cpu(cpu, smt) {
> - if (!cpumask_test_cpu(cpu, idle_masks.cpu))
> + if (!cpumask_test_cpu(cpu, idle_cpu))
> return;
> }
> - cpumask_or(idle_masks.smt, idle_masks.smt, smt);
> + cpumask_or(idle_smt, idle_smt, smt);
> } else {
> - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
> + cpumask_andnot(idle_smt, idle_smt, smt);
> }
> }
> #endif
> @@ -6232,8 +6276,7 @@ void __init init_sched_ext_class(void)
>
> BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
> #ifdef CONFIG_SMP
> - BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
> - BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
> + idle_masks_init();
> #endif
> scx_kick_cpus_pnt_seqs =
> __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids,
> @@ -7379,7 +7422,7 @@ __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
>
> /**
> * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
> - * per-CPU cpumask.
> + * per-CPU cpumask of the current NUMA node.
> *
> * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
> */
> @@ -7391,7 +7434,7 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
> }
>
> #ifdef CONFIG_SMP
> - return idle_masks.cpu;
> + return get_idle_cpumask(smp_processor_id());
> #else
> return cpu_none_mask;
> #endif
> @@ -7399,8 +7442,8 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
>
> /**
> * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
> - * per-physical-core cpumask. Can be used to determine if an entire physical
> - * core is free.
> + * per-physical-core cpumask of the current NUMA node. Can be used to determine
> + * if an entire physical core is free.
> *
> * Returns NULL if idle tracking is not enabled, or running on a UP kernel.
> */
> @@ -7413,9 +7456,9 @@ __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
>
> #ifdef CONFIG_SMP
> if (sched_smt_active())
> - return idle_masks.smt;
> + return get_idle_smtmask(smp_processor_id());
> else
> - return idle_masks.cpu;
> + return get_idle_cpumask(smp_processor_id());
> #else
> return cpu_none_mask;
> #endif
> --
> 2.47.1
Powered by blists - more mailing lists