Message-ID: <Z1eH8_rP16IyJ8LI@gpd3>
Date: Tue, 10 Dec 2024 01:14:43 +0100
From: Andrea Righi <arighi@...dia.com>
To: Yury Norov <yury.norov@...il.com>
Cc: Tejun Heo <tj@...nel.org>, David Vernet <void@...ifault.com>,
Changwoo Min <changwoo@...lia.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/4] sched_ext: Introduce per-NUMA idle cpumasks
On Mon, Dec 09, 2024 at 11:32:56AM -0800, Yury Norov wrote:
...
> > +static struct cpumask *get_idle_cpumask_node(int node)
> > +{
> > + return idle_masks[node]->cpu;
> > +}
> > +
> > +static struct cpumask *get_idle_smtmask_node(int node)
> > +{
> > + return idle_masks[node]->smt;
> > +}
>
> This implies that NUMA_NO_NODE, which is -1, will never be passed.
> Is that indeed impossible?
In PATCH 4/4 I add some checks to make sure "node" is valid and return
NULL otherwise, but thinking more about it, it seems better to return
cpu_none_mask.
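Something along these lines (untested sketch, and the exact validity
check is of course open to discussion):

	static struct cpumask *get_idle_cpumask_node(int node)
	{
		/* Fall back to an empty cpumask if @node is not valid */
		if (node < 0 || node >= nr_node_ids)
			return cpu_none_mask;

		return idle_masks[node]->cpu;
	}

(and the same for get_idle_smtmask_node()).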
>
> > +
> > +static struct cpumask *get_curr_idle_cpumask(void)
> > +{
> > + int node = cpu_to_node(smp_processor_id());
> > +
> > + return get_idle_cpumask_node(node);
> > +}
> > +
> > +static struct cpumask *get_curr_idle_smtmask(void)
> > +{
> > + int node = cpu_to_node(smp_processor_id());
> > +
> > + if (sched_smt_active())
> > + return get_idle_smtmask_node(node);
> > + else
> > + return get_idle_cpumask_node(node);
> > +}
> > +
> > +static void idle_masks_init(void)
> > +{
> > + int node;
> > +
> > + idle_masks = kcalloc(num_possible_nodes(), sizeof(*idle_masks), GFP_KERNEL);
> > + BUG_ON(!idle_masks);
> > +
> > + for_each_node_state(node, N_POSSIBLE) {
>
> for_each_node(node)
Ok.
>
> > + idle_masks[node] = kzalloc_node(sizeof(**idle_masks), GFP_KERNEL, node);
> > + BUG_ON(!idle_masks[node]);
> > +
> > + BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->cpu, GFP_KERNEL, node));
> > + BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->smt, GFP_KERNEL, node));
> > + }
> > +}
> > +#else /* !CONFIG_SMP */
> > +static struct cpumask *get_curr_idle_cpumask(void)
> > +{
> > + return cpu_none_mask;
> > +}
> > +
> > +static struct cpumask *get_curr_idle_smtmask(void)
> > +{
> > + return cpu_none_mask;
> > +}
> > #endif /* CONFIG_SMP */
> >
> > /* for %SCX_KICK_WAIT */
> > @@ -3166,6 +3218,9 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
> >
> > static bool test_and_clear_cpu_idle(int cpu)
> > {
> > + int node = cpu_to_node(cpu);
> > + struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> > +
> > #ifdef CONFIG_SCHED_SMT
> > /*
> > * SMT mask should be cleared whether we can claim @cpu or not. The SMT
> > @@ -3174,29 +3229,34 @@ static bool test_and_clear_cpu_idle(int cpu)
> > */
> > if (sched_smt_active()) {
> > const struct cpumask *smt = cpu_smt_mask(cpu);
> > + struct cpumask *idle_smt = get_idle_smtmask_node(node);
> >
> > /*
> > * If offline, @cpu is not its own sibling and
> > * scx_pick_idle_cpu() can get caught in an infinite loop as
> > - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
> > - * is eventually cleared.
> > + * @cpu is never cleared from the idle SMT mask. Ensure that
> > + * @cpu is eventually cleared.
> > + *
> > + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
> > + * reduce memory writes, which may help alleviate cache
> > + * coherence pressure.
> > */
> > - if (cpumask_intersects(smt, idle_masks.smt))
> > - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
> > - else if (cpumask_test_cpu(cpu, idle_masks.smt))
> > - __cpumask_clear_cpu(cpu, idle_masks.smt);
> > + if (cpumask_intersects(smt, idle_smt))
> > + cpumask_andnot(idle_smt, idle_smt, smt);
> > + else if (cpumask_test_cpu(cpu, idle_smt))
> > + __cpumask_clear_cpu(cpu, idle_smt);
> > }
> > #endif
> > - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
> > + return cpumask_test_and_clear_cpu(cpu, idle_cpu);
> > }
> >
> > -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
> > +static s32 scx_pick_idle_cpu_from_node(int node, const struct cpumask *cpus_allowed, u64 flags)
> > {
> > int cpu;
> >
> > retry:
> > if (sched_smt_active()) {
> > - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
> > + cpu = cpumask_any_and_distribute(get_idle_smtmask_node(node), cpus_allowed);
> > if (cpu < nr_cpu_ids)
> > goto found;
> >
> > @@ -3204,15 +3264,66 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
> > return -EBUSY;
> > }
> >
> > - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
> > - if (cpu >= nr_cpu_ids)
> > - return -EBUSY;
> > + cpu = cpumask_any_and_distribute(get_idle_cpumask_node(node), cpus_allowed);
> > + if (cpu < nr_cpu_ids)
> > + goto found;
> > +
> > + return -EBUSY;
>
> What for did you change the error handling logic? Can you keep the
> original?
I changed that to follow the same pattern as the one inside
sched_smt_active(); it should be equivalent, but I can keep the
original.
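IOW, going back to the original pattern, just with the per-node
accessor (untested):

	cpu = cpumask_any_and_distribute(get_idle_cpumask_node(node), cpus_allowed);
	if (cpu >= nr_cpu_ids)
		return -EBUSY;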
>
> >
> > found:
> > if (test_and_clear_cpu_idle(cpu))
> > return cpu;
> > else
> > goto retry;
> > +
> > +}
> > +
> > +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> > +{
> > + const struct cpumask *node_mask;
> > + s32 cpu;
> > +
> > + /*
> > + * Only node 0 is used if per-node idle cpumasks are disabled.
> > + */
> > + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> > + return scx_pick_idle_cpu_from_node(0, cpus_allowed, flags);
> > +
> > + /*
> > + * Traverse all nodes in order of increasing distance, starting from
> > + * prev_cpu's node.
> > + */
> > + rcu_read_lock();
> > + for_each_numa_hop_mask(node_mask, cpu_to_node(prev_cpu)) {
>
> This 'node_mask' misleads. This is not a nodemask. This is a cpumask -
> all cpus in the hop. Can you name it 'hop_cpus', or similar?
Ack.
>
> > + /*
> > + * scx_pick_idle_cpu_from_node() can be expensive and redundant
> > + * if none of the CPUs in the NUMA node can be used (according
> > + * to cpus_allowed).
> > + *
> > + * Therefore, check if the NUMA node is usable in advance to
> > + * save some CPU cycles.
> > + */
> > + if (!cpumask_intersects(node_mask, cpus_allowed))
> > + continue;
> > +
> > + /*
> > + * It would be nice to have a "node" iterator, instead of the
> > + * cpumask, to get rid of the cpumask_first() to determine the
> > + * node.
> > + */
>
> I'm not aware about such case. And this one doesn't look like that because you
> filter against a cpumask (cpus_allowed). Right?
I'm interested in getting the node id here, so that I can use it with
scx_pick_idle_cpu_from_node().
>
> > + cpu = cpumask_first(node_mask);
> > + if (cpu >= nr_cpu_ids)
> > + continue;
>
> for_each_numa_hop_mask() doesn't traverse per-node CPUs. It traverses
> per-hop CPUs. The difference is that the hop may include more than one
> node if distances from a given node to those belonging to the hop are
> the same.
Hm... I was missing this aspect. Thanks for clarifying this.
>
> So, imagine we have nodes 1 and 2 in the same hop, but cpus_allowed
> has only cpus from node 2. With this configuration you pass the
> cpumask_intersects() check, but when picking a CPU, you ignore the
> cpus_allowed and pick node 1. This is wrong, I guess.
Yeah, that's not what I would like to do.
>
> Instead, I would make a single check:
>
> cpu = cpumask_first_and(node_mask, cpus_allowed);
> if (cpu >= nr_cpu_ids)
> continue;
Ok, but at this point I'm not sure that for_each_numa_hop_mask() is the
most efficient way to iterate NUMA nodes.
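For example, with a (purely hypothetical, it doesn't exist today) node
iterator that visits NUMA nodes in order of increasing distance from a
starting node, the whole hop logic could collapse into something like
this (untested):

	/*
	 * Hypothetical helper: visit NUMA nodes in order of increasing
	 * distance from cpu_to_node(prev_cpu).
	 */
	for_each_node_by_distance(node, cpu_to_node(prev_cpu)) {
		/* Skip nodes with no usable CPU */
		if (!cpumask_intersects(cpumask_of_node(node), cpus_allowed))
			continue;

		cpu = scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
		if (cpu >= 0)
			return cpu;
	}

	return -EBUSY;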
>
> > +
> > + cpu = scx_pick_idle_cpu_from_node(cpu_to_node(cpu), cpus_allowed, flags);
> > + if (cpu >= 0)
> > + goto out_unlock;
>
> The code in scx_pick_idle_cpu_from_node() is written such that the
> CPUs you pick are well distributed across the nodes. Your code above
> always picks the 1st node in the hop. I think here you should use
>
> cpumask_any_and_distribute()
>
> like the scx_pick_idle_cpu_from_node() does with idle_masks.
Right, that's because I was (incorrectly) assuming that each hop was a
single node.
>
> Another problem is that high-level hops include CPUs from lower-level
> ones. It means that you will traverse the same lower-level CPUs again
> for nothing. So, you need to mask them out.
>
> Another-another problem: if your hop has more than one node, you should
> not jump to the next hop until you tried every node from the current hop.
> This brings another loop, but doesn't increase complexity if you
> carefully mask-out all visited nodes.
>
> > + }
> > + cpu = -EBUSY;
>
> This hides the actual error returned from scx_pick_idle_cpu_from_node().
Right, scx_pick_idle_cpu_from_node() can only return -EBUSY at the
moment, but it'd be nicer to propagate the returned error.
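For example something along these lines (untested, and ignoring for a
moment the per-hop issues discussed above):

	s32 cpu = -EBUSY;

	rcu_read_lock();
	for_each_numa_hop_mask(hop_cpus, cpu_to_node(prev_cpu)) {
		s32 target = cpumask_first_and(hop_cpus, cpus_allowed);

		if (target >= nr_cpu_ids)
			continue;

		/*
		 * Preserve the last error returned by
		 * scx_pick_idle_cpu_from_node(), instead of
		 * unconditionally reporting -EBUSY after the loop.
		 */
		cpu = scx_pick_idle_cpu_from_node(cpu_to_node(target),
						  cpus_allowed, flags);
		if (cpu >= 0)
			break;
	}
	rcu_read_unlock();

	return cpu;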
>
> > +
> > +out_unlock:
> > + rcu_read_unlock();
> > + return cpu;
> > }
>
> And altogether this should look like:
>
> int scx_pick_idle_cpu_from_hop(struct cpumask *hop_cpus, struct cpumask *cpus_allowed)
> {
> int node, cpu, random_cpu;
>
> do {
>
> /* Pick a 'random' CPU in the hop */
> random_cpu = cpumask_any_and_distribute(hop_cpus, cpus_allowed);
> if (random_cpu >= nr_cpu_ids)
> continue;
>
> node = cpu_to_node(random_cpu);
>
> /* Find an idle CPU in the same node */
> cpu = scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
> if (cpu >= 0)
> break;
>
> /* No luck? Try other nodes */
> } while (cpumask_andnot(hop_cpus, hop_cpus, cpumask_of_node(node)));
>
> return cpu;
> }
>
> static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> {
> const struct cpumask *next, *prev = cpu_none_mask;
> int prev_node = cpu_to_node(prev_cpu);
> ...
> for_each_numa_hop_mask(next, prev_node) {
> cpumask_andnot(hop_cpus, next, prev);
> cpu = scx_pick_idle_cpu_from_hop(hop_cpus, cpus_allowed);
> prev = next;
> }
> ...
> }
>
> Not tested, but should work.
Makes sense to me; I'll do some testing with this.
> > @@ -3636,17 +3748,27 @@ static void set_cpus_allowed_scx(struct task_struct *p,
> >
> > static void reset_idle_masks(void)
> > {
> > + int node;
> > +
> > /*
> > * Consider all online cpus idle. Should converge to the actual state
> > * quickly.
> > */
> > - cpumask_copy(idle_masks.cpu, cpu_online_mask);
> > - cpumask_copy(idle_masks.smt, cpu_online_mask);
> > + for_each_node_state(node, N_POSSIBLE) {
>
> for_each_node()
>
Ok.
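For reference, with the per-node masks reset_idle_masks() would then
look something like this (untested sketch, restricting each per-node
mask to the online CPUs of that node):

	static void reset_idle_masks(void)
	{
		int node;

		/*
		 * Consider all online cpus idle. Should converge to the
		 * actual state quickly.
		 */
		for_each_node(node) {
			const struct cpumask *node_mask = cpumask_of_node(node);

			cpumask_and(get_idle_cpumask_node(node),
				    cpu_online_mask, node_mask);
			cpumask_and(get_idle_smtmask_node(node),
				    cpu_online_mask, node_mask);
		}
	}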
And I don't think I've missed any comments this time; sorry about that,
and thanks again for the review!
-Andrea