Message-ID: <Z1eH8_rP16IyJ8LI@gpd3>
Date: Tue, 10 Dec 2024 01:14:43 +0100
From: Andrea Righi <arighi@...dia.com>
To: Yury Norov <yury.norov@...il.com>
Cc: Tejun Heo <tj@...nel.org>, David Vernet <void@...ifault.com>,
Changwoo Min <changwoo@...lia.com>, linux-kernel@...r.kernel.org
Subject: Re: [PATCH 1/4] sched_ext: Introduce per-NUMA idle cpumasks
On Mon, Dec 09, 2024 at 11:32:56AM -0800, Yury Norov wrote:
...
> > +static struct cpumask *get_idle_cpumask_node(int node)
> > +{
> > + return idle_masks[node]->cpu;
> > +}
> > +
> > +static struct cpumask *get_idle_smtmask_node(int node)
> > +{
> > + return idle_masks[node]->smt;
> > +}
>
> This implies that NUMA_NO_NODE, which is -1, will never be passed.
> Is that indeed impossible?
In PATCH 4/4 I add some checks to make sure "node" is valid and return
NULL otherwise, but thinking more about it, it seems better to return
cpu_none_mask.
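Something along these lines (untested sketch, and the exact validity
check is of course open to discussion):

	static struct cpumask *get_idle_cpumask_node(int node)
	{
		/* Fall back to an empty cpumask if @node is not valid */
		if (node < 0 || node >= nr_node_ids)
			return cpu_none_mask;

		return idle_masks[node]->cpu;
	}

(and the same for get_idle_smtmask_node()).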
>
> > +
> > +static struct cpumask *get_curr_idle_cpumask(void)
> > +{
> > + int node = cpu_to_node(smp_processor_id());
> > +
> > + return get_idle_cpumask_node(node);
> > +}
> > +
> > +static struct cpumask *get_curr_idle_smtmask(void)
> > +{
> > + int node = cpu_to_node(smp_processor_id());
> > +
> > + if (sched_smt_active())
> > + return get_idle_smtmask_node(node);
> > + else
> > + return get_idle_cpumask_node(node);
> > +}
> > +
> > +static void idle_masks_init(void)
> > +{
> > + int node;
> > +
> > + idle_masks = kcalloc(num_possible_nodes(), sizeof(*idle_masks), GFP_KERNEL);
> > + BUG_ON(!idle_masks);
> > +
> > + for_each_node_state(node, N_POSSIBLE) {
>
> for_each_node(node)
Ok.
>
> > + idle_masks[node] = kzalloc_node(sizeof(**idle_masks), GFP_KERNEL, node);
> > + BUG_ON(!idle_masks[node]);
> > +
> > + BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->cpu, GFP_KERNEL, node));
> > + BUG_ON(!alloc_cpumask_var_node(&idle_masks[node]->smt, GFP_KERNEL, node));
> > + }
> > +}
> > +#else /* !CONFIG_SMP */
> > +static struct cpumask *get_curr_idle_cpumask(void)
> > +{
> > + return cpu_none_mask;
> > +}
> > +
> > +static struct cpumask *get_curr_idle_smtmask(void)
> > +{
> > + return cpu_none_mask;
> > +}
> > #endif /* CONFIG_SMP */
> >
> > /* for %SCX_KICK_WAIT */
> > @@ -3166,6 +3218,9 @@ bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
> >
> > static bool test_and_clear_cpu_idle(int cpu)
> > {
> > + int node = cpu_to_node(cpu);
> > + struct cpumask *idle_cpu = get_idle_cpumask_node(node);
> > +
> > #ifdef CONFIG_SCHED_SMT
> > /*
> > * SMT mask should be cleared whether we can claim @cpu or not. The SMT
> > @@ -3174,29 +3229,34 @@ static bool test_and_clear_cpu_idle(int cpu)
> > */
> > if (sched_smt_active()) {
> > const struct cpumask *smt = cpu_smt_mask(cpu);
> > + struct cpumask *idle_smt = get_idle_smtmask_node(node);
> >
> > /*
> > * If offline, @cpu is not its own sibling and
> > * scx_pick_idle_cpu() can get caught in an infinite loop as
> > - * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
> > - * is eventually cleared.
> > + * @cpu is never cleared from the idle SMT mask. Ensure that
> > + * @cpu is eventually cleared.
> > + *
> > + * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
> > + * reduce memory writes, which may help alleviate cache
> > + * coherence pressure.
> > */
> > - if (cpumask_intersects(smt, idle_masks.smt))
> > - cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
> > - else if (cpumask_test_cpu(cpu, idle_masks.smt))
> > - __cpumask_clear_cpu(cpu, idle_masks.smt);
> > + if (cpumask_intersects(smt, idle_smt))
> > + cpumask_andnot(idle_smt, idle_smt, smt);
> > + else if (cpumask_test_cpu(cpu, idle_smt))
> > + __cpumask_clear_cpu(cpu, idle_smt);
> > }
> > #endif
> > - return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
> > + return cpumask_test_and_clear_cpu(cpu, idle_cpu);
> > }
> >
> > -static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
> > +static s32 scx_pick_idle_cpu_from_node(int node, const struct cpumask *cpus_allowed, u64 flags)
> > {
> > int cpu;
> >
> > retry:
> > if (sched_smt_active()) {
> > - cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
> > + cpu = cpumask_any_and_distribute(get_idle_smtmask_node(node), cpus_allowed);
> > if (cpu < nr_cpu_ids)
> > goto found;
> >
> > @@ -3204,15 +3264,66 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
> > return -EBUSY;
> > }
> >
> > - cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
> > - if (cpu >= nr_cpu_ids)
> > - return -EBUSY;
> > + cpu = cpumask_any_and_distribute(get_idle_cpumask_node(node), cpus_allowed);
> > + if (cpu < nr_cpu_ids)
> > + goto found;
> > +
> > + return -EBUSY;
>
> What for did you change the error handling logic? Can you keep the
> original?
I changed that to follow the same pattern as the one inside
sched_smt_active(); it should be equivalent, but I can keep the
original.
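IOW, going back to the original pattern, just with the per-node
accessor (untested):

	cpu = cpumask_any_and_distribute(get_idle_cpumask_node(node), cpus_allowed);
	if (cpu >= nr_cpu_ids)
		return -EBUSY;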
>
> >
> > found:
> > if (test_and_clear_cpu_idle(cpu))
> > return cpu;
> > else
> > goto retry;
> > +
> > +}
> > +
> > +static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> > +{
> > + const struct cpumask *node_mask;
> > + s32 cpu;
> > +
> > + /*
> > + * Only node 0 is used if per-node idle cpumasks are disabled.
> > + */
> > + if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
> > + return scx_pick_idle_cpu_from_node(0, cpus_allowed, flags);
> > +
> > + /*
> > + * Traverse all nodes in order of increasing distance, starting from
> > + * prev_cpu's node.
> > + */
> > + rcu_read_lock();
> > + for_each_numa_hop_mask(node_mask, cpu_to_node(prev_cpu)) {
>
> This 'node_mask' misleads. This is not a nodemask. This is a cpumask -
> all cpus in the hop. Can you name it 'hop_cpus', or similar?
Ack.
>
> > + /*
> > + * scx_pick_idle_cpu_from_node() can be expensive and redundant
> > + * if none of the CPUs in the NUMA node can be used (according
> > + * to cpus_allowed).
> > + *
> > + * Therefore, check if the NUMA node is usable in advance to
> > + * save some CPU cycles.
> > + */
> > + if (!cpumask_intersects(node_mask, cpus_allowed))
> > + continue;
> > +
> > + /*
> > + * It would be nice to have a "node" iterator, instead of the
> > + * cpumask, to get rid of the cpumask_first() to determine the
> > + * node.
> > + */
>
> I'm not aware about such case. And this one doesn't look like that because you
> filter against a cpumask (cpus_allowed). Right?
I'm interested in getting the node id here, so that I can use it with
scx_pick_idle_cpu_from_node().
>
> > + cpu = cpumask_first(node_mask);
> > + if (cpu >= nr_cpu_ids)
> > + continue;
>
> for_each_numa_hop_mask() doesn't traverse per-node CPUs. It traverses
> per-hop CPUs. The difference is that the hop may include more than one
> node if distances from a given node to those belonging to the hop are
> the same.
Hm... I was missing this aspect. Thanks for clarifying this.
>
> So, imagine we have nodes 1 and 2 in the same hop, but cpus_allowed
> has only cpus from node 2. With this configuration you pass the
> cpumask_intersects() check, but when picking a CPU, you ignore the
> cpus_allowed and pick node 1. This is wrong, I guess.
Yeah, that's not what I would like to do.
>
> Instead, I would make a single check:
>
> cpu = cpumask_first_and(node_mask, cpus_allowed);
> if (cpu >= nr_cpu_ids)
> continue;
Ok, but at this point I'm not sure that for_each_numa_hop_mask() is the
most efficient way to iterate NUMA nodes.
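For example, with a (purely hypothetical, it doesn't exist today) node
iterator that visits NUMA nodes in order of increasing distance from a
starting node, the whole hop logic could collapse into something like
this (untested):

	/*
	 * Hypothetical helper: visit NUMA nodes in order of increasing
	 * distance from cpu_to_node(prev_cpu).
	 */
	for_each_node_by_distance(node, cpu_to_node(prev_cpu)) {
		/* Skip nodes with no usable CPU */
		if (!cpumask_intersects(cpumask_of_node(node), cpus_allowed))
			continue;

		cpu = scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
		if (cpu >= 0)
			return cpu;
	}

	return -EBUSY;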
>
> > +
> > + cpu = scx_pick_idle_cpu_from_node(cpu_to_node(cpu), cpus_allowed, flags);
> > + if (cpu >= 0)
> > + goto out_unlock;
>
> The code in scx_pick_idle_cpu_from_node() is written such that the
> CPUs you pick are well distributed across the nodes. Your code above
> always picks the 1st node in the hop. I think here you should use
>
> cpumask_any_and_distribute()
>
> like the scx_pick_idle_cpu_from_node() does with idle_masks.
Right, that's because I was (incorrectly) assuming that each hop was a
single node.
>
> Another problem is that high-level hops include CPUs from lower-level
> ones. It means that you will traverse the same lower-level CPUs again
> for nothing. So, you need to mask them out.
>
> Another-another problem: if your hop has more than one node, you should
> not jump to the next hop until you tried every node from the current hop.
> This brings another loop, but doesn't increase complexity if you
> carefully mask-out all visited nodes.
>
> > + }
> > + cpu = -EBUSY;
>
> This hides the actual error returned from scx_pick_idle_cpu_from_node().
Right, scx_pick_idle_cpu_from_node() can only return -EBUSY at the
moment, but it'd be nicer to propagate the returned error.
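For example something along these lines (untested, and ignoring for a
moment the per-hop issues discussed above):

	s32 cpu = -EBUSY;

	rcu_read_lock();
	for_each_numa_hop_mask(hop_cpus, cpu_to_node(prev_cpu)) {
		s32 target = cpumask_first_and(hop_cpus, cpus_allowed);

		if (target >= nr_cpu_ids)
			continue;

		/*
		 * Preserve the last error returned by
		 * scx_pick_idle_cpu_from_node(), instead of
		 * unconditionally reporting -EBUSY after the loop.
		 */
		cpu = scx_pick_idle_cpu_from_node(cpu_to_node(target),
						  cpus_allowed, flags);
		if (cpu >= 0)
			break;
	}
	rcu_read_unlock();

	return cpu;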
>
> > +
> > +out_unlock:
> > + rcu_read_unlock();
> > + return cpu;
> > }
>
> And altogether this should look like:
>
> int scx_pick_idle_cpu_from_hop(struct cpumask *hop_cpus, struct cpumask *cpus_allowed)
> {
> int node, cpu, random_cpu;
>
> do {
>
> /* Pick a 'random' CPU in the hop */
> random_cpu = cpumask_any_and_distribute(hop_cpus, cpus_allowed);
> if (random_cpu >= nr_cpu_ids)
> continue;
>
> node = cpu_to_node(random_cpu);
>
> /* Find an idle CPU in the same node */
> cpu = scx_pick_idle_cpu_from_node(node, cpus_allowed, flags);
> if (cpu >= 0)
> break;
>
> /* No luck? Try other nodes */
> } while (cpumask_andnot(hop_cpus, hop_cpus, cpumask_of_node(node)));
>
> return cpu;
> }
>
> static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, s32 prev_cpu, u64 flags)
> {
> const struct cpumask *next, *prev = cpu_none_mask;
> int prev_node = cpu_to_node(prev_cpu);
> ...
> for_each_numa_hop_mask(next, prev_node) {
> cpumask_andnot(hop_cpus, next, prev);
> cpu = scx_pick_idle_cpu_from_hop(hop_cpus, cpus_allowed);
> prev = next;
> }
> ...
> }
>
> Not tested, but should work.
Makes sense to me; I'll do some testing with this.
> > @@ -3636,17 +3748,27 @@ static void set_cpus_allowed_scx(struct task_struct *p,
> >
> > static void reset_idle_masks(void)
> > {
> > + int node;
> > +
> > /*
> > * Consider all online cpus idle. Should converge to the actual state
> > * quickly.
> > */
> > - cpumask_copy(idle_masks.cpu, cpu_online_mask);
> > - cpumask_copy(idle_masks.smt, cpu_online_mask);
> > + for_each_node_state(node, N_POSSIBLE) {
>
> for_each_node()
>
Ok.
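For reference, with the per-node masks reset_idle_masks() would then
look something like this (untested sketch, restricting each per-node
mask to the online CPUs of that node):

	static void reset_idle_masks(void)
	{
		int node;

		/*
		 * Consider all online cpus idle. Should converge to the
		 * actual state quickly.
		 */
		for_each_node(node) {
			const struct cpumask *node_mask = cpumask_of_node(node);

			cpumask_and(get_idle_cpumask_node(node),
				    cpu_online_mask, node_mask);
			cpumask_and(get_idle_smtmask_node(node),
				    cpu_online_mask, node_mask);
		}
	}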
And I don't think I've missed any comments this time; sorry about that,
and thanks again for the review!
-Andrea