linux-kernel - RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <BN7PR02MB41487FDBD42B364683A5ACA6D4B0A@BN7PR02MB4148.namprd02.prod.outlook.com>
Date:   Thu, 16 Nov 2023 05:25:50 +0000
From:   Michael Kelley <mhklinux@...look.com>
To:     Souradeep Chakrabarti <schakrabarti@...ux.microsoft.com>,
        "kys@...rosoft.com" <kys@...rosoft.com>,
        "haiyangz@...rosoft.com" <haiyangz@...rosoft.com>,
        "wei.liu@...nel.org" <wei.liu@...nel.org>,
        "decui@...rosoft.com" <decui@...rosoft.com>,
        "davem@...emloft.net" <davem@...emloft.net>,
        "edumazet@...gle.com" <edumazet@...gle.com>,
        "kuba@...nel.org" <kuba@...nel.org>,
        "pabeni@...hat.com" <pabeni@...hat.com>,
        "longli@...rosoft.com" <longli@...rosoft.com>,
        "sharmaajay@...rosoft.com" <sharmaajay@...rosoft.com>,
        "leon@...nel.org" <leon@...nel.org>,
        "cai.huoqing@...ux.dev" <cai.huoqing@...ux.dev>,
        "ssengar@...ux.microsoft.com" <ssengar@...ux.microsoft.com>,
        "vkuznets@...hat.com" <vkuznets@...hat.com>,
        "tglx@...utronix.de" <tglx@...utronix.de>,
        "linux-hyperv@...r.kernel.org" <linux-hyperv@...r.kernel.org>,
        "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "linux-rdma@...r.kernel.org" <linux-rdma@...r.kernel.org>
CC:     "schakrabarti@...rosoft.com" <schakrabarti@...rosoft.com>,
        "paulros@...rosoft.com" <paulros@...rosoft.com>
Subject: RE: [PATCH net-next] net: mana: Assigning IRQ affinity on HT cores

From: Souradeep Chakrabarti <schakrabarti@...ux.microsoft.com> Sent: Wednesday, November 15, 2023 5:49 AM
> 
> Existing MANA design assigns IRQ affinity to every sibling CPUs, which causes

Let's make this more specific by saying "...  assigns IRQs to every CPU,
Including sibling hyper-threads in a core, which causes ...."

> IRQ coalescing and may reduce the network performance with RSS.

What is "IRQ coalescing"?

> 
> Improve the performance by adhering the configuration for RSS, which
> prioritise IRQ affinity on HT cores.

I don't understand this sentence.  What is "adhering the configuration
for RSS"?  And what does it mean to prioritise IRQ affinity on HT cores?
I think you mean to first assign IRQs to only one hyper-thread in a core,
and to use the additional hyper-threads in a core only when there are
more IRQs than cores.

> 
> Signed-off-by: Souradeep Chakrabarti <schakrabarti@...ux.microsoft.com>
> ---
>  .../net/ethernet/microsoft/mana/gdma_main.c   | 126 ++++++++++++++++-
> -
>  1 file changed, 117 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 6367de0c2c2e..839be819d46e 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -1243,13 +1243,115 @@ void mana_gd_free_res_map(struct
> gdma_resource *r)
>  	r->size = 0;
>  }
> 
> +static void cpu_mask_set(cpumask_var_t *filter_mask, cpumask_var_t
> **filter_mask_list)
> +{
> +	unsigned int core_count = 0, cpu;
> +	cpumask_var_t *filter_mask_list_tmp;
> +
> +	BUG_ON(!filter_mask || !filter_mask_list);

This check seems superfluous. The function is invoked from only
one call site in the code below, and that site call site can easily
ensure that it doesn't pass a NULL value for either parameter.

> +	filter_mask_list_tmp = *filter_mask_list;
> +	cpumask_copy(*filter_mask, cpu_online_mask);
> +	/* for each core create a cpumask lookup table,
> +	 * which stores all the corresponding siblings
> +	 */
> +	for_each_cpu(cpu, *filter_mask) {
> +		BUG_ON(!alloc_cpumask_var(&filter_mask_list_tmp[core_count], GFP_KERNEL));

Don't panic if a memory allocation fails.  Must back out, clean up,
and return an error.

> +		cpumask_or(filter_mask_list_tmp[core_count], filter_mask_list_tmp[core_count],
> +			   topology_sibling_cpumask(cpu));

alloc_cpumask_var() does not zero out the returned cpumask.  So the
cpumask_or() is operating on uninitialized garbage.  Use
zalloc_cpumask_var() to get a cpumask initialized to zero.

But why is the cpumask_or() being done at all?  Couldn't this
just be a cpumask_copy()?  In that case, alloc_cpumask_var() is OK.

> +		cpumask_andnot(*filter_mask, *filter_mask, topology_sibling_cpumask(cpu));
> +		core_count++;
> +	}
> +}
> +
> +static int irq_setup(int *irqs, int nvec)
> +{
> +	cpumask_var_t filter_mask;
> +	cpumask_var_t *filter_mask_list;
> +	unsigned int cpu_first, cpu, irq_start, cores = 0;
> +	int i, core_count = 0, numa_node, cpu_count = 0, err = 0;
> +
> +	BUG_ON(!alloc_cpumask_var(&filter_mask, GFP_KERNEL));

Again, don't panic if a memory allocation fails.  Must back out and
return an error.

> +	cpus_read_lock();
> +	cpumask_copy(filter_mask, cpu_online_mask);

For readability, I'd suggest adding a blank line before any
of the multi-line comments below.

> +	/* count the number of cores
> +	 */
> +	for_each_cpu(cpu, filter_mask) {
> +		cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
> +		cores++;
> +	}
> +	filter_mask_list = kcalloc(cores, sizeof(cpumask_var_t), GFP_KERNEL);
> +	if (!filter_mask_list) {
> +		err = -ENOMEM;
> +		goto free_irq;
> +	}
> +	/* if number of cpus are equal to max_queues per port, then
> +	 * one extra interrupt for the hardware channel communication.
> +	 */

Note that higher level code may set nvec based on the # of
online CPUs, but until the cpus_read_lock is held, the # of online
CPUs can change. So nvec might have been determined when the
# of CPUs was 8, but 2 CPUs could go offline before the cpus_read_lock
is obtained.  So nvec could be bigger than just 1 more than
num_online_cpus().  I'm not sure how to handle the check below to
special case the hardware communication channel.  But realize
that the main "for" loop below could end up assigning multiple
IRQs to the same CPU if nvec > num_online_cpus().

> +	if (nvec - 1 == num_online_cpus()) {
> +		irq_start = 1;
> +		cpu_first = cpumask_first(cpu_online_mask);
> +		irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
> +	} else {
> +		irq_start = 0;
> +	}
> +	/* reset the core_count and num_node to 0.
> +	 */
> +	core_count = 0;
> +	numa_node = 0;
> +	cpu_mask_set(&filter_mask, &filter_mask_list);

FWIW, passing filter_mask as the first argument above was
confusing to me until I realized that the value of filter_mask
doesn't matter -- it's just to use the memory allocated for
filter_mask.  Maybe a comment would help.  And ditto for
the invocation of cpu_mask_set() further down.

> +	/* for each interrupt find the cpu of a particular
> +	 * sibling set and if it belongs to the specific numa
> +	 * then assign irq to it and clear the cpu bit from
> +	 * the corresponding sibling list from filter_mask_list.
> +	 * Increase the cpu_count for that node.
> +	 * Once all cpus for a numa node is assigned, then
> +	 * move to different numa node and continue the same.
> +	 */
> +	for (i = irq_start; i < nvec; ) {
> +		cpu_first = cpumask_first(filter_mask_list[core_count]);
> +		if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) == numa_node) {

Note that it's possible to have a NUMA node with zero online CPUs.
It could be a NUMA node that was originally a memory-only NUMA
node and never had any CPUs, or the NUMA node could have had
CPUs, but they were all later taken offline.  Such a NUMA node is
still considered to be online because it has memory, but it has
no online CPUs.

If this code ever sets "numa_node" to such a NUMA node,
the "for" loop will become an infinite loop because the "if"
statement above will never find a CPU that is assigned to
"numa_node".

> +			irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
> +			cpumask_clear_cpu(cpu_first, filter_mask_list[core_count]);
> +			cpu_count = cpu_count + 1;
> +			i = i + 1;
> +			/* checking if all the cpus are used from the
> +			 * particular node.
> +			 */
> +			if (cpu_count == nr_cpus_node(numa_node)) {
> +				numa_node = numa_node + 1;
> +				if (numa_node == num_online_nodes()) {
> +					cpu_mask_set(&filter_mask, &filter_mask_list);

The second and subsequent calls to cpu_mask_set() will
leak any memory previously allocated by alloc_cpumask_var()
in cpu_mask_set().

I agree with Haiyang's comment about starting with a NUMA
node other than NUMA node zero.  But if you do so, note that
testing for wrap-around at num_online_nodes() won't be
equivalent to testing for getting back to the starting NUMA node.
You really want to run cpu_mask_set() again only when you get
back to the starting NUMA node.

> +					numa_node = 0;
> +				}
> +				cpu_count = 0;
> +				core_count = 0;
> +				continue;
> +			}
> +		}
> +		if ((core_count + 1) % cores == 0)
> +			core_count = 0;
> +		else
> +			core_count++;

The if/else could be more compactly written as:

		if (++core_count == cores)
			core_count = 0;

> +	}
> +free_irq:
> +	cpus_read_unlock();
> +	if (filter_mask)

This check for non-NULL filter_mask is incorrect and might
not even compile if CONFIG_CPUMASK_OFFSTACK=n.  For testing
purposes, you should make sure to build and run with each
option:  with CONFIG_CPUMASK_OFFSTACK=y and
also CONFIG_CPUMASK_OFFSTACK=n.

It's safe to just call free_cpumask_var() without the check.

> +		free_cpumask_var(filter_mask);
> +	if (filter_mask_list) {
> +		for (core_count = 0; core_count < cores; core_count++)
> +			free_cpumask_var(filter_mask_list[core_count]);
> +		kfree(filter_mask_list);
> +	}
> +	return err;
> +}
> +
>  static int mana_gd_setup_irqs(struct pci_dev *pdev)
>  {
>  	unsigned int max_queues_per_port = num_online_cpus();
>  	struct gdma_context *gc = pci_get_drvdata(pdev);
>  	struct gdma_irq_context *gic;
> -	unsigned int max_irqs, cpu;
> -	int nvec, irq;
> +	unsigned int max_irqs;
> +	int nvec, *irqs, irq;
>  	int err, i = 0, j;
> 
>  	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
> @@ -1261,6 +1363,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
>  	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
>  	if (nvec < 0)
>  		return nvec;
> +	irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL);
> +	if (!irqs) {
> +		err = -ENOMEM;
> +		goto free_irq_vector;
> +	}
> 
>  	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
>  				   GFP_KERNEL);
> @@ -1281,20 +1388,20 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
>  			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
>  				 i - 1, pci_name(pdev));
> 
> -		irq = pci_irq_vector(pdev, i);
> -		if (irq < 0) {
> -			err = irq;
> +		irqs[i] = pci_irq_vector(pdev, i);
> +		if (irqs[i] < 0) {
> +			err = irqs[i];
>  			goto free_irq;
>  		}
> 
> -		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
> +		err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
>  		if (err)
>  			goto free_irq;
> -
> -		cpu = cpumask_local_spread(i, gc->numa_node);
> -		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
>  	}
> 
> +	err = irq_setup(irqs, nvec);
> +	if (err)
> +		goto free_irq;
>  	err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
>  	if (err)
>  		goto free_irq;
> @@ -1314,6 +1421,7 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
>  	}
> 
>  	kfree(gc->irq_contexts);
> +	kfree(irqs);
>  	gc->irq_contexts = NULL;
>  free_irq_vector:
>  	pci_free_irq_vectors(pdev);
> --
> 2.34.1