linux-kernel - Re: [PATCH v12 3/7] genirq: Add mechanism to multiplex a single HW IPI

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <86k03fmkox.wl-maz@kernel.org>
Date:   Mon, 28 Nov 2022 10:34:38 +0000
From:   Marc Zyngier <maz@...nel.org>
To:     Anup Patel <apatel@...tanamicro.com>
Cc:     Palmer Dabbelt <palmer@...belt.com>,
        Paul Walmsley <paul.walmsley@...ive.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Daniel Lezcano <daniel.lezcano@...aro.org>,
        Atish Patra <atishp@...shpatra.org>,
        Alistair Francis <Alistair.Francis@....com>,
        Anup Patel <anup@...infault.org>,
        linux-riscv@...ts.infradead.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v12 3/7] genirq: Add mechanism to multiplex a single HW IPI

On Sat, 26 Nov 2022 17:34:49 +0000,
Anup Patel <apatel@...tanamicro.com> wrote:
> 
> All RISC-V platforms have a single HW IPI provided by the INTC local
> interrupt controller. The HW method to trigger INTC IPI can be through
> external irqchip (e.g. RISC-V AIA), through platform specific device
> (e.g. SiFive CLINT timer), or through firmware (e.g. SBI IPI call).
> 
> To support multiple IPIs on RISC-V, we add a generic IPI multiplexing
> mechanism which help us create multiple virtual IPIs using a single
> HW IPI. This generic IPI multiplexing is inspired from the Apple AIC
> irqchip driver and it is shared by various RISC-V irqchip drivers.
> 
> Signed-off-by: Anup Patel <apatel@...tanamicro.com>
> ---
>  include/linux/irq.h  |   4 +
>  kernel/irq/Kconfig   |   5 ++
>  kernel/irq/Makefile  |   1 +
>  kernel/irq/ipi-mux.c | 210 +++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 220 insertions(+)
>  create mode 100644 kernel/irq/ipi-mux.c
> 
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index c3eb89606c2b..6024e1ee1257 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -1266,6 +1266,10 @@ int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest);
>  int ipi_send_single(unsigned int virq, unsigned int cpu);
>  int ipi_send_mask(unsigned int virq, const struct cpumask *dest);
>  
> +void ipi_mux_process(void);
> +int ipi_mux_create(unsigned int nr_ipi,
> +		   void (*mux_send)(const struct cpumask *));
> +
>  #ifdef CONFIG_GENERIC_IRQ_MULTI_HANDLER
>  /*
>   * Registers a generic IRQ handling function as the top-level IRQ handler in
> diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
> index db3d174c53d4..df17dbc54b02 100644
> --- a/kernel/irq/Kconfig
> +++ b/kernel/irq/Kconfig
> @@ -86,6 +86,11 @@ config GENERIC_IRQ_IPI
>  	depends on SMP
>  	select IRQ_DOMAIN_HIERARCHY
>  
> +# Generic IRQ IPI Mux support
> +config GENERIC_IRQ_IPI_MUX
> +	bool
> +	depends on SMP
> +
>  # Generic MSI interrupt support
>  config GENERIC_MSI_IRQ
>  	bool
> diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
> index b4f53717d143..f19d3080bf11 100644
> --- a/kernel/irq/Makefile
> +++ b/kernel/irq/Makefile
> @@ -15,6 +15,7 @@ obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
>  obj-$(CONFIG_PM_SLEEP) += pm.o
>  obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
>  obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o
> +obj-$(CONFIG_GENERIC_IRQ_IPI_MUX) += ipi-mux.o
>  obj-$(CONFIG_SMP) += affinity.o
>  obj-$(CONFIG_GENERIC_IRQ_DEBUGFS) += debugfs.o
>  obj-$(CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR) += matrix.o
> diff --git a/kernel/irq/ipi-mux.c b/kernel/irq/ipi-mux.c
> new file mode 100644
> index 000000000000..366d8cd5320b
> --- /dev/null
> +++ b/kernel/irq/ipi-mux.c
> @@ -0,0 +1,210 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * Multiplex several virtual IPIs over a single HW IPI.
> + *
> + * Copyright The Asahi Linux Contributors
> + * Copyright (c) 2022 Ventana Micro Systems Inc.
> + */
> +
> +#define pr_fmt(fmt) "ipi-mux: " fmt
> +#include <linux/cpu.h>
> +#include <linux/init.h>
> +#include <linux/irq.h>
> +#include <linux/irqchip.h>
> +#include <linux/irqchip/chained_irq.h>
> +#include <linux/irqdomain.h>
> +#include <linux/jump_label.h>
> +#include <linux/percpu.h>
> +#include <linux/smp.h>
> +
> +struct ipi_mux_cpu {
> +	atomic_t			enable;
> +	atomic_t			bits;
> +	struct cpumask			send_mask;
> +};
> +
> +static struct ipi_mux_cpu __percpu *ipi_mux_pcpu;
> +static struct irq_domain *ipi_mux_domain;
> +static void (*ipi_mux_send)(const struct cpumask *mask);
> +
> +static void ipi_mux_mask(struct irq_data *d)
> +{
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> +
> +	atomic_andnot(BIT(irqd_to_hwirq(d)), &icpu->enable);
> +}
> +
> +static void ipi_mux_unmask(struct irq_data *d)
> +{
> +	u32 ibit = BIT(irqd_to_hwirq(d));
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> +
> +	atomic_or(ibit, &icpu->enable);
> +
> +	/*
> +	 * The atomic_or() above must complete before the atomic_read()
> +	 * below to avoid racing ipi_mux_send_mask().
> +	 */
> +	smp_mb__after_atomic();
> +
> +	/* If a pending IPI was unmasked, raise a parent IPI immediately. */
> +	if (atomic_read(&icpu->bits) & ibit)
> +		ipi_mux_send(cpumask_of(smp_processor_id()));
> +}
> +
> +static void ipi_mux_send_mask(struct irq_data *d, const struct cpumask *mask)
> +{
> +	u32 ibit = BIT(irqd_to_hwirq(d));
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> +	struct cpumask *send_mask = &icpu->send_mask;
> +	unsigned long flags;
> +	int cpu;
> +
> +	/*
> +	 * We use send_mask as a per-CPU variable so disable local
> +	 * interrupts to avoid being preempted.
> +	 */
> +	local_irq_save(flags);

The correct way to avoid preemption is to use preempt_disable(), which
is a lot cheaper than disabling interrupt on most architectures.

> +
> +	cpumask_clear(send_mask);

This thing is likely to be unnecessarily expensive on very large
systems, as it is proportional to the number of CPUs.

> +
> +	for_each_cpu(cpu, mask) {
> +		icpu = per_cpu_ptr(ipi_mux_pcpu, cpu);
> +		atomic_or(ibit, &icpu->bits);

The original code had an atomic_fetch_or_release() to allow eliding
the IPI if the target interrupt was already pending. Why is that code
gone? This is a pretty cheap and efficient optimisation.

> +
> +		/*
> +		 * The atomic_or() above must complete before
> +		 * the atomic_read() below to avoid racing with
> +		 * ipi_mux_unmask().
> +		 */
> +		smp_mb__after_atomic();
> +
> +		if (atomic_read(&icpu->enable) & ibit)
> +			cpumask_set_cpu(cpu, send_mask);
> +	}
> +
> +	/* Trigger the parent IPI */
> +	ipi_mux_send(send_mask);

IPIs are very rarely made pending on more than a single CPU at a
time. The overwhelming majority of them are targeting a single CPU. So
accumulating bits to avoid doing two or more "send" actions only
penalises the generic case.

My conclusion is that this "send_mask" can probably be removed,
together with the preemption fiddling.

> +
> +	local_irq_restore(flags);
> +}
> +
> +static const struct irq_chip ipi_mux_chip = {
> +	.name		= "IPI Mux",
> +	.irq_mask	= ipi_mux_mask,
> +	.irq_unmask	= ipi_mux_unmask,
> +	.ipi_send_mask	= ipi_mux_send_mask,
> +};

OK, you have now dropped the superfluous pre/post handlers. But the
need still exists. Case in point, the aic_handle_ipi() prologue and
epilogue to the interrupt handling. I have suggested last time that
the driver could provide the actual struct irq_chip in order to
provide the callbacks it requires.

Please realise that I will not take this patch if this cannot be made
to work with the single existing in-tree instance of an IPI MUX. 90%
of the code having been lifted from there, I think this is a pretty
fair ask.

> +
> +static int ipi_mux_domain_alloc(struct irq_domain *d, unsigned int virq,
> +				unsigned int nr_irqs, void *arg)
> +{
> +	int i;
> +
> +	for (i = 0; i < nr_irqs; i++) {
> +		irq_set_percpu_devid(virq + i);
> +		irq_domain_set_info(d, virq + i, i,
> +				    &ipi_mux_chip, d->host_data,

What does d->host_data represent here?

> +				    handle_percpu_devid_irq, NULL, NULL);
> +	}
> +
> +	return 0;
> +}
> +
> +static const struct irq_domain_ops ipi_mux_domain_ops = {
> +	.alloc		= ipi_mux_domain_alloc,
> +	.free		= irq_domain_free_irqs_top,
> +};
> +
> +/**
> + * ipi_mux_process - Process multiplexed virtual IPIs
> + */
> +void ipi_mux_process(void)
> +{
> +	struct ipi_mux_cpu *icpu = this_cpu_ptr(ipi_mux_pcpu);
> +	irq_hw_number_t hwirq;
> +	unsigned long ipis;
> +	unsigned int en;
> +
> +	/*
> +	 * Reading enable mask does not need to be ordered as long as
> +	 * this function called from interrupt handler because only
> +	 * the CPU itself can change it's own enable mask.
> +	 */
> +	en = atomic_read(&icpu->enable);
> +
> +	/*
> +	 * Clear the IPIs we are about to handle. This pairs with the
> +	 * atomic_fetch_or_release() in ipi_mux_send_mask().
> +	 */
> +	ipis = atomic_fetch_andnot(en, &icpu->bits) & en;
> +
> +	for_each_set_bit(hwirq, &ipis, BITS_PER_LONG)

BITS_PER_LONG...

> +		generic_handle_domain_irq(ipi_mux_domain, hwirq);
> +}
> +
> +/**
> + * ipi_mux_create - Create virtual IPIs multiplexed on top of a single
> + * parent IPI.
> + * @nr_ipi:		number of virtual IPIs to create. This should
> + *			be <= BITS_PER_TYPE(int)
> + * @mux_send:		callback to trigger parent IPI
> + *
> + * Returns first virq of the newly created virtual IPIs upon success
> + * or <=0 upon failure
> + */
> +int ipi_mux_create(unsigned int nr_ipi,
> +		   void (*mux_send)(const struct cpumask *))
> +{
> +	struct fwnode_handle *fwnode;
> +	struct irq_domain *domain;
> +	int rc;
> +
> +	if (ipi_mux_domain)
> +		return -EEXIST;
> +
> +	if (BITS_PER_TYPE(int) < nr_ipi || !mux_send)

... vs BITS_PER_TYPE(int) ...

	M.

-- 
Without deviation from the norm, progress is not possible.