[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <626c1d010ff720c8c2beb7bdd36b0565850a6ab3.camel@linux.ibm.com>
Date: Fri, 21 Nov 2025 14:27:38 +0100
From: Gerd Bayer <gbayer@...ux.ibm.com>
To: Tobias Schumacher <ts@...ux.ibm.com>, Heiko Carstens
<hca@...ux.ibm.com>,
Vasily Gorbik <gor@...ux.ibm.com>,
Alexander Gordeev
<agordeev@...ux.ibm.com>,
Christian Borntraeger
<borntraeger@...ux.ibm.com>,
Sven Schnelle <svens@...ux.ibm.com>,
Niklas
Schnelle <schnelle@...ux.ibm.com>,
Gerald Schaefer
<gerald.schaefer@...ux.ibm.com>,
Halil Pasic <pasic@...ux.ibm.com>,
Matthew Rosato <mjrosato@...ux.ibm.com>,
Thomas Gleixner
<tglx@...utronix.de>
Cc: linux-kernel@...r.kernel.org, linux-s390@...r.kernel.org,
Farhan Ali
<alifm@...ux.ibm.com>
Subject: Re: [PATCH v5 2/2] s390/pci: Migrate s390 IRQ logic to IRQ domain
API
Hi Tobias,
sorry for being late with my comments...
On Fri, 2025-11-21 at 06:32 +0100, Tobias Schumacher wrote:
> s390 is one of the last architectures using the legacy API for setup and
> teardown of PCI MSI IRQs. Migrate the s390 IRQ allocation and teardown
> to the MSI parent domain API. For details, see:
>
> https://lore.kernel.org/lkml/20221111120501.026511281@linutronix.de
>
> In detail, create an MSI parent domain for each PCI domain. When a PCI
> device sets up MSI or MSI-X IRQs, the library creates a per-device IRQ
> domain for this device, which is used by the device for allocating and
> freeing IRQs.
>
> The per-device domain delegates this allocation and freeing to the
> parent-domain. In the end, the corresponding callbacks of the parent
> domain are responsible for allocating and freeing the IRQs.
>
> The allocation is split into two parts:
> - zpci_msi_prepare() is called once for each device and allocates the
> required resources. On s390, each PCI function has its own airq
> vector and a summary bit, which must be configured once per function.
> This is done in prepare().
> - zpci_msi_alloc() can be called multiple times for allocating one or
> more MSI/MSI-X IRQs. This creates a mapping between the virtual IRQ
> number in the kernel and the hardware IRQ number.
>
> Freeing is split into two counterparts:
> - zpci_msi_free() reverts the effects of zpci_msi_alloc() and
> - zpci_msi_teardown() reverts the effects of zpci_msi_prepare(). This is
> called once when all IRQs are freed before a device is removed.
>
> Since the parent domain in the end allocates the IRQs, the hwirq
> encoding must be unambiguous for all IRQs of all devices. This is
> achieved by
>
> encoding the hwirq using the PCI function id and the MSI
> index.
This is no longer true with the per-PCI-domain irq domains! But you
encode the hwirq with the devfn within the PCI domain, instead.
> Reviewed-by: Niklas Schnelle <schnelle@...ux.ibm.com>
> Reviewed-by: Farhan Ali <alifm@...ux.ibm.com>
> Signed-off-by: Tobias Schumacher <ts@...ux.ibm.com>
> ---
> arch/s390/Kconfig | 1 +
> arch/s390/include/asm/pci.h | 4 +
> arch/s390/pci/pci_bus.c | 21 ++-
> arch/s390/pci/pci_irq.c | 333 +++++++++++++++++++++++++++-----------------
> 4 files changed, 227 insertions(+), 132 deletions(-)
>
> diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
> index 778ce20d34046cad84dd4ef57cab5a662e5796d9..46ab67d607f0db7f5f108106172699a5eebfc8c8 100644
> --- a/arch/s390/Kconfig
> +++ b/arch/s390/Kconfig
> @@ -251,6 +251,7 @@ config S390
> select HOTPLUG_SMT
> select IOMMU_HELPER if PCI
> select IOMMU_SUPPORT if PCI
> + select IRQ_MSI_LIB if PCI
Nit: There's precedence for both versions (above and below!) but I
personally would prefer to indent the pre-condition "if PCI" so it
stands out. Maybe that's a generic clean-up task for this arch-specific
file...
> select KASAN_VMALLOC if KASAN
> select LOCK_MM_AND_FIND_VMA
> select MMU_GATHER_MERGE_VMAS
> diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h
> index a32f465ecf73a5cc3408a312d94ec888d62848cc..60abc84cf14fe6fb1ee149df688eea94f0983ed0 100644
> --- a/arch/s390/include/asm/pci.h
> +++ b/arch/s390/include/asm/pci.h
> @@ -5,6 +5,7 @@
> #include <linux/pci.h>
> #include <linux/mutex.h>
> #include <linux/iommu.h>
> +#include <linux/irqdomain.h>
> #include <linux/pci_hotplug.h>
> #include <asm/pci_clp.h>
> #include <asm/pci_debug.h>
> @@ -109,6 +110,7 @@ struct zpci_bus {
> struct list_head resources;
> struct list_head bus_next;
> struct resource bus_resource;
> + struct irq_domain *msi_parent_domain;
> int topo; /* TID if topo_is_tid, PCHID otherwise */
> int domain_nr;
> u8 multifunction : 1;
> @@ -310,6 +312,8 @@ int zpci_dma_exit_device(struct zpci_dev *zdev);
> /* IRQ */
> int __init zpci_irq_init(void);
> void __init zpci_irq_exit(void);
> +int zpci_create_parent_msi_domain(struct zpci_bus *zbus);
> +void zpci_remove_parent_msi_domain(struct zpci_bus *zbus);
>
> /* FMB */
> int zpci_fmb_enable_device(struct zpci_dev *);
> diff --git a/arch/s390/pci/pci_bus.c b/arch/s390/pci/pci_bus.c
> index be8c697fea0cc755cfdb4fb0a9e3b95183bec0dc..9da261b600df805ef76f3331975ac8fce6178908 100644
> --- a/arch/s390/pci/pci_bus.c
> +++ b/arch/s390/pci/pci_bus.c
> @@ -15,6 +15,7 @@
> #include <linux/err.h>
> #include <linux/delay.h>
> #include <linux/seq_file.h>
> +#include <linux/irqdomain.h>
> #include <linux/jump_label.h>
> #include <linux/pci.h>
> #include <linux/printk.h>
> @@ -189,7 +190,7 @@ static bool zpci_bus_is_multifunction_root(struct zpci_dev *zdev)
> static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, struct pci_ops *ops)
> {
> struct pci_bus *bus;
> - int domain;
> + int domain, rc;
>
> domain = zpci_alloc_domain((u16)fr->uid);
> if (domain < 0)
> @@ -199,19 +200,28 @@ static int zpci_bus_create_pci_bus(struct zpci_bus *zbus, struct zpci_dev *fr, s
> zbus->multifunction = zpci_bus_is_multifunction_root(fr);
> zbus->max_bus_speed = fr->max_bus_speed;
>
> + rc = zpci_create_parent_msi_domain(zbus);
> + if (rc)
> + goto out_free_domain;
> +
If you shortened this to use the call to
zpci_create_parent_msi_domain() as predicate for "if" you could do
without the additional rc.
>
> /*
> * Note that the zbus->resources are taken over and zbus->resources
> * is empty after a successful call
> */
> bus = pci_create_root_bus(NULL, ZPCI_BUS_NR, ops, zbus, &zbus->resources);
> - if (!bus) {
> - zpci_free_domain(zbus->domain_nr);
> - return -EFAULT;
> - }
> + if (!bus)
> + goto out_remove_msi_domain;
Or do you want to set rc to -EFAULT here, and return the "original" rc
in the error exits?
>
> zbus->bus = bus;
> + dev_set_msi_domain(&zbus->bus->dev, zbus->msi_parent_domain);
>
> return 0;
> +
> +out_remove_msi_domain:
> + zpci_remove_parent_msi_domain(zbus);
> +out_free_domain:
> + zpci_free_domain(zbus->domain_nr);
> + return -EFAULT;
> }
>
> static void zpci_bus_release(struct kref *kref)
> @@ -232,6 +242,7 @@ static void zpci_bus_release(struct kref *kref)
> mutex_lock(&zbus_list_lock);
> list_del(&zbus->bus_next);
> mutex_unlock(&zbus_list_lock);
> + zpci_remove_parent_msi_domain(zbus);
> kfree(zbus);
> }
>
> diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c
> index e73be96ce5fe6473fc193d65b8f0ff635d6a98ba..b0be21ab56995e81f54339fc77167f5930182542 100644
> --- a/arch/s390/pci/pci_irq.c
> +++ b/arch/s390/pci/pci_irq.c
> @@ -7,6 +7,7 @@
> #include <linux/kernel_stat.h>
> #include <linux/pci.h>
> #include <linux/msi.h>
> +#include <linux/irqchip/irq-msi-lib.h>
> #include <linux/smp.h>
>
> #include <asm/isc.h>
> @@ -110,43 +111,41 @@ static int zpci_set_irq(struct zpci_dev *zdev)
> return rc;
> }
>
> -/* Clear adapter interruptions */
> -static int zpci_clear_irq(struct zpci_dev *zdev)
Any specific reason, why you removed zpci_clear_irq() indirecting to
airq vs. directed_irq - but kept zpci_set_irq()? Just saying this is
imbalanced now.
> +static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest,
> + bool force)
> {
> - int rc;
> -
> - if (irq_delivery == DIRECTED)
> - rc = zpci_clear_directed_irq(zdev);
> - else
> - rc = zpci_clear_airq(zdev);
> -
> - return rc;
> + irq_data_update_affinity(data, dest);
> + return IRQ_SET_MASK_OK;
> }
>
> -static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest,
> - bool force)
> +static void zpci_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
> {
> - struct msi_desc *entry = irq_data_get_msi_desc(data);
> - struct msi_msg msg = entry->msg;
> - int cpu_addr = smp_cpu_get_cpu_address(cpumask_first(dest));
> + struct msi_desc *desc = irq_data_get_msi_desc(data);
> + struct zpci_dev *zdev = to_zpci_dev(desc->dev);
>
> - msg.address_lo &= 0xff0000ff;
> - msg.address_lo |= (cpu_addr << 8);
> - pci_write_msi_msg(data->irq, &msg);
> + if (irq_delivery == DIRECTED) {
> + int cpu = cpumask_first(irq_data_get_affinity_mask(data));
>
> - return IRQ_SET_MASK_OK;
> + msg->address_lo = zdev->msi_addr & 0xff0000ff;
> + msg->address_lo |= (smp_cpu_get_cpu_address(cpu) << 8);
> + } else {
> + msg->address_lo = zdev->msi_addr & 0xffffffff;
> + }
> + msg->address_hi = zdev->msi_addr >> 32;
> + msg->data = data->hwirq & 0xffffffff;
> }
>
> static struct irq_chip zpci_irq_chip = {
> .name = "PCI-MSI",
> - .irq_unmask = pci_msi_unmask_irq,
> - .irq_mask = pci_msi_mask_irq,
> + .irq_compose_msi_msg = zpci_compose_msi_msg
> };
>
> static void zpci_handle_cpu_local_irq(bool rescan)
> {
> struct airq_iv *dibv = zpci_ibv[smp_processor_id()];
> union zpci_sic_iib iib = {{0}};
> + struct irq_domain *msi_domain;
> + irq_hw_number_t hwirq;
> unsigned long bit;
> int irqs_on = 0;
>
[... snip ...]
>
> @@ -290,146 +297,217 @@ static int __alloc_airq(struct zpci_dev *zdev, int msi_vecs,
> return 0;
> }
>
> -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
> +bool arch_restore_msi_irqs(struct pci_dev *pdev)
> {
> - unsigned int hwirq, msi_vecs, irqs_per_msi, i, cpu;
> struct zpci_dev *zdev = to_zpci(pdev);
> - struct msi_desc *msi;
> - struct msi_msg msg;
> - unsigned long bit;
> - int cpu_addr;
> - int rc, irq;
>
> - zdev->aisb = -1UL;
> - zdev->msi_first_bit = -1U;
> + zpci_set_irq(zdev);
> + return true;
> +}
> +
> +static struct airq_struct zpci_airq = {
> + .handler = zpci_floating_irq_handler,
> + .isc = PCI_ISC,
> +};
> +
> +/*
> + * Encode the hwirq number for the parent domain. The encoding must be unique
> + * for each IRQ of each device in the parent domain, so it uses the devfn to
> + * identify the device and the msi_index to identify the IRQ within that device.
> + */
> +static inline u32 zpci_encode_hwirq(u8 devfn, u16 msi_index)
> +{
> + return (devfn << 16) | msi_index;
> +}
> +
> +static inline u16 zpci_decode_hwirq_msi_index(irq_hw_number_t irq)
> +{
> + return irq & GENMASK_U16(15, 0);
Please don't use GENMASK_U16. It is harder to read than any explicit
constant like 0x00FF, especially since its parameters contradict the
architecture's endianess.
But then, is this called anywhere?
> +}
> +
> +static int zpci_msi_prepare(struct irq_domain *domain,
> + struct device *dev, int nvec,
> + msi_alloc_info_t *info)
> +{
> + struct zpci_dev *zdev = to_zpci_dev(dev);
> + struct pci_dev *pdev = to_pci_dev(dev);
> + unsigned long bit;
> + int msi_vecs, rc;
>
> msi_vecs = min_t(unsigned int, nvec, zdev->max_msi);
> - if (msi_vecs < nvec) {
> - pr_info("%s requested %d irqs, allocate system limit of %d",
> + if (msi_vecs < nvec)
> + pr_info("%s requested %d IRQs, allocate system limit of %d\n",
> pci_name(pdev), nvec, zdev->max_msi);
> - }
>
> rc = __alloc_airq(zdev, msi_vecs, &bit);
> - if (rc < 0)
> + if (rc) {
> + pr_err("Allocating adapter IRQs for %s failed\n", pci_name(pdev));
> return rc;
> + }
>
> - /*
> - * Request MSI interrupts:
> - * When using MSI, nvec_used interrupt sources and their irq
> - * descriptors are controlled through one msi descriptor.
> - * Thus the outer loop over msi descriptors shall run only once,
> - * while two inner loops iterate over the interrupt vectors.
> - * When using MSI-X, each interrupt vector/irq descriptor
> - * is bound to exactly one msi descriptor (nvec_used is one).
> - * So the inner loops are executed once, while the outer iterates
> - * over the MSI-X descriptors.
> - */
> - hwirq = bit;
> - msi_for_each_desc(msi, &pdev->dev, MSI_DESC_NOTASSOCIATED) {
> - if (hwirq - bit >= msi_vecs)
> - break;
> - irqs_per_msi = min_t(unsigned int, msi_vecs, msi->nvec_used);
> - irq = __irq_alloc_descs(-1, 0, irqs_per_msi, 0, THIS_MODULE,
> - (irq_delivery == DIRECTED) ?
> - msi->affinity : NULL);
> - if (irq < 0)
> - return -ENOMEM;
> -
> - for (i = 0; i < irqs_per_msi; i++) {
> - rc = irq_set_msi_desc_off(irq, i, msi);
> - if (rc)
> - return rc;
> - irq_set_chip_and_handler(irq + i, &zpci_irq_chip,
> - handle_percpu_irq);
> + zdev->msi_first_bit = bit;
> + zdev->msi_nr_irqs = msi_vecs;
> + rc = zpci_set_irq(zdev);
> + if (rc) {
> + pr_err("Registering adapter IRQs for %s failed\n",
> + pci_name(pdev));
> + if (irq_delivery == DIRECTED) {
> + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, msi_vecs);
> + } else {
> + zpci_clear_airq(zdev);
> + airq_iv_release(zdev->aibv);
> + zdev->aibv = NULL;
> + airq_iv_free_bit(zpci_sbv, zdev->aisb);
> + zdev->aisb = -1UL;
These two failure clean-ups look a lot like
zpci_msi_teardown_directed/_floating() below. Could these be called
instead of duplicating the code?
> }
> + zdev->msi_first_bit = -1U;
> + return rc;
> + }
>
> - msg.data = hwirq - bit;
> - if (irq_delivery == DIRECTED) {
> - if (msi->affinity)
> - cpu = cpumask_first(&msi->affinity->mask);
> - else
> - cpu = 0;
> - cpu_addr = smp_cpu_get_cpu_address(cpu);
> + return 0;
> +}
> +
> +static void zpci_msi_teardown_directed(struct zpci_dev *zdev)
> +{
> + zpci_clear_directed_irq(zdev);
> + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->max_msi);
> + zdev->msi_first_bit = -1U;
> +}
> +
> +static void zpci_msi_teardown_floating(struct zpci_dev *zdev)
> +{
> + zpci_clear_airq(zdev);
> + airq_iv_release(zdev->aibv);
> + zdev->aibv = NULL;
> + airq_iv_free_bit(zpci_sbv, zdev->aisb);
> + zdev->aisb = -1UL;
> + zdev->msi_first_bit = -1U;
> +}
>
> - msg.address_lo = zdev->msi_addr & 0xff0000ff;
> - msg.address_lo |= (cpu_addr << 8);
> +static void zpci_msi_teardown(struct irq_domain *domain, msi_alloc_info_t *arg)
> +{
> + struct zpci_dev *zdev = to_zpci_dev(domain->dev);
>
> + if (irq_delivery == DIRECTED)
> + zpci_msi_teardown_directed(zdev);
> + else
> + zpci_msi_teardown_floating(zdev);
> +}
> +
[... snip ...]
No more findings at this point in time.
Thanks for your work!
Gerd
Powered by blists - more mailing lists