Message-ID: <CAK9=C2X0Occ4jmeQ+Vp7EZA0WyTVQBaG1-95GNd_hK7PUTdrhg@mail.gmail.com>
Date: Sat, 8 Feb 2025 22:09:02 +0530
From: Anup Patel <apatel@...tanamicro.com>
To: Thomas Gleixner <tglx@...utronix.de>
Cc: Marc Zyngier <maz@...nel.org>, Shawn Guo <shawnguo@...nel.org>,
Sascha Hauer <s.hauer@...gutronix.de>, Pengutronix Kernel Team <kernel@...gutronix.de>,
Andrew Lunn <andrew@...n.ch>, Gregory Clement <gregory.clement@...tlin.com>,
Sebastian Hesselbarth <sebastian.hesselbarth@...il.com>, Palmer Dabbelt <palmer@...belt.com>,
Paul Walmsley <paul.walmsley@...ive.com>, Atish Patra <atishp@...shpatra.org>,
Andrew Jones <ajones@...tanamicro.com>, Sunil V L <sunilvl@...tanamicro.com>,
Anup Patel <anup@...infault.org>, linux-riscv@...ts.infradead.org,
linux-arm-kernel@...ts.infradead.org, linux-kernel@...r.kernel.org,
imx@...ts.linux.dev
Subject: Re: [PATCH v4 04/11] genirq: Introduce common irq_force_complete_move()
implementation

On Fri, Feb 7, 2025 at 1:22 AM Thomas Gleixner <tglx@...utronix.de> wrote:
>
> On Wed, Feb 05 2025 at 21:29, Anup Patel wrote:
> > GENERIC_PENDING_IRQ requires an arch-specific implementation of
> > irq_force_complete_move(). At the moment, only x86 implements it,
> > but for RISC-V irq_force_complete_move() is only needed when the
> > IMSIC driver is in use and not otherwise.
> >
> > To address this, introduce a common weak implementation of
> > irq_force_complete_move() which lets the irqchip do the actual
> > work.
> >
> > +void __weak irq_force_complete_move(struct irq_desc *desc)
> > +{
> > +	struct irq_data *d = irq_desc_get_irq_data(desc);
> > +	struct irq_chip *chip = irq_data_get_irq_chip(d);
> > +
> > +	if (chip && chip->irq_force_complete_move)
> > +		chip->irq_force_complete_move(d);
> > +}
>
> I seriously doubt that works as intended.
Ahh, yes. This function needs to consider hierarchical domains.
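For a PCI/MSI interrupt on RISC-V the irq_data chain would roughly
look like this (a sketch of my understanding, with any intermediate
domains elided and names taken from patch 9):

    irq_desc_get_irq_data(desc)   /* PCI/MSI device domain data */
        ->chip                    /* PCI/MSI chip, no callback  */
        ->parent_data             /* IMSIC base domain data     */
            ->chip                /* imsic_irq_base_chip, which has
                                     ->irq_force_complete_move  */

so the lookup has to walk ->parent_data instead of stopping at the
top-level chip.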
>
> In patch 9 you implement this as callback for the imsic_irq_base_chip,
> but that's not the top level chip in the hierarchy. The top level chip
> is the PCI/MSI device domain chip, which obviously does not have that
> callback set.
>
> Something like the uncompiled code below should work. That avoids the weak
> function and handles x86 in exactly the same way.
Okay, I will try the below code and include it in the next revision.
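On the IMSIC side, the wiring should then reduce to setting the new
callback in the base chip, roughly (a sketch only, other members
elided; imsic_irq_force_complete_move being the handler added in
patch 9):

    static struct irq_chip imsic_irq_base_chip = {
            .name                    = "IMSIC",
            ...
            .irq_force_complete_move = imsic_irq_force_complete_move,
    };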
Thanks,
Anup
>
> Thanks,
>
> tglx
> ---
> --- a/arch/x86/kernel/apic/vector.c
> +++ b/arch/x86/kernel/apic/vector.c
> @@ -888,8 +888,85 @@ static int apic_set_affinity(struct irq_
> 	return err ? err : IRQ_SET_MASK_OK;
> }
>
> +/*
> + * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
> + */
> +static void apic_force_complete_move(struct irq_data *irqd)
> +{
> +	unsigned int cpu = smp_processor_id();
> +	struct apic_chip_data *apicd;
> +	unsigned int vector;
> +
> +	guard(raw_spinlock)(&vector_lock);
> +	apicd = apic_chip_data(irqd);
> +	if (!apicd)
> +		return;
> +
> +	/*
> +	 * If prev_vector is empty or the descriptor is neither currently
> +	 * nor previously on the outgoing CPU, no action is required.
> +	 */
> +	vector = apicd->prev_vector;
> +	if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
> +		return;
> +
> +	/*
> +	 * This is tricky. If the cleanup of the old vector has not been
> +	 * done yet, then the following setaffinity call will fail with
> +	 * -EBUSY. This can leave the interrupt in a stale state.
> +	 *
> +	 * All CPUs are stuck in stop machine with interrupts disabled so
> +	 * calling __irq_complete_move() would be completely pointless.
> +	 *
> +	 * 1) The interrupt is in move_in_progress state. That means that we
> +	 *    have not seen an interrupt since the io_apic was reprogrammed to
> +	 *    the new vector.
> +	 *
> +	 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
> +	 *    have not been processed yet.
> +	 */
> +	if (apicd->move_in_progress) {
> +		/*
> +		 * In theory there is a race:
> +		 *
> +		 * set_ioapic(new_vector) <-- Interrupt is raised before update
> +		 *			      is effective, i.e. it's raised on
> +		 *			      the old vector.
> +		 *
> +		 * So if the target cpu cannot handle that interrupt before
> +		 * the old vector is cleaned up, we get a spurious interrupt
> +		 * and in the worst case the ioapic irq line becomes stale.
> +		 *
> +		 * But in case of cpu hotplug this should be a non-issue
> +		 * because if the affinity update happens right before all
> +		 * cpus rendezvous in stop machine, there is no way that the
> +		 * interrupt can be blocked on the target cpu because all
> +		 * cpus loop first with interrupts enabled in stop machine,
> +		 * so the old vector is not yet cleaned up when the interrupt
> +		 * fires.
> +		 *
> +		 * So the only way to run into this issue is if the delivery
> +		 * of the interrupt on the apic/system bus would be delayed
> +		 * beyond the point where the target cpu disables interrupts
> +		 * in stop machine. I doubt that it can happen, but at least
> +		 * there is a theoretical chance. Virtualization might be
> +		 * able to expose this, but AFAICT the IOAPIC emulation is not
> +		 * as stupid as the real hardware.
> +		 *
> +		 * Anyway, there is nothing we can do about that at this point
> +		 * w/o refactoring the whole fixup_irq() business completely.
> +		 * We print at least the irq number and the old vector number,
> +		 * so we have the necessary information when a problem in that
> +		 * area arises.
> +		 */
> +		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
> +			irqd->irq, vector);
> +	}
> +	free_moved_vector(apicd);
> +}
> +
> #else
> -# define apic_set_affinity	NULL
> +# define apic_set_affinity		NULL
> +# define apic_force_complete_move	NULL
> #endif
>
> static int apic_retrigger_irq(struct irq_data *irqd)
> @@ -923,11 +1000,12 @@ static void x86_vector_msi_compose_msg(s
> }
>
> static struct irq_chip lapic_controller = {
> -	.name			= "APIC",
> -	.irq_ack		= apic_ack_edge,
> -	.irq_set_affinity	= apic_set_affinity,
> -	.irq_compose_msi_msg	= x86_vector_msi_compose_msg,
> -	.irq_retrigger		= apic_retrigger_irq,
> +	.name				= "APIC",
> +	.irq_ack			= apic_ack_edge,
> +	.irq_set_affinity		= apic_set_affinity,
> +	.irq_compose_msi_msg		= x86_vector_msi_compose_msg,
> +	.irq_force_complete_move	= apic_force_complete_move,
> +	.irq_retrigger			= apic_retrigger_irq,
> };
>
> #ifdef CONFIG_SMP
> @@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *c
> 	__vector_schedule_cleanup(apicd);
> }
>
> -/*
> - * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
> - */
> -void irq_force_complete_move(struct irq_desc *desc)
> -{
> -	unsigned int cpu = smp_processor_id();
> -	struct apic_chip_data *apicd;
> -	struct irq_data *irqd;
> -	unsigned int vector;
> -
> -	/*
> -	 * The function is called for all descriptors regardless of which
> -	 * irqdomain they belong to. For example if an IRQ is provided by
> -	 * an irq_chip as part of a GPIO driver, the chip data for that
> -	 * descriptor is specific to the irq_chip in question.
> -	 *
> -	 * Check first that the chip_data is what we expect
> -	 * (apic_chip_data) before touching it any further.
> -	 */
> -	irqd = irq_domain_get_irq_data(x86_vector_domain,
> -				       irq_desc_get_irq(desc));
> -	if (!irqd)
> -		return;
> -
> -	raw_spin_lock(&vector_lock);
> -	apicd = apic_chip_data(irqd);
> -	if (!apicd)
> -		goto unlock;
> -
> -	/*
> -	 * If prev_vector is empty or the descriptor is neither currently
> -	 * nor previously on the outgoing CPU no action required.
> -	 */
> -	vector = apicd->prev_vector;
> -	if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
> -		goto unlock;
> -
> -	/*
> -	 * This is tricky. If the cleanup of the old vector has not been
> -	 * done yet, then the following setaffinity call will fail with
> -	 * -EBUSY. This can leave the interrupt in a stale state.
> -	 *
> -	 * All CPUs are stuck in stop machine with interrupts disabled so
> -	 * calling __irq_complete_move() would be completely pointless.
> -	 *
> -	 * 1) The interrupt is in move_in_progress state. That means that we
> -	 *    have not seen an interrupt since the io_apic was reprogrammed to
> -	 *    the new vector.
> -	 *
> -	 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
> -	 *    have not been processed yet.
> -	 */
> -	if (apicd->move_in_progress) {
> -		/*
> -		 * In theory there is a race:
> -		 *
> -		 * set_ioapic(new_vector) <-- Interrupt is raised before update
> -		 *			      is effective, i.e. it's raised on
> -		 *			      the old vector.
> -		 *
> -		 * So if the target cpu cannot handle that interrupt before
> -		 * the old vector is cleaned up, we get a spurious interrupt
> -		 * and in the worst case the ioapic irq line becomes stale.
> -		 *
> -		 * But in case of cpu hotplug this should be a non issue
> -		 * because if the affinity update happens right before all
> -		 * cpus rendezvous in stop machine, there is no way that the
> -		 * interrupt can be blocked on the target cpu because all cpus
> -		 * loops first with interrupts enabled in stop machine, so the
> -		 * old vector is not yet cleaned up when the interrupt fires.
> -		 *
> -		 * So the only way to run into this issue is if the delivery
> -		 * of the interrupt on the apic/system bus would be delayed
> -		 * beyond the point where the target cpu disables interrupts
> -		 * in stop machine. I doubt that it can happen, but at least
> -		 * there is a theoretical chance. Virtualization might be
> -		 * able to expose this, but AFAICT the IOAPIC emulation is not
> -		 * as stupid as the real hardware.
> -		 *
> -		 * Anyway, there is nothing we can do about that at this point
> -		 * w/o refactoring the whole fixup_irq() business completely.
> -		 * We print at least the irq number and the old vector number,
> -		 * so we have the necessary information when a problem in that
> -		 * area arises.
> -		 */
> -		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
> -			irqd->irq, vector);
> -	}
> -	free_moved_vector(apicd);
> -unlock:
> -	raw_spin_unlock(&vector_lock);
> -}
> -
> #ifdef CONFIG_HOTPLUG_CPU
> /*
> * Note, this is not accurate accounting, but at least good enough to
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -486,6 +486,7 @@ static inline irq_hw_number_t irqd_to_hw
> * @ipi_send_mask:	send an IPI to destination cpus in cpumask
> * @irq_nmi_setup:	function called from core code before enabling an NMI
> * @irq_nmi_teardown:	function called from core code after disabling an NMI
> + * @irq_force_complete_move:	optional function to force complete pending irq move
> * @flags:		chip specific flags
> */
> struct irq_chip {
> @@ -537,6 +538,8 @@ struct irq_chip {
> 	int		(*irq_nmi_setup)(struct irq_data *data);
> 	void		(*irq_nmi_teardown)(struct irq_data *data);
>
> +	void		(*irq_force_complete_move)(struct irq_data *data);
> +
> 	unsigned long	flags;
> };
>
> @@ -619,11 +622,9 @@ static inline void irq_move_irq(struct i
> 		__irq_move_irq(data);
> }
> void irq_move_masked_irq(struct irq_data *data);
> -void irq_force_complete_move(struct irq_desc *desc);
> #else
> static inline void irq_move_irq(struct irq_data *data) { }
> static inline void irq_move_masked_irq(struct irq_data *data) { }
> -static inline void irq_force_complete_move(struct irq_desc *desc) { }
> #endif
>
> extern int no_irq_affinity;
> --- a/kernel/irq/internals.h
> +++ b/kernel/irq/internals.h
> @@ -442,6 +442,7 @@ static inline struct cpumask *irq_desc_g
> 	return desc->pending_mask;
> }
> bool irq_fixup_move_pending(struct irq_desc *desc, bool force_clear);
> +void irq_force_complete_move(struct irq_desc *desc);
> #else /* CONFIG_GENERIC_PENDING_IRQ */
> static inline bool irq_can_move_pcntxt(struct irq_data *data)
> {
> @@ -467,6 +468,7 @@ static inline bool irq_fixup_move_pendin
> {
> 	return false;
> }
> +static inline void irq_force_complete_move(struct irq_desc *desc) { }
> #endif /* !CONFIG_GENERIC_PENDING_IRQ */
>
> #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY)
> --- a/kernel/irq/migration.c
> +++ b/kernel/irq/migration.c
> @@ -35,6 +35,16 @@ bool irq_fixup_move_pending(struct irq_d
> 	return true;
> }
>
> +void irq_force_complete_move(struct irq_desc *desc)
> +{
> +	for (struct irq_data *d = irq_desc_get_irq_data(desc); d; d = d->parent_data) {
> +		if (d->chip && d->chip->irq_force_complete_move) {
> +			d->chip->irq_force_complete_move(d);
> +			return;
> +		}
> +	}
> +}
> +
> void irq_move_masked_irq(struct irq_data *idata)
> {
> 	struct irq_desc *desc = irq_data_to_desc(idata);