[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20121002045518.GA7756@gmail.com>
Date: Tue, 2 Oct 2012 06:55:18 +0200
From: Ingo Molnar <mingo@...nel.org>
To: Alexander Gordeev <agordeev@...hat.com>
Cc: linux-kernel@...r.kernel.org, Ingo Molnar <mingo@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Bjorn Helgaas <bhelgaas@...gle.com>,
Suresh Siddha <suresh.b.siddha@...el.com>,
Yinghai Lu <yinghai@...nel.org>,
Jeff Garzik <jgarzik@...ox.com>,
Matthew Wilcox <willy@...ux.intel.com>, x86@...nel.org,
linux-pci@...r.kernel.org, linux-ide@...r.kernel.org
Subject: Re: [PATCH v3 -tip 1/5] x86, MSI: Support multiple MSIs in presense
of IRQ remapping
* Alexander Gordeev <agordeev@...hat.com> wrote:
> The MSI specification has several constraints in comparison with MSI-X,
> most notable of them is the inability to configure MSIs independently.
> As a result, it is impossible to dispatch interrupts from different
> queues to different CPUs. This largely devalues the support of
> multiple MSIs in SMP systems.
>
> Also, a necessity to allocate a contiguous block of vector numbers for
> devices capable of multiple MSIs might cause a considerable pressure on
> x86 interrupt vector allocator and could lead to fragmentation of the
> interrupt vectors space.
>
> This patch overcomes both drawbacks in the presence of IRQ remapping and
> lets devices take advantage of multiple queues and per-IRQ affinity
> assignments.
>
> Signed-off-by: Alexander Gordeev <agordeev@...hat.com>
> ---
> arch/x86/kernel/apic/io_apic.c | 174 +++++++++++++++++++++++++++++++++------
> include/linux/irq.h | 6 ++
> kernel/irq/chip.c | 30 +++++--
> kernel/irq/irqdesc.c | 31 +++++++
> 4 files changed, 206 insertions(+), 35 deletions(-)
>
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index c265593..d5cb13c 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -305,6 +305,11 @@ static int alloc_irq_from(unsigned int from, int node)
> return irq_alloc_desc_from(from, node);
> }
>
> +static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
> +{
> + return irq_alloc_descs_from(from, count, node);
> +}
> +
> static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
> {
> free_irq_cfg(at, cfg);
> @@ -2991,37 +2996,58 @@ device_initcall(ioapic_init_ops);
> /*
> * Dynamic irq allocate and deallocation
> */
> -unsigned int create_irq_nr(unsigned int from, int node)
> +unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
> {
> - struct irq_cfg *cfg;
> + struct irq_cfg **cfg;
> unsigned long flags;
> - unsigned int ret = 0;
> - int irq;
> + int irq, i;
>
> if (from < nr_irqs_gsi)
> from = nr_irqs_gsi;
>
> - irq = alloc_irq_from(from, node);
> - if (irq < 0)
> - return 0;
> - cfg = alloc_irq_cfg(irq, node);
> - if (!cfg) {
> - free_irq_at(irq, NULL);
> + cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
> + if (!cfg)
> return 0;
> +
> + irq = alloc_irqs_from(from, count, node);
> + if (irq < 0)
> + goto out_cfgs;
> +
> + for (i = 0; i < count; i++) {
> + cfg[i] = alloc_irq_cfg(irq + i, node);
> + if (!cfg[i])
> + goto out_irqs;
> }
>
> raw_spin_lock_irqsave(&vector_lock, flags);
> - if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
> - ret = irq;
> + for (i = 0; i < count; i++)
> + if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
> + goto out_vecs;
> raw_spin_unlock_irqrestore(&vector_lock, flags);
>
> - if (ret) {
> - irq_set_chip_data(irq, cfg);
> - irq_clear_status_flags(irq, IRQ_NOREQUEST);
> - } else {
> - free_irq_at(irq, cfg);
> + for (i = 0; i < count; i++) {
> + irq_set_chip_data(irq + i, cfg[i]);
> + irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
> }
> - return ret;
> +
> + kfree(cfg);
> + return irq;
> +
> +out_vecs:
> + for (; i; i--)
> + __clear_irq_vector(irq + i - 1, cfg[i - 1]);
> + raw_spin_unlock_irqrestore(&vector_lock, flags);
> +out_irqs:
> + for (i = 0; i < count; i++)
> + free_irq_at(irq + i, cfg[i]);
> +out_cfgs:
> + kfree(cfg);
> + return 0;
> +}
> +
> +unsigned int create_irq_nr(unsigned int from, int node)
> +{
> + return __create_irqs(from, 1, node);
> }
>
> int create_irq(void)
> @@ -3054,6 +3080,27 @@ void destroy_irq(unsigned int irq)
> free_irq_at(irq, cfg);
> }
>
> +static inline void destroy_irqs(unsigned int irq, unsigned int count)
> +{
> + unsigned int i;
> + for (i = 0; i < count; i++)
Missing newline: please leave a blank line between the variable declaration and the loop.
> + destroy_irq(irq + i);
> +}
> +
> +static inline int
> +can_create_pow_of_two_irqs(unsigned int from, unsigned int count)
> +{
> + if ((count > 1) && (count % 2))
> + return -EINVAL;
> +
> + for (; count; count = count / 2) {
> + if (!irq_can_alloc_irqs(from, count))
> + return count;
> + }
> +
> + return -ENOSPC;
> +}
> +
> /*
> * MSI message composition
> */
> @@ -3145,18 +3192,25 @@ static struct irq_chip msi_chip = {
> .irq_retrigger = ioapic_retrigger_irq,
> };
>
> -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
> +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
> + unsigned int irq_base, unsigned int irq_offset)
> {
> struct irq_chip *chip = &msi_chip;
> struct msi_msg msg;
> + unsigned int irq = irq_base + irq_offset;
> int ret;
>
> ret = msi_compose_msg(dev, irq, &msg, -1);
> if (ret < 0)
> return ret;
>
> - irq_set_msi_desc(irq, msidesc);
> - write_msi_msg(irq, &msg);
> + irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
> +
> + /* MSI-X message is written per-IRQ, the offset is always 0.
> + * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
> + */
Please use the customary (multi-line) comment style:
/*
* Comment .....
* ...... goes here.
*/
specified in Documentation/CodingStyle.
> + if (!irq_offset)
> + write_msi_msg(irq, &msg);
>
> if (irq_remapped(irq_get_chip_data(irq))) {
> irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
> @@ -3170,16 +3224,12 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
> return 0;
> }
>
> -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> +int setup_msix_irqs(struct pci_dev *dev, int nvec)
> {
> int node, ret, sub_handle, index = 0;
> unsigned int irq, irq_want;
> struct msi_desc *msidesc;
>
> - /* x86 doesn't support multiple MSI yet */
> - if (type == PCI_CAP_ID_MSI && nvec > 1)
> - return 1;
> -
> node = dev_to_node(&dev->dev);
> irq_want = nr_irqs_gsi;
> sub_handle = 0;
> @@ -3208,7 +3258,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> goto error;
> }
> no_ir:
> - ret = setup_msi_irq(dev, msidesc, irq);
> + ret = setup_msi_irq(dev, msidesc, irq, 0);
> if (ret < 0)
> goto error;
> sub_handle++;
> @@ -3220,6 +3270,76 @@ error:
> return ret;
> }
>
> +int setup_msi_irqs(struct pci_dev *dev, int nvec)
> +{
> + int node, ret, sub_handle, index = 0;
> + unsigned int irq;
> + struct msi_desc *msidesc;
> +
> + if (nvec > 1 && !irq_remapping_enabled)
> + return 1;
> +
> + nvec = __roundup_pow_of_two(nvec);
> + ret = can_create_pow_of_two_irqs(nr_irqs_gsi, nvec);
> + if (ret != nvec)
> + return ret;
> +
> + WARN_ON(!list_is_singular(&dev->msi_list));
> + msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
> + WARN_ON(msidesc->irq);
> + WARN_ON(msidesc->msi_attrib.multiple);
> +
> + node = dev_to_node(&dev->dev);
> + irq = __create_irqs(nr_irqs_gsi, nvec, node);
> + if (irq == 0)
> + return -ENOSPC;
> +
> + if (!irq_remapping_enabled) {
> + ret = setup_msi_irq(dev, msidesc, irq, 0);
> + if (ret < 0)
> + goto error;
> + return 0;
> + }
> +
> + msidesc->msi_attrib.multiple = ilog2(nvec);
> + for (sub_handle = 0; sub_handle < nvec; sub_handle++) {
> + if (!sub_handle) {
> + index = msi_alloc_remapped_irq(dev, irq, nvec);
> + if (index < 0) {
> + ret = index;
> + goto error;
> + }
> + } else {
> + ret = msi_setup_remapped_irq(dev, irq + sub_handle,
> + index, sub_handle);
> + if (ret < 0)
> + goto error;
> + }
> + ret = setup_msi_irq(dev, msidesc, irq, sub_handle);
> + if (ret < 0)
> + goto error;
> + }
> + return 0;
> +
> +error:
> + destroy_irqs(irq, nvec);
> +
> + /* Restore altered MSI descriptor fields and prevent just destroyed
> + * IRQs from tearing down again in default_teardown_msi_irqs()
> + */
Ditto - please use the customary multi-line comment style here as well.
> + msidesc->irq = 0;
> + msidesc->msi_attrib.multiple = 0;
> +
> + return ret;
> +}
> +
> +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> +{
> + if (type == PCI_CAP_ID_MSI)
> + return setup_msi_irqs(dev, nvec);
> + return setup_msix_irqs(dev, nvec);
> +}
> +
> void native_teardown_msi_irq(unsigned int irq)
> {
> destroy_irq(irq);
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index 216b0ba..c3ba39f 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -522,6 +522,8 @@ extern int irq_set_handler_data(unsigned int irq, void *data);
> extern int irq_set_chip_data(unsigned int irq, void *data);
> extern int irq_set_irq_type(unsigned int irq, unsigned int type);
> extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry);
> +extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
> + struct msi_desc *entry);
> extern struct irq_data *irq_get_irq_data(unsigned int irq);
>
> static inline struct irq_chip *irq_get_chip(unsigned int irq)
> @@ -584,8 +586,12 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
> #define irq_alloc_desc_from(from, node) \
> irq_alloc_descs(-1, from, 1, node)
>
> +#define irq_alloc_descs_from(from, cnt, node) \
> + irq_alloc_descs(-1, from, cnt, node)
> +
Please use inline functions instead of macros. You might convert the one
above it (irq_alloc_desc_from()) as well in the process.
> void irq_free_descs(unsigned int irq, unsigned int cnt);
> int irq_reserve_irqs(unsigned int from, unsigned int cnt);
> +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt);
>
> static inline void irq_free_desc(unsigned int irq)
> {
> diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
> index 57d86d0..2230389 100644
> --- a/kernel/irq/chip.c
> +++ b/kernel/irq/chip.c
> @@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
> EXPORT_SYMBOL(irq_set_handler_data);
>
> /**
> - * irq_set_msi_desc - set MSI descriptor data for an irq
> - * @irq: Interrupt number
> - * @entry: Pointer to MSI descriptor data
> + * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
> + * @irq_base: Interrupt number base
> + * @irq_offset: Interrupt number offset
> + * @entry: Pointer to MSI descriptor data
> *
> - * Set the MSI descriptor entry for an irq
> + * Set the MSI descriptor entry for an irq at offset
> */
> -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
> +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
> + struct msi_desc *entry)
> {
> unsigned long flags;
> - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
> + struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
>
> if (!desc)
> return -EINVAL;
> desc->irq_data.msi_desc = entry;
> - if (entry)
> - entry->irq = irq;
> + if (entry && !irq_offset)
> + entry->irq = irq_base;
> irq_put_desc_unlock(desc, flags);
> return 0;
> }
>
> /**
> + * irq_set_msi_desc - set MSI descriptor data for an irq
> + * @irq: Interrupt number
> + * @entry: Pointer to MSI descriptor data
> + *
> + * Set the MSI descriptor entry for an irq
> + */
> +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
> +{
> + return irq_set_msi_desc_off(irq, 0, entry);
> +}
> +
> +/**
> * irq_set_chip_data - set irq chip data for an irq
> * @irq: Interrupt number
> * @data: Pointer to chip specific data
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 192a302..8287b78 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -210,6 +210,13 @@ static int irq_expand_nr_irqs(unsigned int nr)
> return 0;
> }
>
> +static int irq_can_expand_nr_irqs(unsigned int nr)
> +{
> + if (nr > IRQ_BITMAP_BITS)
> + return -ENOMEM;
> + return 0;
> +}
> +
> int __init early_irq_init(void)
> {
> int i, initcnt, node = first_online_node;
> @@ -414,6 +421,30 @@ int irq_reserve_irqs(unsigned int from, unsigned int cnt)
> }
>
> /**
> + * irq_can_alloc_irqs - checks if a range of irqs could be allocated
> + * @from: check from irq number
> + * @cnt: number of irqs to check
> + *
> + * Returns 0 on success or an appropriate error code
> + */
> +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt)
> +{
> + unsigned int start;
> + int ret = 0;
> +
> + if (!cnt)
> + return -EINVAL;
> +
> + mutex_lock(&sparse_irq_lock);
> + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
> + from, cnt, 0);
> + mutex_unlock(&sparse_irq_lock);
> + if (start + cnt > nr_irqs)
> + ret = irq_can_expand_nr_irqs(start + cnt);
> + return ret;
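How is this supposed to work wrt. races? sparse_irq_lock is dropped before the
result is used, and the check is done separately from the actual allocation,
so the range found here can be taken by someone else in the meantime.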
Thanks,
Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists