[<prev] [next>] [day] [month] [year] [list]
Message-Id: <1238620209-11980-1-git-send-email-matthew@wil.cx>
Date: Wed, 1 Apr 2009 17:10:09 -0400
From: Matthew Wilcox <matthew@....cx>
To: mingo@...e.hu, linux-kernel@...r.kernel.org,
linux-pci@...r.kernel.org
Cc: Matthew Wilcox <matthew@....cx>,
Matthew Wilcox <willy@...ux.intel.com>
Subject: [PATCH] x86: Support for multiple MSI
Add a new function __assign_irq_vector_block() which allocates an aligned
block of vectors suitable for multiple-MSI.
Change create_irq_nr(), msi_compose_msg() and setup_msi_irq() to take a
'count' argument.  Split arch_setup_msi_irqs() into setup_msi_irqs() and
setup_msix_irqs().
Signed-off-by: Matthew Wilcox <willy@...ux.intel.com>
---
arch/x86/include/asm/pci.h | 1 +
arch/x86/kernel/apic/io_apic.c | 390 +++++++++++++++++++++++++++++++---------
arch/x86/kernel/dumpstack.c | 1 +
include/linux/irq.h | 2 +-
4 files changed, 310 insertions(+), 84 deletions(-)
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index a0301bf..7fcb9ab 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -88,6 +88,7 @@ extern void pci_iommu_alloc(void);
/* MSI arch hook */
#define arch_setup_msi_irqs arch_setup_msi_irqs
+#define arch_teardown_msi_irqs arch_teardown_msi_irqs
#endif /* __KERNEL__ */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1bb5c6c..df055e8 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -572,6 +572,41 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
static int
assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+static int
+assign_irq_vector_block(int irq, unsigned count, const struct cpumask *mask);
+
+/*
+ * The P6 family and Pentium processors (presumably also earlier processors),
+ * can queue no more than two interrupts per priority level, and will ignore
+ * other interrupts that are received within the same priority level (the
+ * priority level is the vector number shifted right by 4), so we try to
+ * spread these out a bit to avoid this happening.
+ *
+ * Pentium 4, Xeon and later processors do not have this limitation.
+ * It is unknown what limitations AMD, Cyrix, Transmeta, VIA, IDT and
+ * other manufacturers have.
+ */
+static int many_vectors_per_prio(void)
+{
+	/* Cached across calls: 'probed' latches once 'answer' is final. */
+	static char probed, answer;
+	struct cpuinfo_x86 *cpu = &boot_cpu_data;
+
+	if (!probed) {
+		/*
+		 * Only Intel family > 6, or family 6 from model 13 up, is
+		 * reported as able to take many vectors per priority level;
+		 * every other vendor or model keeps the default answer of 0.
+		 */
+		if (cpu->x86_vendor == X86_VENDOR_INTEL &&
+		    (cpu->x86 > 6 ||
+		     (cpu->x86 == 6 && cpu->x86_model >= 13)))
+			answer = 1;
+		probed = 1;
+	}
+
+	return answer;
+}
/*
* Either sets desc->affinity to a valid value, and returns
@@ -589,13 +624,30 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
irq = desc->irq;
cfg = desc->chip_data;
- if (assign_irq_vector(irq, cfg, mask))
- return BAD_APICID;
- /* check that before desc->addinity get updated */
- set_extra_move_desc(desc, mask);
+ if (many_vectors_per_prio()) {
+ struct msi_desc *msi_desc = desc->msi_desc;
+ unsigned i, count = 1;
- cpumask_copy(desc->affinity, mask);
+ if (msi_desc)
+ count = 1 << msi_desc->msi_attrib.multiple;
+
+ /* Multiple MSIs all go to the same destination */
+ if (assign_irq_vector_block(irq, count, mask))
+ return BAD_APICID;
+ for (i = 0; i < count; i++) {
+ desc = irq_to_desc(irq + i);
+ set_extra_move_desc(desc, mask);
+ cpumask_copy(desc->affinity, mask);
+ }
+ } else {
+ if (assign_irq_vector(irq, cfg, mask))
+ return BAD_APICID;
+
+ /* check that before desc->affinity gets updated */
+ set_extra_move_desc(desc, mask);
+ cpumask_copy(desc->affinity, mask);
+ }
return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
}
@@ -1285,18 +1337,7 @@ void unlock_vector_lock(void)
static int
__assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
- /*
- * NOTE! The local APIC isn't very good at handling
- * multiple interrupts at the same interrupt level.
- * As the interrupt level is determined by taking the
- * vector number and shifting that right by 4, we
- * want to spread these out a bit so that they don't
- * all fall in the same interrupt level.
- *
- * Also, we've got to be careful not to trash gate
- * 0x80, because int 0x80 is hm, kind of importantish. ;)
- */
- static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+ static int current_vector = FIRST_DEVICE_VECTOR;
unsigned int old_vector;
int cpu, err;
cpumask_var_t tmp_mask;
@@ -1321,19 +1362,15 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
err = -ENOSPC;
for_each_cpu_and(cpu, mask, cpu_online_mask) {
int new_cpu;
- int vector, offset;
+ int vector;
apic->vector_allocation_domain(cpu, tmp_mask);
vector = current_vector;
- offset = current_offset;
next:
- vector += 8;
- if (vector >= first_system_vector) {
- /* If out of vectors on large boxen, must share them. */
- offset = (offset + 1) % 8;
- vector = FIRST_DEVICE_VECTOR + offset;
- }
+ vector += 4;
+ if (vector >= first_system_vector)
+ vector = FIRST_DEVICE_VECTOR;
if (unlikely(current_vector == vector))
continue;
@@ -1345,7 +1382,6 @@ next:
goto next;
/* Found one! */
current_vector = vector;
- current_offset = offset;
if (old_vector) {
cfg->move_in_progress = 1;
cpumask_copy(cfg->old_domain, cfg->domain);
@@ -1362,13 +1398,113 @@ next:
}
+/*
+ * Allocate 'count' consecutive vectors, aligned to 'count', for the
+ * consecutive interrupts irq .. irq + count - 1.
+ *
+ * @irq:   first interrupt of the block
+ * @count: number of vectors wanted; must be a power of two
+ * @mask:  cpus the block may be placed on (only online cpus are tried)
+ *
+ * Returns 0 on success (including the case where the first interrupt
+ * already has a vector whose domain intersects @mask), -EBUSY if any
+ * interrupt of the block still has a move in progress, -ENOMEM if the
+ * scratch cpumask cannot be allocated, and -ENOSPC if no free aligned
+ * block was found on any candidate cpu.
+ *
+ * The callers in this file invoke this with vector_lock held.
+ */
static int
+__assign_irq_vector_block(int irq, unsigned count, const struct cpumask *mask)
+{
+	static int current_vector = FIRST_DEVICE_VECTOR;
+	unsigned int old_vector;
+	unsigned i, cpu;
+	int err;
+	struct irq_cfg *cfg;
+	cpumask_var_t tmp_mask;
+
+	BUG_ON(irq + count > NR_IRQS);
+	BUG_ON(count & (count - 1));
+
+	/* Refuse to touch the block while any member is still migrating */
+	for (i = 0; i < count; i++) {
+		cfg = irq_cfg(irq + i);
+		if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+			return -EBUSY;
+	}
+
+	if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
+		return -ENOMEM;
+
+	/*
+	 * The first interrupt stands in for the whole block: if it already
+	 * has a vector in a domain that overlaps the requested online cpus,
+	 * there is nothing to do.
+	 */
+	cfg = irq_cfg(irq);
+	old_vector = cfg->vector;
+	if (old_vector) {
+		err = 0;
+		cpumask_and(tmp_mask, mask, cpu_online_mask);
+		cpumask_and(tmp_mask, cfg->domain, tmp_mask);
+		if (!cpumask_empty(tmp_mask))
+			goto out;
+	}
+
+	/* Only try to allocate vectors on cpus that are online */
+	err = -ENOSPC;
+	for_each_cpu_and(cpu, mask, cpu_online_mask) {
+		int new_cpu;
+		int vector;
+		unsigned probes = 0;
+
+		apic->vector_allocation_domain(cpu, tmp_mask);
+
+		vector = current_vector & ~(count - 1);
+next:
+		/*
+		 * The aligned starting point may lie outside the set of
+		 * blocks the search visits (below FIRST_DEVICE_VECTOR, or
+		 * too close to first_system_vector), in which case the
+		 * wrap-around test below never matches.  A full sweep needs
+		 * fewer than first_system_vector / count probes, so give up
+		 * on this cpu after that many instead of spinning forever
+		 * when no block is free.
+		 */
+		if (++probes > first_system_vector / count)
+			continue;
+
+		vector += count;
+		if (vector + count >= first_system_vector) {
+			/* Wrap to the first aligned block of device vectors */
+			vector = FIRST_DEVICE_VECTOR & ~(count - 1);
+			if (vector < FIRST_DEVICE_VECTOR)
+				vector += count;
+		}
+		if (unlikely((current_vector & ~(count - 1)) == vector))
+			continue;
+
+		/* Every vector of the block must be globally unreserved ... */
+		for (i = 0; i < count; i++)
+			if (test_bit(vector + i, used_vectors))
+				goto next;
+
+		/* ... and unused on every online cpu of the domain */
+		for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask) {
+			for (i = 0; i < count; i++) {
+				if (per_cpu(vector_irq, new_cpu)[vector + i]
+								!= -1)
+					goto next;
+			}
+		}
+		/* Found one! */
+		current_vector = vector + count - 1;
+		for (i = 0; i < count; i++) {
+			cfg = irq_cfg(irq + i);
+			if (old_vector) {
+				cfg->move_in_progress = 1;
+				cpumask_copy(cfg->old_domain, cfg->domain);
+			}
+			for_each_cpu_and(new_cpu, tmp_mask, cpu_online_mask)
+				per_cpu(vector_irq, new_cpu)[vector + i] =
+								irq + i;
+			/*
+			 * Each interrupt owns its own vector within the
+			 * block; this must agree with the vector_irq[]
+			 * entry written just above.
+			 */
+			cfg->vector = vector + i;
+			cpumask_copy(cfg->domain, tmp_mask);
+		}
+		err = 0;
+		break;
+	}
+ out:
+	free_cpumask_var(tmp_mask);
+	return err;
+}
+
+/*
+ * Locked wrapper for assigning a single vector to 'irq': takes vector_lock
+ * with interrupts disabled, then uses __assign_irq_vector_block() with a
+ * count of 1 when many_vectors_per_prio() is true and __assign_irq_vector()
+ * otherwise.  Returns whatever the chosen allocator returns.
+ */
+static int
assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
{
int err;
unsigned long flags;
spin_lock_irqsave(&vector_lock, flags);
- err = __assign_irq_vector(irq, cfg, mask);
+ if (many_vectors_per_prio())
+ err = __assign_irq_vector_block(irq, 1, mask);
+ else
+ err = __assign_irq_vector(irq, cfg, mask);
+ spin_unlock_irqrestore(&vector_lock, flags);
+ return err;
+}
+
+/*
+ * Locked wrapper around __assign_irq_vector_block(): holds vector_lock with
+ * interrupts disabled for the duration of the allocation and returns the
+ * allocator's result unchanged.
+ * Assumes that count is a power of two and aligns to that power of two.
+ */
+static int
+assign_irq_vector_block(int irq, unsigned count, const struct cpumask *mask)
+{
+ int err;
+ unsigned long flags;
+
+ spin_lock_irqsave(&vector_lock, flags);
+ err = __assign_irq_vector_block(irq, count, mask);
spin_unlock_irqrestore(&vector_lock, flags);
return err;
}
@@ -3166,59 +3302,75 @@ device_initcall(ioapic_init_sysfs);
static int nr_irqs_gsi = NR_IRQS_LEGACY;
/*
* Dynamic irq allocate and deallocation
+ *
+ * Returns the interrupt number created, or 0 on error
*/
-unsigned int create_irq_nr(unsigned int irq_want)
+unsigned int create_irq_nr(unsigned int irq_want, unsigned count)
{
- /* Allocate an unused irq */
- unsigned int irq;
- unsigned int new;
+ /* Allocate 'count' consecutive unused irqs */
+ unsigned i, irq, new, run;
unsigned long flags;
struct irq_cfg *cfg_new = NULL;
int cpu = boot_cpu_id;
struct irq_desc *desc_new = NULL;
- irq = 0;
+ /* Blocks of more than one irq are refused unless many_vectors_per_prio() */
+ if (count > 1 && !many_vectors_per_prio())
+ return 0;
+
+ irq = run = 0;
+
if (irq_want < nr_irqs_gsi)
irq_want = nr_irqs_gsi;
spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new < nr_irqs; new++) {
+ int err;
desc_new = irq_to_desc_alloc_cpu(new, cpu);
if (!desc_new) {
printk(KERN_INFO "can not get irq_desc for %d\n", new);
- continue;
+ goto retry;
}
cfg_new = desc_new->chip_data;
if (cfg_new->vector != 0)
+ goto retry;
+ /* 'run' counts the consecutive free irqs ending at 'new' */
+ run++;
+ if (run < count)
continue;
- if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
- irq = new;
- break;
+
+ /* Long enough run found: its first irq is the candidate base */
+ irq = new - run + 1;
+ if (many_vectors_per_prio())
+ err = __assign_irq_vector_block(irq, run,
+ apic->target_cpus());
+ else
+ err = __assign_irq_vector(irq, cfg_new,
+ apic->target_cpus());
+ if (err == 0)
+ break;
+ irq = 0;
+ /* A missing descriptor, a used irq or a failed allocation restarts the run */
+ retry:
+ run = 0;
}
spin_unlock_irqrestore(&vector_lock, flags);
- if (irq > 0) {
- dynamic_irq_init(irq);
+ if (irq == 0)
+ return 0;
+
+ for (i = 0; i < count; i++) {
+ desc_new = irq_to_desc(irq + i);
+ cfg_new = desc_new->chip_data;
+ dynamic_irq_init(irq + i);
/* restore it, in case dynamic_irq_init clear it */
- if (desc_new)
- desc_new->chip_data = cfg_new;
+ desc_new->chip_data = cfg_new;
}
+
return irq;
}
int create_irq(void)
{
- unsigned int irq_want;
- int irq;
-
- irq_want = nr_irqs_gsi;
- irq = create_irq_nr(irq_want);
-
- if (irq == 0)
- irq = -1;
-
- return irq;
+ /* create_irq_nr() reports failure as 0; this wrapper reports it as -1 */
+ int irq = create_irq_nr(nr_irqs_gsi, 1);
+ return irq ? irq : -1;
}
void destroy_irq(unsigned int irq)
@@ -3245,7 +3397,8 @@ void destroy_irq(unsigned int irq)
* MSI message composition
*/
#ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+ unsigned count, struct msi_msg *msg)
{
struct irq_cfg *cfg;
int err;
@@ -3255,7 +3408,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
return -ENXIO;
cfg = irq_cfg(irq);
- err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ if (count == 1)
+ err = assign_irq_vector(irq, cfg, apic->target_cpus());
+ else
+ err = assign_irq_vector_block(irq, count, apic->target_cpus());
if (err)
return err;
@@ -3432,52 +3588,107 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
return index;
}
-static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
+/*
+ * Compose the MSI message for the block of 'count' irqs starting at
+ * 'base_irq', attach 'msidesc' and an edge handler to every irq of the
+ * block, then write the message once for base_irq.
+ * Returns 0 on success or the negative error from msi_compose_msg().
+ */
+static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
+ unsigned count, unsigned base_irq)
{
int ret;
struct msi_msg msg;
+ unsigned irq;
- ret = msi_compose_msg(dev, irq, &msg);
+ ret = msi_compose_msg(dev, base_irq, count, &msg);
if (ret < 0)
return ret;
- set_irq_msi(irq, msidesc);
- write_msi_msg(irq, &msg);
+ /*
+ * Remember the block size in the descriptor as an order;
+ * arch_teardown_msi_irqs() expands it again with 1 << multiple.
+ */
+ msidesc->msi_attrib.multiple = order_base_2(count);
- if (irq_remapped(irq)) {
- struct irq_desc *desc = irq_to_desc(irq);
- /*
- * irq migration in process context
- */
- desc->status |= IRQ_MOVE_PCNTXT;
- set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
- } else
- set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+ /*
+ * The loop is in reverse order so set_irq_msi ends up setting
+ * desc->irq to base_irq
+ *
+ * NOTE(review): 'irq' is unsigned, so the 'irq >= base_irq' test only
+ * terminates because base_irq is non-zero; confirm no caller can pass 0.
+ */
+ for (irq = base_irq + count - 1; irq >= base_irq; irq--) {
+ set_irq_msi(irq, msidesc);
+ if (irq_remapped(irq)) {
+ struct irq_desc *desc = irq_to_desc(irq);
+ /* irq migration in process context */
+ desc->status |= IRQ_MOVE_PCNTXT;
+ set_irq_chip_and_handler_name(irq, &msi_ir_chip,
+ handle_edge_irq, "edge");
+ } else {
+ set_irq_chip_and_handler_name(irq, &msi_chip,
+ handle_edge_irq, "edge");
+ }
+ }
+
+ write_msi_msg(base_irq, &msg);
- dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
+ dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", base_irq);
return 0;
}
-int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+/*
+ * Set up a block of interrupts for a device using plain MSI.
+ *
+ * @dev:  device whose first msi_list entry describes the MSI capability
+ * @nvec: number of vectors the driver asked for
+ *
+ * Returns 0 on success.  Negative returns are -ENOENT (interrupt remapping
+ * is enabled but no IOMMU maps the device), -ENOSPC (not even a single irq
+ * could be created) or the error from msi_alloc_irte() / setup_msi_irq().
+ * Positive returns are 1 (more than one vector requested but
+ * many_vectors_per_prio() is false) or half of the rounded-up request when
+ * no irq block of that size could be created; presumably a hint to retry
+ * with fewer vectors -- confirm against the PCI MSI core.
+ */
+static int setup_msi_irqs(struct pci_dev *dev, int nvec)
+{
+	unsigned base_irq, alloc, i;
+	int ret;
+	struct msi_desc *msidesc = list_first_entry(&dev->msi_list,
+						struct msi_desc, list);
+	struct intel_iommu *iommu = map_dev_to_ir(dev);
+
+	if (intr_remapping_enabled && !iommu)
+		return -ENOENT;
+	if (nvec > 1 && !many_vectors_per_prio())
+		return 1;
+
+	/*
+	 * MSI only lets you program the device with nvec that is a power
+	 * of two.  We could possibly trust the device driver that it'll
+	 * only use the number it asked for, but to be safe, let's reserve
+	 * all the interrupts we're telling the device it can use.
+	 */
+	alloc = roundup_pow_of_two(nvec);
+
+	base_irq = create_irq_nr(nr_irqs_gsi, alloc);
+	if (base_irq == 0) {
+		/*
+		 * Keep the two outcomes in separate returns: folding them
+		 * into one ?: would convert -ENOSPC to unsigned first.
+		 */
+		if (alloc > 1)
+			return alloc / 2;
+		return -ENOSPC;
+	}
+
+	if (intr_remapping_enabled) {
+		ret = msi_alloc_irte(dev, base_irq, alloc);
+		if (ret < 0)
+			goto error;
+
+		/* 'ret' is passed on as the IRTE index, 'i' as the sub-handle */
+		for (i = 1; i < alloc; i++)
+			set_irte_irq(base_irq + i, iommu, ret, i);
+	}
+
+	ret = setup_msi_irq(dev, msidesc, alloc, base_irq);
+	if (ret < 0)
+		goto error;
+
+	return 0;
+
+error:
+	/* Give back every irq of the block, not just the first */
+	for (i = 0; i < alloc; i++)
+		destroy_irq(base_irq + i);
+	return ret;
+}
+
+static int setup_msix_irqs(struct pci_dev *dev, int nvec)
{
unsigned int irq;
int ret, sub_handle;
struct msi_desc *msidesc;
unsigned int irq_want;
- struct intel_iommu *iommu = NULL;
+ struct intel_iommu *iommu = map_dev_to_ir(dev);
int index = 0;
- /* x86 doesn't support multiple MSI yet */
- if (type == PCI_CAP_ID_MSI && nvec > 1)
- return 1;
+ if (intr_remapping_enabled && !iommu)
+ return -ENOENT;
irq_want = nr_irqs_gsi;
sub_handle = 0;
list_for_each_entry(msidesc, &dev->msi_list, list) {
- irq = create_irq_nr(irq_want);
+ irq = create_irq_nr(irq_want, 1);
if (irq == 0)
- return -1;
+ return -ENOSPC;
irq_want = irq + 1;
if (!intr_remapping_enabled)
goto no_ir;
@@ -3493,11 +3704,6 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
goto error;
}
} else {
- iommu = map_dev_to_ir(dev);
- if (!iommu) {
- ret = -ENOENT;
- goto error;
- }
/*
* setup the mapping between the irq and the IRTE
* base index, the sub_handle pointing to the
@@ -3506,7 +3712,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
set_irte_irq(irq, iommu, index, sub_handle);
}
no_ir:
- ret = setup_msi_irq(dev, msidesc, irq);
+ ret = setup_msi_irq(dev, msidesc, 1, irq);
if (ret < 0)
goto error;
sub_handle++;
@@ -3518,9 +3724,27 @@ error:
return ret;
}
-void arch_teardown_msi_irq(unsigned int irq)
+int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
+{
+	/*
+	 * PCI_CAP_ID_MSI goes through the block-allocating MSI path; any
+	 * other capability type is handed to the MSI-X path.
+	 */
+	if (type != PCI_CAP_ID_MSI)
+		return setup_msix_irqs(dev, nvec);
+
+	return setup_msi_irqs(dev, nvec);
+}
+
+/*
+ * Destroy every interrupt set up for 'dev'.  Each entry on dev->msi_list
+ * covers 1 << msi_attrib.multiple consecutive irqs starting at desc->irq;
+ * entries whose irq is 0 are skipped.
+ */
+void arch_teardown_msi_irqs(struct pci_dev *dev)
{
- destroy_irq(irq);
+ struct msi_desc *desc;
+ unsigned i;
+
+ list_for_each_entry(desc, &dev->msi_list, list) {
+ if (desc->irq == 0)
+ continue;
+ for (i = 0; i < (1 << desc->msi_attrib.multiple); i++) {
+ destroy_irq(desc->irq + i);
+ }
+ }
}
#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
@@ -3566,7 +3790,7 @@ int arch_setup_dmar_msi(unsigned int irq)
int ret;
struct msi_msg msg;
- ret = msi_compose_msg(NULL, irq, &msg);
+ ret = msi_compose_msg(NULL, irq, 1, &msg);
if (ret < 0)
return ret;
dmar_msi_write(irq, &msg);
@@ -3620,7 +3844,7 @@ int arch_setup_hpet_msi(unsigned int irq)
int ret;
struct msi_msg msg;
- ret = msi_compose_msg(NULL, irq, &msg);
+ ret = msi_compose_msg(NULL, irq, 1, &msg);
if (ret < 0)
return ret;
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index dd2130b..0c77a09 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -29,6 +29,7 @@ void printk_address(unsigned long address, int reliable)
{
printk(" [<%p>] %s%pS\n", (void *) address,
reliable ? "" : "? ", (void *) address);
+ mdelay(2000); /* FIXME(review): looks like a debugging leftover -- not mentioned in the changelog and it delays after every printed address; drop before merging */
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 974890b..f77e53b 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -381,7 +381,7 @@ extern void set_irq_noprobe(unsigned int irq);
extern void set_irq_probe(unsigned int irq);
/* Handle dynamic irq creation and destruction */
-extern unsigned int create_irq_nr(unsigned int irq_want);
+extern unsigned int create_irq_nr(unsigned int irq_want, unsigned count);
extern int create_irq(void);
extern void destroy_irq(unsigned int irq);
--
1.6.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists