[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5caa8414415ab35e74662ac0a30bb4ac@www.loen.fr>
Date: Tue, 10 Dec 2019 18:32:35 +0000
From: Marc Zyngier <maz@...nel.org>
To: John Garry <john.garry@...wei.com>
Cc: Ming Lei <ming.lei@...hat.com>, <tglx@...utronix.de>,
<chenxiang66@...ilicon.com>, <bigeasy@...utronix.de>,
<linux-kernel@...r.kernel.org>, <hare@...e.com>, <hch@....de>,
<axboe@...nel.dk>, <bvanassche@....org>, <peterz@...radead.org>,
<mingo@...hat.com>
Subject: Re: [PATCH RFC 1/1] genirq: Make threaded handler use irq affinity for managed interrupt
On 2019-12-10 12:05, John Garry wrote:
> On 10/12/2019 11:36, Marc Zyngier wrote:
>> On 2019-12-10 10:59, John Garry wrote:
>>>>>
>>>>> There is no lockup, just a potential performance boost in this
>>>>> change.
>>>>>
>>>>> My colleague Xiang Chen can provide specifics of the test, as he
>>>>> is
>>>>> the one running it.
>>>>>
>>>>> But one key bit of info - which I did not think most relevant
>>>>> before
>>>>> - that is we have 2x SAS controllers running the throughput test
>>>>> on
>>>>> the same host.
>>>>>
>>>>> As such, the completion queue interrupts would be spread
>>>>> identically
>>>>> over the CPUs for each controller. I notice that ARM GICv3 ITS
>>>>> interrupt controller (which we use) does not use the generic irq
>>>>> matrix allocator, which I think would really help with this.
>>>>>
>>>>> Hi Marc,
>>>>>
>>>>> Is there any reason for which we couldn't utilise of the generic
>>>>> irq
>>>>> matrix allocator for GICv3?
>>>>
>>>
>>> Hi Marc,
>>>
>>>> For a start, the ITS code predates the matrix allocator by about
>>>> three
>>>> years. Also, my understanding of this allocator is that it allows
>>>> x86 to cope with a very small number of possible interrupt vectors
>>>> per CPU. The ITS doesn't have such issue, as:
>>>> 1) the namespace is global, and not per CPU
>>>> 2) the namespace is *huge*
>>>> Now, what property of the matrix allocator is the ITS code
>>>> missing?
>>>> I'd be more than happy to improve it.
>>>
>>> I think specifically the property that the matrix allocator will
>>> try
>>> to find a CPU for irq affinity which "has the lowest number of
>>> managed
>>> IRQs allocated" - I'm quoting the comment on
>>> matrix_find_best_cpu_managed().
>> But that decision is due to allocation constraints. You can have at
>> most
>> 256 interrupts per CPU, so the allocator tries to balance it.
>> On the contrary, the ITS doesn't care about how many interrupts
>> target any
>> given CPU. The whole 2^24 interrupt namespace can be thrown at a
>> single
>> CPU.
>>
>>> The ITS code will make the lowest online CPU in the affinity mask
>>> the
>>> target CPU for the interrupt, which may result in some CPUs
>>> handling
>>> too many interrupts.
>> If what you want is for the *default* affinity to be spread around,
>> that should be achieved pretty easily. Let me have a think about how
>> to do that.
>
> Cool, I anticipate that it should help my case.
>
> I can also seek out some NVMe cards to see how it would help a more
> "generic" scenario.
Can you give the following a go? It probably has all kinds of warts on
top of the quality debug information, but I managed to get my D05 and
a couple of guests to boot with it. It will probably eat your data,
so use caution! ;-)
Thanks,
M.
diff --git a/drivers/irqchip/irq-gic-v3-its.c
b/drivers/irqchip/irq-gic-v3-its.c
index e05673bcd52b..301ee3bc0602 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -177,6 +177,8 @@ static DEFINE_IDA(its_vpeid_ida);
#define gic_data_rdist_rd_base() (gic_data_rdist()->rd_base)
#define gic_data_rdist_vlpi_base() (gic_data_rdist_rd_base() +
SZ_128K)
+static DEFINE_PER_CPU(atomic_t, cpu_lpi_count);
+
static u16 get_its_list(struct its_vm *vm)
{
struct its_node *its;
@@ -1287,42 +1289,76 @@ static void its_unmask_irq(struct irq_data *d)
lpi_update_config(d, 0, LPI_PROP_ENABLED);
}
+static int its_pick_target_cpu(struct its_device *its_dev, const
struct cpumask *cpu_mask)
+{
+ unsigned int cpu = nr_cpu_ids, tmp;
+ int count = S32_MAX;
+
+ for_each_cpu_and(tmp, cpu_mask, cpu_online_mask) {
+ int this_count = per_cpu(cpu_lpi_count, tmp).counter;
+ if (this_count < count) {
+ cpu = tmp;
+ count = this_count;
+ }
+ }
+
+ return cpu;
+}
+
static int its_set_affinity(struct irq_data *d, const struct cpumask
*mask_val,
bool force)
{
- unsigned int cpu;
- const struct cpumask *cpu_mask = cpu_online_mask;
struct its_device *its_dev = irq_data_get_irq_chip_data(d);
- struct its_collection *target_col;
+ int ret = IRQ_SET_MASK_OK_DONE;
u32 id = its_get_event_id(d);
+ cpumask_var_t tmpmask;
/* A forwarded interrupt should use irq_set_vcpu_affinity */
if (irqd_is_forwarded_to_vcpu(d))
return -EINVAL;
- /* lpi cannot be routed to a redistributor that is on a foreign
node */
- if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
- if (its_dev->its->numa_node >= 0) {
- cpu_mask = cpumask_of_node(its_dev->its->numa_node);
- if (!cpumask_intersects(mask_val, cpu_mask))
- return -EINVAL;
+ if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ cpumask_and(tmpmask, mask_val, cpu_online_mask);
+
+ if (its_dev->its->numa_node >= 0)
+ cpumask_and(tmpmask, tmpmask,
cpumask_of_node(its_dev->its->numa_node));
+
+ if (cpumask_empty(tmpmask)) {
+ /* LPI cannot be routed to a redistributor that is on a foreign node
*/
+ if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144) {
+ ret = -EINVAL;
+ goto out;
}
+
+ cpumask_copy(tmpmask, cpu_online_mask);
}
- cpu = cpumask_any_and(mask_val, cpu_mask);
+ if (!cpumask_test_cpu(its_dev->event_map.col_map[id], tmpmask)) {
+ struct its_collection *target_col;
+ int cpu;
- if (cpu >= nr_cpu_ids)
- return -EINVAL;
+ cpu = its_pick_target_cpu(its_dev, tmpmask);
+ if (cpu >= nr_cpu_ids) {
+ ret = -EINVAL;
+ goto out;
+ }
- /* don't set the affinity when the target cpu is same as current one
*/
- if (cpu != its_dev->event_map.col_map[id]) {
+ pr_info("IRQ%d CPU%d -> CPU%d\n",
+ d->irq, its_dev->event_map.col_map[id], cpu);
+ atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
+ atomic_dec(per_cpu_ptr(&cpu_lpi_count,
+ its_dev->event_map.col_map[id]));
target_col = &its_dev->its->collections[cpu];
its_send_movi(its_dev, target_col, id);
its_dev->event_map.col_map[id] = cpu;
irq_data_update_effective_affinity(d, cpumask_of(cpu));
}
- return IRQ_SET_MASK_OK_DONE;
+out:
+ free_cpumask_var(tmpmask);
+ return ret;
}
static u64 its_irq_get_msi_base(struct its_device *its_dev)
@@ -2773,22 +2809,28 @@ static int its_irq_domain_activate(struct
irq_domain *domain,
{
struct its_device *its_dev = irq_data_get_irq_chip_data(d);
u32 event = its_get_event_id(d);
- const struct cpumask *cpu_mask = cpu_online_mask;
- int cpu;
+ int cpu = nr_cpu_ids;
- /* get the cpu_mask of local node */
- if (its_dev->its->numa_node >= 0)
- cpu_mask = cpumask_of_node(its_dev->its->numa_node);
+ /* Find the least loaded CPU on the local node */
+ if (its_dev->its->numa_node >= 0) {
+ cpu = its_pick_target_cpu(its_dev,
+ cpumask_of_node(its_dev->its->numa_node));
+ if (cpu < 0)
+ return cpu;
- /* Bind the LPI to the first possible CPU */
- cpu = cpumask_first_and(cpu_mask, cpu_online_mask);
- if (cpu >= nr_cpu_ids) {
- if (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144)
+ if (cpu >= nr_cpu_ids &&
+ (its_dev->its->flags & ITS_FLAGS_WORKAROUND_CAVIUM_23144))
return -EINVAL;
+ }
- cpu = cpumask_first(cpu_online_mask);
+ if (cpu >= nr_cpu_ids) {
+ cpu = its_pick_target_cpu(its_dev, cpu_online_mask);
+ if (cpu < 0)
+ return cpu;
}
+ pr_info("picked CPU%d IRQ%d\n", cpu, d->irq);
+ atomic_inc(per_cpu_ptr(&cpu_lpi_count, cpu));
its_dev->event_map.col_map[event] = cpu;
irq_data_update_effective_affinity(d, cpumask_of(cpu));
@@ -2803,6 +2845,8 @@ static void its_irq_domain_deactivate(struct
irq_domain *domain,
struct its_device *its_dev = irq_data_get_irq_chip_data(d);
u32 event = its_get_event_id(d);
+ atomic_dec(per_cpu_ptr(&cpu_lpi_count,
+ its_dev->event_map.col_map[event]));
/* Stop the delivery of interrupts */
its_send_discard(its_dev, event);
}
--
Jazz is not dead. It just smells funny...
Powered by blists - more mailing lists