[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2a30a07f-982c-c291-e263-0cf72ec61235@grimberg.me>
Date: Tue, 20 Aug 2019 10:33:38 -0700
From: Sagi Grimberg <sagi@...mberg.me>
To: longli@...uxonhyperv.com, Ingo Molnar <mingo@...hat.com>,
Peter Zijlstra <peterz@...radead.org>,
Keith Busch <keith.busch@...el.com>, Jens Axboe <axboe@...com>,
Christoph Hellwig <hch@....de>, linux-nvme@...ts.infradead.org,
linux-kernel@...r.kernel.org
Cc: Long Li <longli@...rosoft.com>
Subject: Re: [PATCH 3/3] nvme: complete request in work queue on CPU with
flooded interrupts
> From: Long Li <longli@...rosoft.com>
>
> When a NVMe hardware queue is mapped to several CPU queues, it is possible
> that the CPU this hardware queue is bound to is flooded by returning I/O for
> other CPUs.
>
> For example, consider the following scenario:
> 1. CPU 0, 1, 2 and 3 share the same hardware queue
> 2. the hardware queue interrupts CPU 0 for I/O response
> 3. processes from CPU 1, 2 and 3 keep sending I/Os
>
> CPU 0 may be flooded with interrupts from NVMe device that are I/O responses
> for CPU 1, 2 and 3. Under heavy I/O load, it is possible that CPU 0 spends
> all the time serving NVMe and other system interrupts, but doesn't have a
> chance to run in process context.
>
> To fix this, CPU 0 can schedule a work to complete the I/O request when it
> detects the scheduler is not making progress. This serves multiple purposes:
>
> 1. This CPU has to be scheduled to complete the request. The other CPUs can't
> issue more I/Os until some previous I/Os are completed. This helps this CPU
> get out of NVMe interrupts.
>
> 2. This acts as a throttling mechanism for NVMe devices, in that it cannot
> starve a CPU while servicing I/Os from other CPUs.
>
> 3. This CPU can make progress on RCU and other work items on its queue.
The problem is indeed real, but this is the wrong approach in my mind.
We already have irqpoll, which takes care of properly budgeting polling
cycles and not hogging the CPU.
I've sent an RFC for this particular problem before [1]. At the time, IIRC,
Christoph suggested that we poll the first batch directly from
the irq context and reap the rest in the irqpoll handler.
[1]:
http://lists.infradead.org/pipermail/linux-nvme/2016-October/006497.html
How about something like this instead:
--
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 71127a366d3c..84bf16d75109 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -24,6 +24,7 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>
+#include <linux/irq_poll.h>
#include "trace.h"
#include "nvme.h"
@@ -32,6 +33,7 @@
#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion))
#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc))
+#define NVME_POLL_BUDGET_IRQ 256
/*
* These can be higher, but we need to ensure that any command doesn't
@@ -189,6 +191,7 @@ struct nvme_queue {
u32 *dbbuf_cq_db;
u32 *dbbuf_sq_ei;
u32 *dbbuf_cq_ei;
+ struct irq_poll iop;
struct completion delete_done;
};
@@ -1015,6 +1018,23 @@ static inline int nvme_process_cq(struct nvme_queue *nvmeq, u16 *start,
return found;
}
+static int nvme_irqpoll_handler(struct irq_poll *iop, int budget)
+{
+ struct nvme_queue *nvmeq = container_of(iop, struct nvme_queue, iop);
+ struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
+ u16 start, end;
+ int completed;
+
+ completed = nvme_process_cq(nvmeq, &start, &end, budget);
+ nvme_complete_cqes(nvmeq, start, end);
+ if (completed < budget) {
+ irq_poll_complete(&nvmeq->iop);
+ enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector));
+ }
+
+ return completed;
+}
+
static irqreturn_t nvme_irq(int irq, void *data)
{
struct nvme_queue *nvmeq = data;
@@ -1028,12 +1048,16 @@ static irqreturn_t nvme_irq(int irq, void *data)
rmb();
if (nvmeq->cq_head != nvmeq->last_cq_head)
ret = IRQ_HANDLED;
- nvme_process_cq(nvmeq, &start, &end, -1);
+ nvme_process_cq(nvmeq, &start, &end, NVME_POLL_BUDGET_IRQ);
nvmeq->last_cq_head = nvmeq->cq_head;
wmb();
if (start != end) {
nvme_complete_cqes(nvmeq, start, end);
+ if (nvme_cqe_pending(nvmeq)) {
+ disable_irq_nosync(irq);
+ irq_poll_sched(&nvmeq->iop);
+ }
return IRQ_HANDLED;
}
@@ -1347,6 +1371,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
static void nvme_free_queue(struct nvme_queue *nvmeq)
{
+ irq_poll_disable(&nvmeq->iop);
dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
if (!nvmeq->sq_cmds)
@@ -1481,6 +1506,7 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
nvmeq->dev = dev;
spin_lock_init(&nvmeq->sq_lock);
spin_lock_init(&nvmeq->cq_poll_lock);
+ irq_poll_init(&nvmeq->iop, NVME_POLL_BUDGET_IRQ, nvme_irqpoll_handler);
nvmeq->cq_head = 0;
nvmeq->cq_phase = 1;
nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
--
Powered by blists - more mailing lists