[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <7c5c5267cba2c03f6650444d4879ba0d13004584.1745394536.git.leon@kernel.org>
Date: Wed, 23 Apr 2025 11:13:14 +0300
From: Leon Romanovsky <leon@...nel.org>
To: Marek Szyprowski <m.szyprowski@...sung.com>,
Jens Axboe <axboe@...nel.dk>,
Christoph Hellwig <hch@....de>,
Keith Busch <kbusch@...nel.org>
Cc: Jake Edge <jake@....net>,
Jonathan Corbet <corbet@....net>,
Jason Gunthorpe <jgg@...pe.ca>,
Zhu Yanjun <zyjzyj2000@...il.com>,
Robin Murphy <robin.murphy@....com>,
Joerg Roedel <joro@...tes.org>,
Will Deacon <will@...nel.org>,
Sagi Grimberg <sagi@...mberg.me>,
Bjorn Helgaas <bhelgaas@...gle.com>,
Logan Gunthorpe <logang@...tatee.com>,
Yishai Hadas <yishaih@...dia.com>,
Shameer Kolothum <shameerali.kolothum.thodi@...wei.com>,
Kevin Tian <kevin.tian@...el.com>,
Alex Williamson <alex.williamson@...hat.com>,
Jérôme Glisse <jglisse@...hat.com>,
Andrew Morton <akpm@...ux-foundation.org>,
linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org,
linux-block@...r.kernel.org,
linux-rdma@...r.kernel.org,
iommu@...ts.linux.dev,
linux-nvme@...ts.infradead.org,
linux-pci@...r.kernel.org,
kvm@...r.kernel.org,
linux-mm@...ck.org,
Niklas Schnelle <schnelle@...ux.ibm.com>,
Chuck Lever <chuck.lever@...cle.com>,
Luis Chamberlain <mcgrof@...nel.org>,
Matthew Wilcox <willy@...radead.org>,
Dan Williams <dan.j.williams@...el.com>,
Kanchan Joshi <joshi.k@...sung.com>,
Chaitanya Kulkarni <kch@...dia.com>,
Nitesh Shetty <nj.shetty@...sung.com>,
Leon Romanovsky <leonro@...dia.com>
Subject: [PATCH v9 23/24] nvme-pci: convert to blk_rq_dma_map
From: Christoph Hellwig <hch@....de>
Use the blk_rq_dma_map API to DMA map requests instead of
scatterlists.
Signed-off-by: Christoph Hellwig <hch@....de>
Tested-by: Jens Axboe <axboe@...nel.dk>
[ Leon: squashed optimization patch from Kanchan ]
Signed-off-by: Kanchan Joshi <joshi.k@...sung.com>
Signed-off-by: Nitesh Shetty <nj.shetty@...sung.com>
[ Leon: rewrote original patch due to rebases and addition of metadata support ]
Signed-off-by: Leon Romanovsky <leonro@...dia.com>
---
drivers/nvme/host/pci.c | 597 ++++++++++++++++++++++------------------
1 file changed, 332 insertions(+), 265 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7e93536d01cb..eb60a486331c 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -7,7 +7,7 @@
#include <linux/acpi.h>
#include <linux/async.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
+#include <linux/blk-mq-dma.h>
#include <linux/blk-integrity.h>
#include <linux/dmi.h>
#include <linux/init.h>
@@ -26,7 +26,6 @@
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/io-64-nonatomic-hi-lo.h>
#include <linux/sed-opal.h>
-#include <linux/pci-p2pdma.h>
#include "trace.h"
#include "nvme.h"
@@ -144,9 +143,6 @@ struct nvme_dev {
bool hmb;
struct sg_table *hmb_sgt;
- mempool_t *iod_mempool;
- mempool_t *iod_meta_mempool;
-
/* shadow doorbell buffer support: */
__le32 *dbbuf_dbs;
dma_addr_t dbbuf_dbs_dma_addr;
@@ -222,6 +218,7 @@ struct nvme_queue {
enum {
IOD_LARGE_DESCRIPTORS = 1, /* uses the full page sized descriptor pool */
+ IOD_SINGLE_SEGMENT = 2, /* single segment dma mapping */
};
/*
@@ -233,11 +230,11 @@ struct nvme_iod {
bool aborted;
u8 nr_descriptors; /* # of PRP/SGL descriptors */
unsigned int flags;
- unsigned int dma_len; /* length of single DMA segment mapping */
- dma_addr_t first_dma;
+ unsigned int total_len; /* length of the entire transfer */
+ unsigned int total_meta_len; /* length of the entire metadata transfer */
dma_addr_t meta_dma;
- struct sg_table sgt;
- struct sg_table meta_sgt;
+ struct dma_iova_state dma_state;
+ struct dma_iova_state dma_meta_state;
void *meta_list;
void *descriptors[NVME_MAX_NR_DESCRIPTORS];
};
@@ -546,9 +543,14 @@ static void nvme_free_descriptors(struct nvme_dev *dev, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = iod->first_dma;
+ dma_addr_t dma_addr;
int i;
+ if (iod->cmd.common.flags & NVME_CMD_SGL_METABUF)
+ dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
+ else
+ dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
+
if (iod->nr_descriptors == 1) {
dma_pool_free(nvme_dma_pool(dev, iod), iod->descriptors[0],
dma_addr);
@@ -564,67 +566,178 @@ static void nvme_free_descriptors(struct nvme_dev *dev, struct request *req)
}
}
+/*
+ * Undo the per-entry dma_map_page() mappings of a PRP based request.
+ *
+ * Called by nvme_unmap_data() when blk_rq_dma_unmap() returns false.  The
+ * DMA addresses are not stored separately: they are read back from PRP1,
+ * PRP2 and the PRP lists in iod->descriptors[], consuming iod->total_len
+ * bytes in total.
+ */
+static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+{
+	const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	enum dma_data_direction dir = rq_dma_dir(req);
+	int length = iod->total_len;
+	dma_addr_t dma_addr;
+	int i, desc;
+	__le64 *prp_list;
+	u32 dma_len;
+
+	/* PRP1 covers the bytes up to the first controller page boundary. */
+	dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
+	dma_len = min_t(u32, length,
+		NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
+	length -= dma_len;
+	if (!length) {
+		dma_unmap_page(dev->dev, dma_addr, dma_len, dir);
+		return;
+	}
+
+	/* PRP2 is a data pointer when the remainder fits into one page. */
+	if (length <= NVME_CTRL_PAGE_SIZE) {
+		dma_unmap_page(dev->dev, dma_addr, dma_len, dir);
+		dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
+		dma_unmap_page(dev->dev, dma_addr, length, dir);
+		return;
+	}
+
+	i = 0;
+	desc = 0;
+	prp_list = iod->descriptors[desc];
+	do {
+		/* Unmap the entry read in the previous round (PRP1 at first). */
+		dma_unmap_page(dev->dev, dma_addr, dma_len, dir);
+
+		/*
+		 * nvme_pci_setup_prps() turns the last slot of a full list
+		 * into a pointer to the next list whenever more than one
+		 * entry is still outstanding, so that slot holds no data:
+		 * continue with the first entry of the next list instead.
+		 */
+		if (i == last_prp && length > NVME_CTRL_PAGE_SIZE) {
+			prp_list = iod->descriptors[++desc];
+			i = 0;
+		}
+
+		dma_addr = le64_to_cpu(prp_list[i++]);
+		dma_len = min(length, NVME_CTRL_PAGE_SIZE);
+		length -= dma_len;
+	} while (length);
+
+	/* The loop only unmaps the previous entry; release the final one. */
+	dma_unmap_page(dev->dev, dma_addr, dma_len, dir);
+}
+
+
+/*
+ * Undo the per-entry dma_map_page() mappings of an SGL based request.
+ *
+ * With no descriptor allocated the single data descriptor lives in the SQE
+ * itself; otherwise the SQE length field gives the size of the descriptor
+ * list stored in iod->descriptors[0].
+ */
+static void nvme_free_sgls(struct nvme_dev *dev, struct request *req)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	const enum dma_data_direction dir = rq_dma_dir(req);
+	const unsigned int sqe_len =
+		le32_to_cpu(iod->cmd.common.dptr.sgl.length);
+	const struct nvme_sgl_desc *sge, *end;
+
+	if (!iod->nr_descriptors) {
+		dma_unmap_page(dev->dev,
+			       le64_to_cpu(iod->cmd.common.dptr.sgl.addr),
+			       sqe_len, dir);
+		return;
+	}
+
+	sge = iod->descriptors[0];
+	end = sge + sqe_len / sizeof(*sge);
+	for (; sge < end; sge++)
+		dma_unmap_page(dev->dev, le64_to_cpu(sge->addr),
+			       le32_to_cpu(sge->length), dir);
+}
+
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ enum dma_data_direction dir = rq_dma_dir(req);
- if (iod->dma_len) {
- dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
- rq_dma_dir(req));
+ /* Single mapping: its address is read back from the PRP1 slot. */
+ if (iod->flags & IOD_SINGLE_SEGMENT) {
+ dma_unmap_page(dev->dev, le64_to_cpu(iod->cmd.common.dptr.prp1),
+ iod->total_len, dir);
return;
}
- WARN_ON_ONCE(!iod->sgt.nents);
+ /*
+ * When blk_rq_dma_unmap() returns false the entries are unmapped one
+ * by one from the addresses recorded in the SGL or PRP descriptors.
+ */
+ if (!blk_rq_dma_unmap(req, dev->dev, &iod->dma_state, iod->total_len)) {
+ bool is_sgl = iod->cmd.common.flags & NVME_CMD_SGL_METABUF;
+
+ if (is_sgl)
+ nvme_free_sgls(dev, req);
+ else
+ nvme_free_prps(dev, req);
+ }
- dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
- nvme_free_descriptors(dev, req);
- mempool_free(iod->sgt.sgl, dev->iod_mempool);
+ if (iod->nr_descriptors)
+ nvme_free_descriptors(dev, req);
}
-static void nvme_print_sgl(struct scatterlist *sgl, int nents)
+/*
+ * Try to map a single-segment request with one dma_map_bvec() call and
+ * describe it with PRP1/PRP2 only.
+ *
+ * Returns false when the request has to take the generic mapping path, and
+ * true when it was handled here, with the outcome stored in iter->status.
+ */
+static bool nvme_try_setup_prp_simple(struct nvme_dev *dev, struct request *req,
+ struct nvme_rw_command *cmnd,
+ struct blk_dma_iter *iter)
{
- int i;
- struct scatterlist *sg;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct bio_vec bv = req_bvec(req);
+ unsigned int page_off = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
+ unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - page_off;
+
+ if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA))
+ return false;
+ /* Only handle buffers that fit into two controller pages. */
+ if (page_off + bv.bv_len > NVME_CTRL_PAGE_SIZE * 2)
+ return false;
- for_each_sg(sgl, sg, nents, i) {
- dma_addr_t phys = sg_phys(sg);
- pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
- "dma_address:%pad dma_length:%d\n",
- i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
- sg_dma_len(sg));
+ iter->addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+ if (dma_mapping_error(dev->dev, iter->addr)) {
+ iter->status = BLK_STS_RESOURCE;
+ return true;
}
+
+ iod->flags |= IOD_SINGLE_SEGMENT;
+ iod->total_len = bv.bv_len;
+ cmnd->dptr.prp1 = cpu_to_le64(iter->addr);
+ /* PRP2 is only needed when the buffer runs past the first page. */
+ if (bv.bv_len <= first_prp_len)
+ cmnd->dptr.prp2 = 0;
+ else
+ cmnd->dptr.prp2 = cpu_to_le64(iter->addr + first_prp_len);
+ iter->status = BLK_STS_OK;
+ return true;
}
static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd)
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- int length = blk_rq_payload_bytes(req);
- struct scatterlist *sg = iod->sgt.sgl;
- int dma_len = sg_dma_len(sg);
- u64 dma_addr = sg_dma_address(sg);
- int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
+ struct nvme_rw_command *cmnd = &iod->cmd.rw;
+ unsigned int length = blk_rq_payload_bytes(req);
+ struct blk_dma_iter iter;
+ dma_addr_t prp1_dma, prp2_dma = 0;
+ unsigned int prp_len, i;
__le64 *prp_list;
- dma_addr_t prp_dma;
- int i;
+ unsigned int nr_segments = blk_rq_nr_phys_segments(req);
- length -= (NVME_CTRL_PAGE_SIZE - offset);
- if (length <= 0) {
- iod->first_dma = 0;
- goto done;
+ if (nr_segments == 1) {
+ if (nvme_try_setup_prp_simple(dev, req, cmnd, &iter))
+ return iter.status;
}
- dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
- if (dma_len) {
- dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
- } else {
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
+ if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
+ return iter.status;
+
+ /*
+ * PRP1 always points to the start of the DMA transfers.
+ *
+ * This is the only PRP (except for the list entries) that could be
+ * non-aligned.
+ */
+ prp1_dma = iter.addr;
+ prp_len = min(length, NVME_CTRL_PAGE_SIZE -
+ (iter.addr & (NVME_CTRL_PAGE_SIZE - 1)));
+ iod->total_len += prp_len;
+ iter.addr += prp_len;
+ iter.len -= prp_len;
+ length -= prp_len;
+ if (!length)
+ goto done;
+
+ if (!iter.len) {
+ if (!blk_rq_dma_map_iter_next(req, dev->dev, &iod->dma_state,
+ &iter)) {
+ if (WARN_ON_ONCE(!iter.status))
+ goto bad_sgl;
+ goto done;
+ }
}
+ /*
+ * PRP2 is usually a list, but can point to data if all data to be
+ * transferred fits into PRP1 + PRP2:
+ */
if (length <= NVME_CTRL_PAGE_SIZE) {
- iod->first_dma = dma_addr;
+ prp2_dma = iter.addr;
+ iod->total_len += length;
goto done;
}
@@ -633,58 +746,83 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
iod->flags |= IOD_LARGE_DESCRIPTORS;
prp_list = dma_pool_alloc(nvme_dma_pool(dev, iod), GFP_ATOMIC,
- &prp_dma);
- if (!prp_list)
- return BLK_STS_RESOURCE;
+ &prp2_dma);
+ if (!prp_list) {
+ iter.status = BLK_STS_RESOURCE;
+ goto done;
+ }
iod->descriptors[iod->nr_descriptors++] = prp_list;
- iod->first_dma = prp_dma;
+
i = 0;
for (;;) {
+ prp_list[i++] = cpu_to_le64(iter.addr);
+ prp_len = min(length, NVME_CTRL_PAGE_SIZE);
+ if (WARN_ON_ONCE(iter.len < prp_len))
+ goto bad_sgl;
+
+ iod->total_len += prp_len;
+ iter.addr += prp_len;
+ iter.len -= prp_len;
+ length -= prp_len;
+ if (!length)
+ break;
+
+ if (iter.len == 0) {
+ if (!blk_rq_dma_map_iter_next(req, dev->dev,
+ &iod->dma_state, &iter)) {
+ if (WARN_ON_ONCE(!iter.status))
+ goto bad_sgl;
+ goto done;
+ }
+ }
+
+ /*
+ * If we've filled the entire descriptor, allocate a new one that
+ * is pointed to by the last entry in the previous PRP list. To
+ * accommodate that, move the last actual entry to the new
+ * descriptor.
+ */
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
+ dma_addr_t prp_list_dma;
prp_list = dma_pool_alloc(dev->prp_page_pool,
- GFP_ATOMIC, &prp_dma);
- if (!prp_list)
- goto free_prps;
+ GFP_ATOMIC, &prp_list_dma);
+ if (!prp_list) {
+ iter.status = BLK_STS_RESOURCE;
+ goto done;
+ }
iod->descriptors[iod->nr_descriptors++] = prp_list;
+
prp_list[0] = old_prp_list[i - 1];
- old_prp_list[i - 1] = cpu_to_le64(prp_dma);
+ old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
i = 1;
}
- prp_list[i++] = cpu_to_le64(dma_addr);
- dma_len -= NVME_CTRL_PAGE_SIZE;
- dma_addr += NVME_CTRL_PAGE_SIZE;
- length -= NVME_CTRL_PAGE_SIZE;
- if (length <= 0)
- break;
- if (dma_len > 0)
- continue;
- if (unlikely(dma_len < 0))
- goto bad_sgl;
- sg = sg_next(sg);
- dma_addr = sg_dma_address(sg);
- dma_len = sg_dma_len(sg);
}
+
done:
- cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
- return BLK_STS_OK;
-free_prps:
- nvme_free_descriptors(dev, req);
- return BLK_STS_RESOURCE;
+ /*
+ * nvme_unmap_data uses the DPTR field in the SQE to tear down the
+ * mapping, so initialize it even for failures.
+ */
+ cmnd->dptr.prp1 = cpu_to_le64(prp1_dma);
+ cmnd->dptr.prp2 = cpu_to_le64(prp2_dma);
+ if (unlikely(iter.status))
+ nvme_unmap_data(dev, req);
+ return iter.status;
+
bad_sgl:
- WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
- "Invalid SGL for payload:%d nents:%d\n",
- blk_rq_payload_bytes(req), iod->sgt.nents);
+ dev_err_once(dev->dev,
+ "Incorrectly formed request for payload:%d nents:%d\n",
+ blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
return BLK_STS_IOERR;
}
+/* Fill @sge as a data descriptor for the DMA range currently held in @iter. */
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
- struct scatterlist *sg)
+ struct blk_dma_iter *iter)
{
- sge->addr = cpu_to_le64(sg_dma_address(sg));
- sge->length = cpu_to_le32(sg_dma_len(sg));
+ sge->length = cpu_to_le32(iter->len);
+ sge->addr = cpu_to_le64(iter->addr);
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
@@ -696,21 +834,60 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
+/*
+ * Try to map a single-segment request with one dma_map_bvec() call and
+ * describe it with the data descriptor embedded in the SQE.
+ *
+ * Returns false when the request has to take the generic mapping path, and
+ * true when it was handled here, with the outcome stored in iter->status.
+ */
+static bool nvme_try_setup_sgl_simple(struct nvme_dev *dev, struct request *req,
+				      struct nvme_rw_command *cmnd,
+				      struct blk_dma_iter *iter)
+{
+	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+	struct bio_vec bv = req_bvec(req);
+	unsigned int page_off = bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
+
+	if (IS_ENABLED(CONFIG_PCI_P2PDMA) && (req->cmd_flags & REQ_P2PDMA))
+		return false;
+	/* Only handle buffers that fit into two controller pages. */
+	if (page_off + bv.bv_len > NVME_CTRL_PAGE_SIZE * 2)
+		return false;
+
+	iter->addr = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
+	if (dma_mapping_error(dev->dev, iter->addr)) {
+		iter->status = BLK_STS_RESOURCE;
+		return true;
+	}
+
+	iod->flags |= IOD_SINGLE_SEGMENT;
+	iod->total_len = bv.bv_len;
+	cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
+	cmnd->dptr.sgl.length = cpu_to_le32(bv.bv_len);
+	cmnd->dptr.sgl.addr = cpu_to_le64(iter->addr);
+	iter->status = BLK_STS_OK;
+	return true;
+}
+
static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmd)
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_rw_command *cmd = &iod->cmd.rw;
+ unsigned int entries = blk_rq_nr_phys_segments(req);
struct nvme_sgl_desc *sg_list;
- struct scatterlist *sg = iod->sgt.sgl;
- unsigned int entries = iod->sgt.nents;
+ struct blk_dma_iter iter;
dma_addr_t sgl_dma;
- int i = 0;
+ unsigned int mapped = 0;
+ unsigned int nr_segments = blk_rq_nr_phys_segments(req);
/* setting the transfer type as SGL */
cmd->flags = NVME_CMD_SGL_METABUF;
- if (entries == 1) {
- nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
+ if (nr_segments == 1) {
+ if (nvme_try_setup_sgl_simple(dev, req, cmd, &iter))
+ return iter.status;
+ }
+
+ if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
+ return iter.status;
+
+ if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
+ nvme_pci_sgl_set_data(&cmd->dptr.sgl, &iter);
+ iod->total_len += iter.len;
return BLK_STS_OK;
}
@@ -721,168 +898,109 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
if (!sg_list)
return BLK_STS_RESOURCE;
iod->descriptors[iod->nr_descriptors++] = sg_list;
- iod->first_dma = sgl_dma;
- nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries);
do {
- nvme_pci_sgl_set_data(&sg_list[i++], sg);
- sg = sg_next(sg);
- } while (--entries > 0);
+ if (WARN_ON_ONCE(mapped == entries)) {
+ iter.status = BLK_STS_IOERR;
+ break;
+ }
+ nvme_pci_sgl_set_data(&sg_list[mapped++], &iter);
+ iod->total_len += iter.len;
+ } while (blk_rq_dma_map_iter_next(req, dev->dev, &iod->dma_state,
+ &iter));
- return BLK_STS_OK;
+ nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, mapped);
+ if (unlikely(iter.status))
+ nvme_free_sgls(dev, req);
+ return iter.status;
}
-static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd,
- struct bio_vec *bv)
+/* DMA map the request payload, describing it with either SGLs or PRPs. */
+static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
- unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
-
- iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->first_dma))
- return BLK_STS_RESOURCE;
- iod->dma_len = bv->bv_len;
-
- cmnd->dptr.prp1 = cpu_to_le64(iod->first_dma);
- if (bv->bv_len > first_prp_len)
- cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma + first_prp_len);
- else
- cmnd->dptr.prp2 = 0;
- return BLK_STS_OK;
+ /* PRPs are used unless nvme_pci_use_sgls() selects SGLs. */
+ if (!nvme_pci_use_sgls(dev, req, blk_rq_nr_phys_segments(req)))
+ return nvme_pci_setup_prps(dev, req);
+ return nvme_pci_setup_sgls(dev, req);
}
-static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
- struct request *req, struct nvme_rw_command *cmnd,
- struct bio_vec *bv)
+/*
+ * Release the DMA mappings of the request's metadata and, for the SGL case,
+ * the descriptor buffer taken from prp_small_pool.
+ *
+ * IOD_SINGLE_SEGMENT is deliberately not looked at here: it is only set by
+ * the data fast paths, and that mapping is torn down by nvme_unmap_data().
+ */
+static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+ struct request *req)
{
+ unsigned int entries = req->nr_integrity_segments;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_sgl_desc *sg_list = iod->meta_list;
+ enum dma_data_direction dir = rq_dma_dir(req);
- iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
- if (dma_mapping_error(dev->dev, iod->first_dma))
- return BLK_STS_RESOURCE;
- iod->dma_len = bv->bv_len;
-
- cmnd->flags = NVME_CMD_SGL_METABUF;
- cmnd->dptr.sgl.addr = cpu_to_le64(iod->first_dma);
- cmnd->dptr.sgl.length = cpu_to_le32(iod->dma_len);
- cmnd->dptr.sgl.type = NVME_SGL_FMT_DATA_DESC << 4;
- return BLK_STS_OK;
-}
-
-static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- blk_status_t ret = BLK_STS_RESOURCE;
- int rc;
-
- if (blk_rq_nr_phys_segments(req) == 1) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct bio_vec bv = req_bvec(req);
-
- if (!is_pci_p2pdma_page(bv.bv_page)) {
- if (!nvme_pci_metadata_use_sgls(dev, req) &&
- (bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
- bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
- return nvme_setup_prp_simple(dev, req,
- &cmnd->rw, &bv);
-
- if (nvmeq->qid && sgl_threshold &&
- nvme_ctrl_sgl_supported(&dev->ctrl))
- return nvme_setup_sgl_simple(dev, req,
- &cmnd->rw, &bv);
- }
- }
- iod->dma_len = 0;
- iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC);
- if (!iod->sgt.sgl)
- return BLK_STS_RESOURCE;
- sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req));
- iod->sgt.orig_nents = blk_rq_map_sg(req, iod->sgt.sgl);
- if (!iod->sgt.orig_nents)
- goto out_free_sg;
+ /*
+ * When blk_rq_dma_unmap() returns false the ranges are unmapped one
+ * by one here.
+ */
+ if (!blk_rq_dma_unmap(req, dev->dev, &iod->dma_meta_state,
+ iod->total_meta_len)) {
+ if (iod->cmd.common.flags & NVME_CMD_SGL_METASEG) {
+ unsigned int i;
- rc = dma_map_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req),
- DMA_ATTR_NO_WARN);
- if (rc) {
- if (rc == -EREMOTEIO)
- ret = BLK_STS_TARGET;
- goto out_free_sg;
+ for (i = 0; i < entries; i++)
+ dma_unmap_page(dev->dev,
+ le64_to_cpu(sg_list[i].addr),
+ le32_to_cpu(sg_list[i].length), dir);
+ } else {
+ /* No SGL: unmap iod->meta_dma itself, nothing to free. */
+ dma_unmap_page(dev->dev, iod->meta_dma,
+ rq_integrity_vec(req).bv_len, dir);
+ return;
+ }
}
- if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
- else
- ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
- if (ret != BLK_STS_OK)
- goto out_unmap_sg;
- return BLK_STS_OK;
-
-out_unmap_sg:
- dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
-out_free_sg:
- mempool_free(iod->sgt.sgl, dev->iod_mempool);
- return ret;
+ dma_pool_free(dev->prp_small_pool, iod->meta_list, iod->meta_dma);
}
+/*
+ * Map the request for a metadata SGL transfer and build the descriptor list
+ * in a buffer from prp_small_pool that cmd->metadata points to.
+ *
+ * Returns BLK_STS_OK on success, the status reported by the DMA iterator if
+ * mapping fails, BLK_STS_RESOURCE if the descriptor buffer cannot be
+ * allocated, or BLK_STS_IOERR if the iterator yields more ranges than
+ * req->nr_integrity_segments.
+ */
static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
struct request *req)
{
+ unsigned int entries = req->nr_integrity_segments;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_rw_command *cmnd = &iod->cmd.rw;
+ struct nvme_rw_command *cmd = &iod->cmd.rw;
struct nvme_sgl_desc *sg_list;
- struct scatterlist *sgl, *sg;
- unsigned int entries;
+ struct blk_dma_iter iter;
+ unsigned int mapped = 0;
dma_addr_t sgl_dma;
- int rc, i;
-
- iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
- if (!iod->meta_sgt.sgl)
- return BLK_STS_RESOURCE;
- sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
- iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
- iod->meta_sgt.sgl);
- if (!iod->meta_sgt.orig_nents)
- goto out_free_sg;
+ cmd->flags = NVME_CMD_SGL_METASEG;
- rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
- DMA_ATTR_NO_WARN);
- if (rc)
- goto out_free_sg;
+ if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_meta_state,
+ &iter))
+ return iter.status;
sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list)
- goto out_unmap_sg;
+ return BLK_STS_RESOURCE;
- entries = iod->meta_sgt.nents;
iod->meta_list = sg_list;
iod->meta_dma = sgl_dma;
+ cmd->metadata = cpu_to_le64(sgl_dma);
- cmnd->flags = NVME_CMD_SGL_METASEG;
- cmnd->metadata = cpu_to_le64(sgl_dma);
-
- sgl = iod->meta_sgt.sgl;
- if (entries == 1) {
- nvme_pci_sgl_set_data(sg_list, sgl);
+ if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_meta_state)) {
+ nvme_pci_sgl_set_data(sg_list, &iter);
+ iod->total_meta_len += iter.len;
return BLK_STS_OK;
}
- sgl_dma += sizeof(*sg_list);
- nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
- for_each_sg(sgl, sg, entries, i)
- nvme_pci_sgl_set_data(&sg_list[i + 1], sg);
-
- return BLK_STS_OK;
+ do {
+ if (WARN_ON_ONCE(mapped == entries)) {
+ iter.status = BLK_STS_IOERR;
+ break;
+ }
+ nvme_pci_sgl_set_data(&sg_list[mapped++], &iter);
+ /*
+ * Metadata bytes are accounted in total_meta_len, which is what
+ * nvme_unmap_metadata() hands to blk_rq_dma_unmap(); total_len
+ * belongs to the data payload.
+ */
+ iod->total_meta_len += iter.len;
+ } while (blk_rq_dma_map_iter_next(req, dev->dev, &iod->dma_meta_state,
+ &iter));
-out_unmap_sg:
- dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
-out_free_sg:
- mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
- return BLK_STS_RESOURCE;
+ /*
+ * NOTE(review): the data descriptors above start at sg_list[0] and the
+ * segment descriptor is written to sg_list as well; the code removed
+ * by this patch kept slot 0 for the segment descriptor and stored the
+ * data descriptors from sg_list[1] on. Confirm the intended layout.
+ */
+ nvme_pci_sgl_set_seg(sg_list, sgl_dma, mapped);
+ if (unlikely(iter.status))
+ nvme_unmap_metadata(dev, req);
+ return iter.status;
}
static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
@@ -914,15 +1032,15 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
iod->aborted = false;
iod->nr_descriptors = 0;
iod->flags = 0;
- iod->sgt.nents = 0;
- iod->meta_sgt.nents = 0;
+ iod->total_len = 0;
+ iod->total_meta_len = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
return ret;
if (blk_rq_nr_phys_segments(req)) {
- ret = nvme_map_data(dev, req, &iod->cmd);
+ ret = nvme_map_data(dev, req);
if (ret)
goto out_free_cmd;
}
@@ -1026,23 +1144,6 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
*rqlist = requeue_list;
}
-static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
- struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
- if (!iod->meta_sgt.nents) {
- dma_unmap_page(dev->dev, iod->meta_dma,
- rq_integrity_vec(req).bv_len,
- rq_dma_dir(req));
- return;
- }
-
- dma_pool_free(dev->prp_small_pool, iod->meta_list, iod->meta_dma);
- dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
- mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
-}
-
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
@@ -2859,31 +2960,6 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
dma_pool_destroy(dev->prp_small_pool);
}
-static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
-{
- size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
- size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
-
- dev->iod_mempool = mempool_create_node(1,
- mempool_kmalloc, mempool_kfree,
- (void *)alloc_size, GFP_KERNEL,
- dev_to_node(dev->dev));
- if (!dev->iod_mempool)
- return -ENOMEM;
-
- dev->iod_meta_mempool = mempool_create_node(1,
- mempool_kmalloc, mempool_kfree,
- (void *)meta_size, GFP_KERNEL,
- dev_to_node(dev->dev));
- if (!dev->iod_meta_mempool)
- goto free;
-
- return 0;
-free:
- mempool_destroy(dev->iod_mempool);
- return -ENOMEM;
-}
-
static void nvme_free_tagset(struct nvme_dev *dev)
{
if (dev->tagset.tags)
@@ -3252,15 +3328,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto out_dev_unmap;
- result = nvme_pci_alloc_iod_mempool(dev);
- if (result)
- goto out_release_prp_pools;
-
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
result = nvme_pci_enable(dev);
if (result)
- goto out_release_iod_mempool;
+ goto out_release_prp_pools;
result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
&nvme_mq_admin_ops, sizeof(struct nvme_iod));
@@ -3327,9 +3399,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
nvme_dev_remove_admin(dev);
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
-out_release_iod_mempool:
- mempool_destroy(dev->iod_mempool);
- mempool_destroy(dev->iod_meta_mempool);
out_release_prp_pools:
nvme_release_prp_pools(dev);
out_dev_unmap:
@@ -3394,8 +3463,6 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dev_remove_admin(dev);
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
- mempool_destroy(dev->iod_mempool);
- mempool_destroy(dev->iod_meta_mempool);
nvme_release_prp_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
--
2.49.0
Powered by blists - more mailing lists