Message-ID: <20251218091050.55047-6-15927021679@163.com>
Date: Thu, 18 Dec 2025 17:09:45 +0800
From: Xiong Weimin <15927021679@....com>
To: "Michael S . Tsirkin" <mst@...hat.com>,
David Hildenbrand <david@...hat.com>,
Jason Wang <jasowang@...hat.com>,
Stefano Garzarella <sgarzare@...hat.com>,
Thomas Monjalon <thomas@...jalon.net>,
David Marchand <david.marchand@...hat.com>,
Luca Boccassi <bluca@...ian.org>,
Kevin Traynor <ktraynor@...hat.com>,
Christian Ehrhardt <christian.ehrhardt@...onical.com>,
Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
Eugenio Pérez <eperezma@...hat.com>,
Xueming Li <xuemingl@...dia.com>,
Maxime Coquelin <maxime.coquelin@...hat.com>,
Chenbo Xia <chenbox@...dia.com>,
Bruce Richardson <bruce.richardson@...el.com>
Cc: kvm@...r.kernel.org,
virtualization@...ts.linux.dev,
netdev@...r.kernel.org,
xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 05/10] drivers/infiniband/hw/virtio: Implement memory mapping and MR scatter-gather support
From: Xiong Weimin <xiongweimin@...inos.cn>
This commit adds memory management support to the virtio RDMA driver:
1. Port attributes
   - Reports Ethernet as the link layer (vrdma_port_link_layer)
   - Reports a fixed firmware version string (vrdma_get_fw_ver_str)
2. Memory Region scatter-gather mapping
   - Implements a two-level page table for efficient handling of large MRs
     (vrdma_set_page); see the indexing sketch below
   - Maps SG lists into an MR's page table and notifies the device
     (vrdma_map_mr_sg)
3. User-space memory mapping
   - Supports mmap() of CQ/QP resources (vrdma_mmap)
   - Maps vring descriptors, user buffers, and fast doorbells
   - Implements mmap entry cleanup (vrdma_mmap_free)
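
For reference, the lookup in vrdma_set_page() reduces to the following
indexing scheme (a sketch only, with a hypothetical helper name; the
pages_k table itself is allocated when the MR is created, outside this
patch):

	#define VIRTIO_RDMA_PAGE_PER_TBL 512

	/*
	 * Store the n-th page address in the two-level table: the L1
	 * array holds pointers to L2 tables of 512 entries each, so a
	 * large MR needs no single large contiguous allocation.
	 */
	static void tbl_store(u64 **pages_k, u32 n, u64 addr)
	{
		pages_k[n / VIRTIO_RDMA_PAGE_PER_TBL]
		       [n % VIRTIO_RDMA_PAGE_PER_TBL] = addr;
	}
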
Key features:
- Efficient two-level page table for MRs (512 entries per level)
- Virtio command to notify the backend of MR mappings
- Unified mmap handling for CQ/QP with size validation (see the layout
  sketch below)
- Optional fast-doorbell page mapping
- Error handling on all code paths
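
On the userspace side, the layout that vrdma_mmap() validates looks
schematically like this (a sketch, not part of the patch; cmd_fd,
qp_mmap_offset, vq_size, ubuf_size and fast_doorbell stand in for values
a provider library would obtain from the create-QP/CQ response):

	/* One contiguous mapping: vring, then user buffer, doorbell. */
	size_t len = vq_size + ubuf_size;	/* both page-aligned */

	if (fast_doorbell)
		len += PAGE_SIZE;		/* QP mappings only */

	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			  cmd_fd, qp_mmap_offset);
	/*
	 * base                       : vring descriptors (vq_size bytes)
	 * base + vq_size             : user buffer (ubuf_size bytes)
	 * base + vq_size + ubuf_size : fast doorbell page, if enabled
	 */
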
Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
.../infiniband/hw/virtio/vrdma_dev_api.h | 13 +
.../drivers/infiniband/hw/virtio/vrdma_ib.c | 250 ++++++++++++++++++
2 files changed, 263 insertions(+)
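Note for reviewers (not part of the patch): vrdma_map_mr_sg() hands the
device a single DMA address (cmd->pages), so the page list behind
mr->dma_pages is presumably DMA-mapped when the MR is allocated earlier
in this series. Purely for orientation, one way such a setup could look
(ntbl, l2_dma and the surrounding allocation code are hypothetical):

	/* Hypothetical MR-creation-time setup of the two-level table. */
	u32 ntbl = DIV_ROUND_UP(max_pages, VIRTIO_RDMA_PAGE_PER_TBL);
	int i;

	mr->pages_k = kcalloc(ntbl, sizeof(*mr->pages_k), GFP_KERNEL);
	for (i = 0; i < ntbl; i++)
		mr->pages_k[i] = dma_alloc_coherent(&vdev->vdev->dev,
						    PAGE_SIZE, &l2_dma[i],
						    GFP_KERNEL);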
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index da99f1f32..84dc05a96 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -200,6 +200,19 @@ struct vrdma_cmd_del_gid {
__u32 port_num;
};
+struct vrdma_cmd_map_mr_sg {
+ __u32 mrn; /* MR handle assigned at registration */
+ __u32 npages; /* number of entries in the page list */
+ __u64 start; /* IOVA at which the mapping starts */
+ __u64 length; /* length of the mapping in bytes */
+
+ __u64 pages; /* DMA address of the page list */
+};
+
+struct vrdma_rsp_map_mr_sg {
+ __u32 npages;
+};
+
#define VRDMA_CTRL_OK 0
#define VRDMA_CTRL_ERR 1
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index b4c16ddbb..738935e3d 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -12,6 +12,7 @@
#include <rdma/ib_umem.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_addr.h>
+#include <linux/mm_types.h>
#include "vrdma.h"
#include "vrdma_dev.h"
@@ -21,6 +22,8 @@
#include "vrdma_mmap.h"
#include "vrdma_queue.h"
+#define VIRTIO_RDMA_PAGE_PER_TBL 512
+
/**
* cmd_str - String representation of virtio RDMA control commands
*
@@ -1677,6 +1680,248 @@ static int vrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
return 0;
}
+static void vrdma_get_fw_ver_str(struct ib_device *device, char *str)
+{
+ snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", 1, 0, 0);
+}
+
+static enum rdma_link_layer vrdma_port_link_layer(struct ib_device *ibdev,
+ u32 port)
+{
+ return IB_LINK_LAYER_ETHERNET;
+}
+
+/**
+ * vrdma_set_page - Callback to collect physical pages from scatterlist
+ * @ibmr: Memory region being mapped
+ * @addr: Physical address of the current page
+ *
+ * This function is called by ib_sg_to_pages() for each page in the SG list.
+ * It stores the physical address into a two-level page table:
+ * - Level 1: an array of pointers to L2 tables
+ * - Level 2: each table holds up to 512 page addresses
+ *
+ * The layout allows efficient DMA mapping of large MRs without allocating one huge array.
+ *
+ * Context: Called from ib_sg_to_pages(); does not sleep.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if the number of pages exceeds the pre-allocated limit
+ */
+static int vrdma_set_page(struct ib_mr *ibmr, u64 addr)
+{
+ struct vrdma_mr *mr = to_vmr(ibmr);
+
+ if (mr->npages >= mr->max_pages) {
+ pr_debug("vRDMA: too many pages for MR (max=%u)\n", mr->max_pages);
+ return -ENOMEM;
+ }
+
+ /* Two-level indexing: [L1 index][L2 offset] */
+ mr->pages_k[mr->npages / VIRTIO_RDMA_PAGE_PER_TBL][mr->npages % VIRTIO_RDMA_PAGE_PER_TBL] = addr;
+ mr->npages++;
+ return 0;
+}
+
+/**
+ * vrdma_map_mr_sg - Map scatter-gather list into MR's page table and notify device
+ * @ibmr: The memory region to map
+ * @sg: Scatterlist describing user/kernel memory chunks
+ * @sg_nents: Number of entries in sg
+ * @sg_offset: Optional offset within the first sg element, passed through to ib_sg_to_pages()
+ *
+ * This function:
+ * 1. Walks the SG list via ib_sg_to_pages()
+ * 2. Populates software page table using vrdma_set_page()
+ * 3. Sends VIRTIO_RDMA_CMD_MAP_MR_SG to inform backend about IOVA range and page list
+ *
+ * Note: The actual DMA mapping was already done during ib_umem_get() or get_dma_mr().
+ * This only sets up hardware-visible metadata.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * Number of successfully mapped sg entries (>0)
+ * * Negative errno on failure
+ */
+static int vrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+ int sg_nents, unsigned int *sg_offset)
+{
+ struct vrdma_dev *vdev = to_vdev(ibmr->device);
+ struct vrdma_mr *mr = to_vmr(ibmr);
+ struct vrdma_cmd_map_mr_sg *cmd;
+ struct vrdma_rsp_map_mr_sg *rsp;
+ struct scatterlist in, out;
+ int mapped;
+ int rc;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+ if (!rsp) {
+ kfree(cmd);
+ return -ENOMEM;
+ }
+
+ /* Reset page counter before traversal */
+ mr->npages = 0;
+
+ /* Use RDMA core helper to walk SG and call vrdma_set_page() per page */
+ mapped = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, vrdma_set_page);
+ if (mapped < 0) {
+ dev_err(&vdev->vdev->dev, "Failed to map SG to pages: %d\n", mapped);
+ rc = mapped;
+ goto out_free;
+ }
+
+ /* Prepare command for device notification */
+ cmd->mrn = mr->mr_handle;
+ cmd->start = ibmr->iova;
+ cmd->length = ibmr->length;
+ cmd->npages = mr->npages;
+ cmd->pages = mr->dma_pages; /* Pre-DMA-mapped array of page addrs */
+
+ sg_init_one(&in, cmd, sizeof(*cmd));
+ sg_init_one(&out, rsp, sizeof(*rsp));
+
+ /* Notify the backend about the new mapping */
+ rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_MAP_MR_SG, &in, &out);
+ if (rc) {
+ dev_err(&vdev->vdev->dev,
+ "VIRTIO_RDMA_CMD_MAP_MR_SG failed for mrn=0x%x, err=%d\n",
+ mr->mr_handle, rc);
+ rc = -EIO;
+ goto out_free;
+ }
+
+ /* Success: return the number of processed sg entries */
+ rc = mapped;
+
+out_free:
+ kfree(rsp);
+ kfree(cmd);
+ return rc;
+}
+
+/**
+ * vrdma_mmap - Map device memory (vring, ubuf, doorbell) into user space
+ * @ctx: User's RDMA context
+ * @vma: VMA describing the mapping request
+ *
+ * Maps memory regions associated with QP/CQ virtqueues into user space.
+ * Supports three components:
+ * - vring descriptors (shared ring buffer)
+ * - user buffer (optional data exchange area)
+ * - fast doorbell page (if enabled)
+ *
+ * Uses PFN-based remapping for normal memory and I/O remapping for doorbells.
+ *
+ * Context: Called during mmap() in process context.
+ * Return:
+ * * 0 on success
+ * * -EINVAL for invalid parameters or layout mismatch
+ * * Negative errno if remapping fails
+ */
+int vrdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+ struct vrdma_ucontext *uctx = to_vucontext(ctx);
+ size_t requested_size = vma->vm_end - vma->vm_start;
+ struct rdma_user_mmap_entry *rdma_entry;
+ struct vrdma_user_mmap_entry *entry;
+ int rc;
+
+ /* Must be page-aligned */
+ if (vma->vm_start & (PAGE_SIZE - 1)) {
+ pr_warn("vRDMA: mmap start not page aligned: %#lx\n", vma->vm_start);
+ return -EINVAL;
+ }
+
+ /* Look up the registered mmap entry */
+ rdma_entry = rdma_user_mmap_entry_get(&uctx->ibucontext, vma);
+ if (!rdma_entry) {
+ pr_err("vRDMA: mmap lookup failed: pgoff=%lu size=%zu\n",
+ vma->vm_pgoff, requested_size);
+ return -EINVAL;
+ }
+ entry = to_ventry(rdma_entry);
+
+ switch (entry->mmap_type) {
+ case VRDMA_MMAP_CQ:
+ case VRDMA_MMAP_QP:
+ {
+ unsigned long vq_size = PAGE_ALIGN(vring_size(virtqueue_get_vring_size(entry->vq),
+ SMP_CACHE_BYTES));
+ unsigned long total_size = vq_size + entry->ubuf_size;
+
+ if (uctx->dev->fast_doorbell && entry->mmap_type == VRDMA_MMAP_QP)
+ total_size += PAGE_SIZE;
+
+ if (requested_size != total_size) {
+ pr_warn("vRDMA: mmap size mismatch: got=%zu, expected=%lu\n",
+ requested_size, total_size);
+ rc = -EINVAL;
+ goto out_put;
+ }
+
+ /* Map vring descriptor table */
+ rc = remap_pfn_range(vma, vma->vm_start,
+ page_to_pfn(virt_to_page(virtqueue_get_vring(entry->vq)->desc)),
+ vq_size, vma->vm_page_prot);
+ if (rc) {
+ pr_warn("vRDMA: remap vring failed: %d\n", rc);
+ goto out_put;
+ }
+
+ /* Map user buffer (shared data region) */
+ rc = remap_pfn_range(vma, vma->vm_start + vq_size,
+ page_to_pfn(virt_to_page(entry->user_buf)),
+ entry->ubuf_size, vma->vm_page_prot);
+ if (rc) {
+ pr_warn("vRDMA: remap ubuf failed: %d\n", rc);
+ goto out_put;
+ }
+
+ /* Optionally map fast doorbell register (QP only) */
+ if (uctx->dev->fast_doorbell && entry->mmap_type == VRDMA_MMAP_QP) {
+ unsigned long db_addr = vma->vm_start + vq_size + entry->ubuf_size;
+ struct virtqueue *vq = entry->vq;
+
+ rc = io_remap_pfn_range(vma, db_addr,
+ vmalloc_to_pfn(vq->priv),
+ PAGE_SIZE, vma->vm_page_prot);
+ if (rc) {
+ pr_warn("vRDMA: remap doorbell failed: %d\n", rc);
+ goto out_put;
+ }
+ }
+
+ break;
+ }
+ default:
+ pr_err("vRDMA: invalid mmap type %d\n", entry->mmap_type);
+ rc = -EINVAL;
+ goto out_put;
+ }
+
+ /* Success */
+ rdma_user_mmap_entry_put(rdma_entry);
+ return 0;
+
+out_put:
+ rdma_user_mmap_entry_put(rdma_entry);
+ return rc;
+}
+
+void vrdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
+{
+ struct vrdma_user_mmap_entry *entry = to_ventry(rdma_entry);
+
+ kfree(entry);
+}
+
static const struct ib_device_ops vrdma_dev_ops = {
.owner = THIS_MODULE,
.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -1701,6 +1946,11 @@ static const struct ib_device_ops vrdma_dev_ops = {
.dealloc_ucontext = vrdma_dealloc_ucontext,
.create_ah = vrdma_create_ah,
.destroy_ah = vrdma_destroy_ah,
+ .get_dev_fw_str = vrdma_get_fw_ver_str,
+ .get_link_layer = vrdma_port_link_layer,
+ .map_mr_sg = vrdma_map_mr_sg,
+ .mmap = vrdma_mmap,
+ .mmap_free = vrdma_mmap_free,
};
/**
--
2.43.0