Message-ID: <20251218091050.55047-6-15927021679@163.com>
Date: Thu, 18 Dec 2025 17:09:45 +0800
From: Xiong Weimin <15927021679@....com>
To: "Michael S . Tsirkin" <mst@...hat.com>,
	David Hildenbrand <david@...hat.com>,
	Jason Wang <jasowang@...hat.com>,
	Stefano Garzarella <sgarzare@...hat.com>,
	Thomas Monjalon <thomas@...jalon.net>,
	David Marchand <david.marchand@...hat.com>,
	Luca Boccassi <bluca@...ian.org>,
	Kevin Traynor <ktraynor@...hat.com>,
	Christian Ehrhardt <christian.ehrhardt@...onical.com>,
	Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
	Eugenio Pérez <eperezma@...hat.com>,
	Xueming Li <xuemingl@...dia.com>,
	Maxime Coquelin <maxime.coquelin@...hat.com>,
	Chenbo Xia <chenbox@...dia.com>,
	Bruce Richardson <bruce.richardson@...el.com>
Cc: kvm@...r.kernel.org,
	virtualization@...ts.linux.dev,
	netdev@...r.kernel.org,
	xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 05/10] drivers/infiniband/hw/virtio: Implement memory mapping and MR scatter-gather support

From: xiongweimin <xiongweimin@...inos.cn>

Add memory mapping and MR scatter-gather support to the virtio RDMA driver:

1. Port link layer and firmware version reporting
   - Reports Ethernet as the link layer (vrdma_port_link_layer)
   - Reports the firmware version string (vrdma_get_fw_ver_str)

2. Memory Region scatter-gather mapping
   - Implements a two-level page table for efficient large-MR handling
     (vrdma_set_page); see the indexing sketch after this list
   - Adds SG-list-to-MR mapping with device notification (vrdma_map_mr_sg)

3. User-space memory mapping
   - Supports mmap() for CQ/QP resources (vrdma_mmap)
   - Handles vring descriptors, user buffers, and fast doorbells
   - Implements mmap entry cleanup (vrdma_mmap_free)
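
To illustrate the two-level indexing that vrdma_set_page() uses in the
patch below: one L1 slot covers 512 pages (2 MiB with 4 KiB pages), so
large MRs never need one huge contiguous array of page addresses. A
minimal stand-alone sketch; the struct, the names, and the lazy table
allocation are simplified stand-ins for illustration, not the driver's
actual vrdma_mr layout (the driver pre-allocates its tables when the MR
is created):

	#include <stdint.h>
	#include <stdlib.h>

	#define PAGES_PER_TBL 512	/* mirrors VIRTIO_RDMA_PAGE_PER_TBL */

	struct two_level_tbl {
		uint64_t *l2[PAGES_PER_TBL];	/* L1: pointers to L2 tables */
		unsigned int npages;
		unsigned int max_pages;
	};

	/* Store one page address at [npages / 512][npages % 512].
	 * Assumes *t was zero-initialized by the caller. */
	static int tbl_set_page(struct two_level_tbl *t, uint64_t addr)
	{
		unsigned int l1 = t->npages / PAGES_PER_TBL;

		if (t->npages >= t->max_pages)
			return -1;	/* the driver returns -ENOMEM here */
		if (!t->l2[l1]) {	/* lazy allocation, for brevity */
			t->l2[l1] = calloc(PAGES_PER_TBL, sizeof(uint64_t));
			if (!t->l2[l1])
				return -1;
		}
		t->l2[l1][t->npages % PAGES_PER_TBL] = addr;
		t->npages++;
		return 0;
	}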

Key features:
- Efficient 2-level page table for MRs (512 entries per level)
- Virtio command for backend MR mapping notification
- Unified mmap handling for CQ/QP with size validation (caller-side layout
  sketched below)
- Optional fast-doorbell page mapping for QPs
- Error handling on all code paths
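
The size validation implies a simple mmap() contract: user space maps one
contiguous window of vring, then user buffer, then (for QPs with fast
doorbell) one doorbell page. A minimal caller-side sketch, assuming
hypothetical names: fd is the opened uverbs device, and mmap_key, vq_size
and ubuf_size would come from the driver's create-CQ/QP response in a
real flow:

	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	static void *map_queue_window(int fd, off_t mmap_key, size_t vq_size,
				      size_t ubuf_size, int has_fast_doorbell)
	{
		size_t total = vq_size + ubuf_size;
		char *base;

		if (has_fast_doorbell)
			total += (size_t)sysconf(_SC_PAGESIZE);

		/* One mmap() call; vrdma_mmap() rejects any other size. */
		base = mmap(NULL, total, PROT_READ | PROT_WRITE, MAP_SHARED,
			    fd, mmap_key);
		if (base == MAP_FAILED)
			return NULL;

		printf("vring at %p, ubuf at %p\n",
		       (void *)base, (void *)(base + vq_size));
		return base;
	}

The driver enforces the exact total, so a caller that miscomputes any
component fails fast with -EINVAL.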

Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
 .../infiniband/hw/virtio/vrdma_dev_api.h      |  13 +
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 246 ++++++++++++++++++
 2 files changed, 259 insertions(+)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index da99f1f32..84dc05a96 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -200,6 +200,19 @@ struct vrdma_cmd_del_gid {
 	__u32 port_num;
 };
 
+struct vrdma_cmd_map_mr_sg {
+	__u32 mrn;
+	__u32 npages;
+	__u64 start;
+	__u64 length;
+
+	__u64 pages;	/* DMA address of the page-address array */
+};
+
+struct vrdma_rsp_map_mr_sg {
+	__u32 npages;
+};
+
 #define VRDMA_CTRL_OK	0
 #define VRDMA_CTRL_ERR	1
 
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index b4c16ddbb..738935e3d 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -12,6 +12,8 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_addr.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 #include "vrdma.h"
 #include "vrdma_dev.h"
@@ -21,6 +23,8 @@
 #include "vrdma_mmap.h"
 #include "vrdma_queue.h"
 
+#define VIRTIO_RDMA_PAGE_PER_TBL 512
+
 /**
  * cmd_str - String representation of virtio RDMA control commands
  *
@@ -1677,6 +1681,243 @@ static int vrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
 	return 0;
 }
 
+static void vrdma_get_fw_ver_str(struct ib_device *device, char *str)
+{
+	snprintf(str, IB_FW_VERSION_NAME_MAX, "%d.%d.%d", 1, 0, 0);
+}
+
+static enum rdma_link_layer vrdma_port_link_layer(struct ib_device *ibdev,
+						 u32 port)
+{
+	return IB_LINK_LAYER_ETHERNET;
+}
+
+/**
+ * vrdma_set_page - Callback to collect mapped page addresses from an SG list
+ * @ibmr:	Memory region being mapped
+ * @addr:	DMA address of the current page
+ *
+ * This function is called by ib_sg_to_pages() for each page in the SG list.
+ * It stores the address in a two-level page table:
+ *   - Level 1: Array of pointers to L2 tables (512 entries each)
+ *   - Level 2: Each holds up to 512 page addresses
+ *
+ * The layout allows efficient DMA mapping of large MRs without allocating one huge array.
+ *
+ * Context: Called from ib_sg_to_pages(); does not sleep.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if number of pages exceeds pre-allocated limit
+ */
+static int vrdma_set_page(struct ib_mr *ibmr, u64 addr)
+{
+	struct vrdma_mr *mr = to_vmr(ibmr);
+
+	if (mr->npages >= mr->max_pages) {
+		pr_debug("vRDMA: too many pages for MR (max=%u)\n", mr->max_pages);
+		return -ENOMEM;
+	}
+
+	/* Two-level indexing: [L1 index][L2 offset] */
+	mr->pages_k[mr->npages / VIRTIO_RDMA_PAGE_PER_TBL]
+		   [mr->npages % VIRTIO_RDMA_PAGE_PER_TBL] = addr;
+	mr->npages++;
+	return 0;
+}
+
+/**
+ * vrdma_map_mr_sg - Map scatter-gather list into MR's page table and notify device
+ * @ibmr:	The memory region to map
+ * @sg:		Scatterlist describing user/kernel memory chunks
+ * @sg_nents:	Number of entries in sg
+ * @sg_offset:	Offset into the first sg element; updated by ib_sg_to_pages()
+ *
+ * This function:
+ *   1. Walks the SG list via ib_sg_to_pages()
+ *   2. Populates software page table using vrdma_set_page()
+ *   3. Sends VIRTIO_RDMA_CMD_MAP_MR_SG to inform backend about IOVA range and page list
+ *
+ * Note: The actual DMA mapping was already done during ib_umem_get() or get_dma_mr().
+ *       This only sets up hardware-visible metadata.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * Number of successfully mapped sg entries (>0)
+ * * Negative errno on failure
+ */
+static int vrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
+		    int sg_nents, unsigned int *sg_offset)
+{
+	struct vrdma_dev *vdev = to_vdev(ibmr->device);
+	struct vrdma_mr *mr = to_vmr(ibmr);
+	struct vrdma_cmd_map_mr_sg *cmd;
+	struct vrdma_rsp_map_mr_sg *rsp;
+	struct scatterlist in, out;
+	int mapped;
+	int rc;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		kfree(cmd);
+		return -ENOMEM;
+	}
+
+	/* Reset page counter before traversal */
+	mr->npages = 0;
+
+	/* Use RDMA core helper to walk SG and call vrdma_set_page() per page */
+	mapped = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, vrdma_set_page);
+	if (mapped < 0) {
+		dev_err(&vdev->vdev->dev, "Failed to map SG to pages: %d\n", mapped);
+		kfree(rsp);
+		kfree(cmd);
+		return mapped;
+	}
+
+	/* Prepare command for device notification */
+	cmd->mrn = mr->mr_handle;
+	cmd->start = ibmr->iova;
+	cmd->length = ibmr->length;
+	cmd->npages = mr->npages;
+	cmd->pages = mr->dma_pages; /* Pre-DMA-mapped array of page addrs */
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+	sg_init_one(&out, rsp, sizeof(*rsp));
+
+	/* Notify the backend about the new IOVA range and page list */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_MAP_MR_SG, &in, &out);
+	if (rc) {
+		dev_err(&vdev->vdev->dev,
+			"VIRTIO_RDMA_CMD_MAP_MR_SG failed for mrn=0x%x, err=%d\n",
+			mr->mr_handle, rc);
+		mapped = -EIO;
+	}
+
+	kfree(rsp);
+	kfree(cmd);
+	return mapped;
+}
+
+/**
+ * vrdma_mmap - Map device memory (vring, ubuf, doorbell) into user space
+ * @ctx:	User's RDMA context
+ * @vma:	VMA describing the mapping request
+ *
+ * Maps memory regions associated with QP/CQ virtqueues into user space.
+ * Supports three components:
+ *   - vring descriptors (shared ring buffer)
+ *   - user buffer (optional data exchange area)
+ *   - fast doorbell page (if enabled)
+ *
+ * Uses PFN-based remapping for normal memory and I/O remapping for doorbells.
+ *
+ * Context: Called during mmap() in process context.
+ * Return:
+ * * 0 on success
+ * * -EINVAL for invalid parameters or layout mismatch
+ * * Negative errno if remapping fails
+ */
+int vrdma_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
+{
+	struct vrdma_ucontext *uctx = to_vucontext(ctx);
+	size_t requested_size = vma->vm_end - vma->vm_start;
+	struct rdma_user_mmap_entry *rdma_entry;
+	struct vrdma_user_mmap_entry *entry;
+	int rc;
+
+	/* Must be page-aligned */
+	if (vma->vm_start & (PAGE_SIZE - 1)) {
+		pr_warn("vRDMA: mmap start not page aligned: %#lx\n", vma->vm_start);
+		return -EINVAL;
+	}
+
+	/* Look up the registered mmap entry */
+	rdma_entry = rdma_user_mmap_entry_get(&uctx->ibucontext, vma);
+	if (!rdma_entry) {
+		pr_err("vRDMA: mmap lookup failed: pgoff=%lu size=%zu\n",
+		       vma->vm_pgoff, requested_size);
+		return -EINVAL;
+	}
+	entry = to_ventry(rdma_entry);
+
+	switch (entry->mmap_type) {
+	case VRDMA_MMAP_CQ:
+	case VRDMA_MMAP_QP:
+	{
+		unsigned long vq_size = PAGE_ALIGN(vring_size(virtqueue_get_vring_size(entry->vq),
+							      SMP_CACHE_BYTES));
+		unsigned long total_size = vq_size + entry->ubuf_size;
+
+		if (uctx->dev->fast_doorbell && entry->mmap_type == VRDMA_MMAP_QP)
+			total_size += PAGE_SIZE;
+
+		if (requested_size != total_size) {
+			pr_warn("vRDMA: mmap size mismatch: got=%zu, expected=%lu\n",
+				requested_size, total_size);
+			rc = -EINVAL;
+			goto out_put;
+		}
+
+		/* Map vring descriptor table */
+		rc = remap_pfn_range(vma, vma->vm_start,
+				     page_to_pfn(virt_to_page(virtqueue_get_vring(entry->vq)->desc)),
+				     vq_size, vma->vm_page_prot);
+		if (rc) {
+			pr_warn("vRDMA: remap vring failed: %d\n", rc);
+			goto out_put;
+		}
+
+		/* Map user buffer (shared data region) */
+		rc = remap_pfn_range(vma, vma->vm_start + vq_size,
+				     page_to_pfn(virt_to_page(entry->user_buf)),
+				     entry->ubuf_size, vma->vm_page_prot);
+		if (rc) {
+			pr_warn("vRDMA: remap ubuf failed: %d\n", rc);
+			goto out_put;
+		}
+
+		/* Optionally map fast doorbell register (QP only) */
+		if (uctx->dev->fast_doorbell && entry->mmap_type == VRDMA_MMAP_QP) {
+			unsigned long db_addr = vma->vm_start + vq_size + entry->ubuf_size;
+			struct virtqueue *vq = entry->vq;
+
+			rc = io_remap_pfn_range(vma, db_addr,
+						vmalloc_to_pfn(vq->priv),
+						PAGE_SIZE, vma->vm_page_prot);
+			if (rc) {
+				pr_warn("vRDMA: remap doorbell failed: %d\n", rc);
+				goto out_put;
+			}
+		}
+
+		break;
+	}
+	default:
+		pr_err("vRDMA: invalid mmap type %d\n", entry->mmap_type);
+		rc = -EINVAL;
+		goto out_put;
+	}
+
+	/* Success */
+	rdma_user_mmap_entry_put(rdma_entry);
+	return 0;
+
+out_put:
+	rdma_user_mmap_entry_put(rdma_entry);
+	return rc;
+}
+
+void vrdma_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
+{
+	struct vrdma_user_mmap_entry *entry = to_ventry(rdma_entry);
+
+	kfree(entry);
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
 	.owner = THIS_MODULE,
 	.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -1701,6 +1942,11 @@ static const struct ib_device_ops vrdma_dev_ops = {
 	.dealloc_ucontext = vrdma_dealloc_ucontext,
 	.create_ah = vrdma_create_ah,
 	.destroy_ah = vrdma_destroy_ah,
+	.get_dev_fw_str = vrdma_get_fw_ver_str,
+	.get_link_layer = vrdma_port_link_layer,
+	.map_mr_sg = vrdma_map_mr_sg,
+	.mmap = vrdma_mmap,
+	.mmap_free = vrdma_mmap_free,
 };
 
 /**
-- 
2.43.0
