Message-ID: <20251218091050.55047-5-15927021679@163.com>
Date: Thu, 18 Dec 2025 17:09:44 +0800
From: Xiong Weimin <15927021679@....com>
To: "Michael S . Tsirkin" <mst@...hat.com>,
	David Hildenbrand <david@...hat.com>,
	Jason Wang <jasowang@...hat.com>,
	Stefano Garzarella <sgarzare@...hat.com>,
	Thomas Monjalon <thomas@...jalon.net>,
	David Marchand <david.marchand@...hat.com>,
	Luca Boccassi <bluca@...ian.org>,
	Kevin Traynor <ktraynor@...hat.com>,
	Christian Ehrhardt <christian.ehrhardt@...onical.com>,
	Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
	Eugenio Pérez <eperezma@...hat.com>,
	Xueming Li <xuemingl@...dia.com>,
	Maxime Coquelin <maxime.coquelin@...hat.com>,
	Chenbo Xia <chenbox@...dia.com>,
	Bruce Richardson <bruce.richardson@...el.com>
Cc: kvm@...r.kernel.org,
	virtualization@...ts.linux.dev,
	netdev@...r.kernel.org,
	xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 04/10] drivers/infiniband/hw/virtio: Implement MR, GID, ucontext and AH resource management verbs

From: xiongweimin <xiongweimin@...inos.cn>

Add foundational resource management capabilities to the vhost-user
RDMA driver, enabling full RDMA operation:

1. Memory Region (MR) Management:
   - DMA MR registration via GET_DMA_MR
   - Two-level page table for large scatter-gather lists (see the
     sizing sketch below)
   - CREATE_MR/DEREG_MR backend command flow
   - Atomic command execution via the virtqueue

2. Global Identifier (GID) Management:
   - ADD_GID/DEL_GID backend commands
   - RoCE v1/v2 GID type support
   - Port-based GID table operations

3. User Context (ucontext) Support:
   - Allocation and deallocation hooks
   - Device association for future PD/CQ/MR management

4. Address Handle (AH) Management:
   - RoCE-specific AH creation/validation
   - Unicast GRH enforcement
   - Device-wide AH limit tracking

Key technical features:
- MRs support both DMA-direct and user-backed registrations
- Page-table optimized for large scatter-lists
- GID operations integrate with RDMA core notifications
- AHs store full address vectors for packet construction
- Resource limits enforced via atomic counters
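
For reference, the two-level page-table sizing used by the MR helpers
reduces to the following sketch (assuming 4 KiB pages and 8-byte DMA
addresses; 'nentries' and 'nl2' are illustrative local names, not part
of the patch):

  /* Entries per L2 table: one page worth of u64 DMA addresses */
  nentries = PAGE_SIZE / sizeof(u64);      /* 512 with 4 KiB pages */

  /* L2 tables needed to map max_num_sg pages; a single DMA-coherent
   * L1 page holds their DMA addresses and is handed to the device */
  nl2 = DIV_ROUND_UP(max_num_sg, nentries);

  /* Example: max_num_sg = 1024 -> nl2 = 2 */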

Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
 .../infiniband/hw/virtio/vrdma_dev_api.h      |  40 ++
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 600 ++++++++++++++++++
 .../drivers/infiniband/hw/virtio/vrdma_ib.h   |  80 +++
 3 files changed, 720 insertions(+)

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index d1db1bea4..da99f1f32 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -160,6 +160,46 @@ struct vrdma_cmd_destroy_qp {
     __u32 qpn;
 };
 
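+/*
+ * Command/response layouts for the MR and GID management verbs,
+ * exchanged with the backend device.
+ */
+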
+struct vrdma_cmd_get_dma_mr {
+	__u32 pdn;
+	__u32 access_flags;
+};
+
+struct vrdma_rsp_get_dma_mr {
+	__u32 mrn;
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct vrdma_cmd_create_mr {
+	__u32 pdn;
+	__u32 access_flags;
+
+	__u32 max_num_sg;
+};
+
+struct vrdma_rsp_create_mr {
+	__u32 mrn;
+	__u32 lkey;
+	__u32 rkey;
+};
+
+struct vrdma_cmd_dereg_mr {
+	__u32 mrn;
+};
+
+struct vrdma_cmd_add_gid {
+	__u8 gid[16];
+	__u32 gid_type;
+	__u16 index;
+	__u32 port_num;
+};
+
+struct vrdma_cmd_del_gid {
+	__u16 index;
+	__u32 port_num;
+};
+
 #define VRDMA_CTRL_OK	0
 #define VRDMA_CTRL_ERR	1
 
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index f1f53314f..b4c16ddbb 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -1086,6 +1086,597 @@ static int vrdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 	return rc;
 }
 
+/**
+ * vrdma_get_dma_mr - Get a DMA memory region (direct-access MR, no user backing)
+ * @pd:		Protection Domain to associate this MR with
+ * @flags:	Access permissions (IB_ACCESS_LOCAL_WRITE, IB_ACCESS_REMOTE_READ, etc.)
+ *
+ * This function creates a special type of Memory Region (MR) that refers to
+ * physically contiguous or scatter-gather DMA-capable memory, typically used
+ * for zero-copy or kernel-space registrations without user buffer backing.
+ *
+ * It issues the VIRTIO_RDMA_CMD_GET_DMA_MR command to the backend device,
+ * which returns:
+ *   - An MR handle (mrn)
+ *   - Local Key (lkey)
+ *   - Remote Key (rkey)
+ *
+ * Unlike regular MRs created through the reg_user_mr verb, this MR does not
+ * back any user-space virtual memory (i.e., no ib_umem). It is typically used
+ * for device-specific buffers, scratch memory, or control structures.
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * &mr->ibmr on success
+ * * ERR_PTR(-ENOMEM) if memory allocation fails
+ * * ERR_PTR(-EIO) if device communication fails
+ */
+static struct ib_mr *vrdma_get_dma_mr(struct ib_pd *pd, int flags)
+{
+	struct vrdma_dev *vdev = to_vdev(pd->device);
+	struct vrdma_mr *mr;
+	struct vrdma_cmd_get_dma_mr *cmd;
+	struct vrdma_rsp_get_dma_mr *rsp;
+	struct scatterlist in, out;
+	int rc;
+
+	/* Allocate software MR structure */
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd) {
+		rc = -ENOMEM;
+		goto err_free_mr;
+	}
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		rc = -ENOMEM;
+		goto err_free_cmd;
+	}
+
+	/* Prepare command parameters */
+	cmd->pdn = to_vpd(pd)->pd_handle;
+	cmd->access_flags = flags;
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+	sg_init_one(&out, rsp, sizeof(*rsp));
+
+	/* Send GET_DMA_MR command to device */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_GET_DMA_MR, &in, &out);
+	if (rc) {
+		dev_err(&vdev->vdev->dev,
+			"GET_DMA_MR command failed: %d\n", rc);
+		goto err_free_rsp;
+	}
+
+	/* Initialize MR fields from response */
+	mr->mr_handle = rsp->mrn;
+	mr->ibmr.lkey = rsp->lkey;
+	mr->ibmr.rkey = rsp->rkey;
+	mr->ibmr.pd = pd;
+	mr->ibmr.device = pd->device;
+	mr->ibmr.type = IB_MR_TYPE_DMA; /* DMA MR: direct access, no page list */
+
+	/* No backing user memory */
+	mr->umem = NULL;
+	mr->iova = 0;
+	mr->size = 0;
+	mr->pages = NULL;
+	mr->pages_k = NULL;
+	mr->dma_pages = 0;
+	mr->npages = 0;
+	mr->max_pages = 0;
+
+	/* Cleanup command/response buffers */
+	kfree(cmd);
+	kfree(rsp);
+
+	return &mr->ibmr;
+
+err_free_rsp:
+	kfree(rsp);
+
+err_free_cmd:
+	kfree(cmd);
+
+err_free_mr:
+	kfree(mr);
+	return ERR_PTR(rc);
+}
+
+/**
+ * vrdma_init_page_tbl - Initialize a two-level page table for MR management
+ * @dev:		vRDMA device pointer
+ * @npages:		Maximum number of data pages this table can map
+ * @pages_dma:		Output: L1 table with entries pointing to DMA addresses of L2 tables
+ * @dma_pages_p:	Output: DMA address of the L1 table itself
+ *
+ * This function sets up a two-level page table structure used in Memory Region (MR)
+ * registration to support scatter-gather I/O. The layout is:
+ *
+ *   L1 (Level 1): Single page, DMA-coherent, holds pointers to L2 tables.
+ *                 Will be passed to hardware via WQE or command.
+ *
+ *   L2 (Level 2): Array of pages, each holding up to 512 x 8-byte DMA addresses
+ *                 (for 4KB page size). Each L2 table maps part of the S/G list.
+ *
+ * Example:
+ *   npages = 1024  => needs 1024 / 512 = 2 L2 tables
+ *
+ * Return:
+ *   Pointer to kernel virtual address of L1 table (pages_k), which stores
+ *   virtual addresses of L2 tables for cleanup.
+ *   On failure, returns NULL and cleans up all allocated memory.
+ */
+static uint64_t **vrdma_init_page_tbl(struct vrdma_dev *dev,
+				      unsigned int npages,
+				      uint64_t ***pages_dma,
+				      dma_addr_t *dma_pages_p)
+{
+	unsigned int nl2 = DIV_ROUND_UP(npages, 512); /* 512 u64 entries per 4 KiB L2 page */
+	uint64_t **l1_table;           /* L1: stores DMA addrs of L2s (device-readable) */
+	uint64_t **l1_table_k;         /* L1: stores kernel vaddrs of L2s (for free) */
+	dma_addr_t l1_dma_addr;
+	dma_addr_t l2_dma_addr;
+	int i;
+
+	/* Allocate L1 table: must be DMA-coherent because device reads it */
+	l1_table = dma_alloc_coherent(dev->vdev->dev.parent, PAGE_SIZE, &l1_dma_addr, GFP_KERNEL);
+	if (!l1_table)
+		return NULL;
+
+	/* Allocate kernel-space array to track L2 virtual addresses */
+	l1_table_k = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!l1_table_k)
+		goto err_free_l1_table;
+
+	/* Allocate each L2 table (DMA-coherent, one per 512 entries) */
+	for (i = 0; i < nl2; i++) {
+		l1_table_k[i] = dma_alloc_coherent(dev->vdev->dev.parent, PAGE_SIZE, &l2_dma_addr, GFP_KERNEL);
+		if (!l1_table_k[i])
+			goto err_free_l2_tables;
+
+		l1_table[i] = (uint64_t *)l2_dma_addr; /* Device sees DMA address */
+	}
+
+	/* Output parameters */
+	*pages_dma = l1_table;        /* Device-visible L1 (with DMA pointers) */
+	*dma_pages_p = l1_dma_addr;   /* DMA address of L1 table */
+
+	return l1_table_k; /* Return kernel view for later cleanup */
+
+err_free_l2_tables:
+	/* Roll back any successfully allocated L2 tables */
+	while (--i >= 0) {
+		dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, l1_table_k[i], (dma_addr_t)l1_table[i]);
+	}
+	kfree(l1_table_k);
+
+err_free_l1_table:
+	dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, l1_table, l1_dma_addr);
+
+	return NULL;
+}
+
+/**
+ * vrdma_free_page_tbl - Free a two-level page table
+ * @dev:		vRDMA device
+ * @pages_k:		Return value from vrdma_init_page_tbl (kernel L2 pointers)
+ * @pages:		L1 table with DMA addresses (output of pages_dma)
+ * @dma_pages:		DMA address of L1 table
+ * @npages:		Number of pages that were to be supported
+ *
+ * Frees both L1 and all L2 page tables allocated by vrdma_init_page_tbl.
+ */
+static void vrdma_free_page_tbl(struct vrdma_dev *dev,
+				uint64_t **pages_k,
+				uint64_t **pages,
+				dma_addr_t dma_pages,
+				unsigned int npages)
+{
+	unsigned int nl2 = DIV_ROUND_UP(npages, 512);
+	int i;
+
+	if (!pages_k || !pages)
+		return;
+
+	/* Free all L2 tables; pages[i] holds the DMA address saved at init time */
+	for (i = 0; i < nl2; i++) {
+		if (pages_k[i])
+			dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE,
+					  pages_k[i], (dma_addr_t)pages[i]);
+	}
+
+	/* Free L1 tracking array */
+	kfree(pages_k);
+
+	/* Free L1 DMA table */
+	dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, pages, dma_pages);
+}
+
+/**
+ * vrdma_alloc_mr - Allocate a multi-segment Memory Region (MR) with page tables
+ * @pd:		Protection Domain to associate the MR with
+ * @mr_type:	Type of MR (must be IB_MR_TYPE_MEM_REG)
+ * @max_num_sg:	Maximum number of scatter/gather entries supported by this MR
+ *
+ * This function allocates a software MR structure and reserves a hardware MR
+ * context on the backend vRDMA device. It prepares a two-level page table
+ * (L1/L2) to support up to @max_num_sg pages, which will later be filled
+ * when the MR is mapped (e.g., via ib_map_mr_sg()).
+ *
+ * The allocated MR is not yet backed by any actual memory - it serves as a
+ * container for future page population, as used by fast-register (FRWR)
+ * work requests.
+ *
+ * Command flow:
+ *   - Sends VIRTIO_RDMA_CMD_CREATE_MR to device
+ *   - Receives mr_handle, lkey, rkey from response
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * &mr->ibmr on success
+ * * ERR_PTR(-EINVAL) if unsupported MR type
+ * * ERR_PTR(-ENOMEM) if memory allocation fails
+ * * ERR_PTR(-EIO) if device command fails
+ */
+static struct ib_mr *vrdma_alloc_mr(struct ib_pd *pd,
+				    enum ib_mr_type mr_type,
+				    u32 max_num_sg)
+{
+	struct vrdma_dev *vdev = to_vdev(pd->device);
+	struct vrdma_mr *mr;
+	struct vrdma_cmd_create_mr *cmd;
+	struct vrdma_rsp_create_mr *rsp;
+	struct scatterlist in, out;
+	int rc;
+
+	/* Only support standard memory registration */
+	if (mr_type != IB_MR_TYPE_MEM_REG)
+		return ERR_PTR(-EINVAL);
+
+	/* Allocate software MR structure */
+	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+	if (!mr)
+		return ERR_PTR(-ENOMEM);
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd) {
+		rc = -ENOMEM;
+		goto err_free_mr;
+	}
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		rc = -ENOMEM;
+		goto err_free_cmd;
+	}
+
+	/*
+	 * Allocate two-level page table for S/G support.
+	 * Each L2 table holds PAGE_SIZE / sizeof(u64) entries.
+	 * L1 table points to multiple L2s.
+	 */
+	mr->pages_k = vrdma_init_page_tbl(vdev, max_num_sg,
+						&mr->pages, &mr->dma_pages);
+	if (!mr->pages_k) {
+		dev_err(&vdev->vdev->dev,
+			"Failed to allocate page table for %u S/G entries\n",
+			max_num_sg);
+		rc = -ENOMEM;
+		goto err_free_rsp;
+	}
+
+	mr->max_pages = max_num_sg;
+	mr->npages = 0;
+	mr->umem = NULL; /* No user memory backing at this stage */
+	mr->iova = 0;
+	mr->size = 0;
+
+	/* Prepare command */
+	cmd->pdn = to_vpd(pd)->pd_handle;
+	cmd->max_num_sg = max_num_sg;
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+	sg_init_one(&out, rsp, sizeof(*rsp));
+
+	/* Send CREATE_MR command to backend device */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_CREATE_MR, &in, &out);
+	if (rc) {
+		dev_err(&vdev->vdev->dev, "CREATE_MR failed: %d\n", rc);
+		goto err_free_page_tbl;
+	}
+
+	/* Initialize MR metadata from response */
+	mr->mr_handle = rsp->mrn;
+	mr->ibmr.lkey = rsp->lkey;
+	mr->ibmr.rkey = rsp->rkey;
+	mr->ibmr.pd = pd;
+	mr->ibmr.device = &vdev->ib_dev;
+	mr->ibmr.type = IB_MR_TYPE_MEM_REG;
+
+	/* Clean up command/response buffers */
+	kfree(cmd);
+	kfree(rsp);
+
+	return &mr->ibmr;
+
+err_free_page_tbl:
+	vrdma_free_page_tbl(vdev, mr->pages_k, mr->pages, mr->dma_pages,
+				  max_num_sg);
+err_free_rsp:
+	kfree(rsp);
+err_free_cmd:
+	kfree(cmd);
+err_free_mr:
+	kfree(mr);
+	return ERR_PTR(rc);
+}
+
+/**
+ * vrdma_dereg_mr - Deregister and destroy a Memory Region (MR)
+ * @ibmr:	The IB memory region to deregister
+ * @udata:	User data (optional, for user-space MRs)
+ *
+ * This function unregisters a previously allocated MR from the vRDMA device.
+ * It performs the following steps:
+ *   1. Sends VIRTIO_RDMA_CMD_DEREG_MR command to the backend device
+ *   2. Frees software page tables (L1/L2) used for scatter-gather mapping
+ *   3. Releases user memory (if any) via ib_umem_release()
+ *   4. Frees local metadata (struct vrdma_mr)
+ *
+ * Context: Can be called in process context. May sleep.
+ * Return:
+ * * 0 on success
+ * * -EIO if device communication fails
+ * * Other negative errno codes on allocation failure (rare during dereg)
+ */
+static int vrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+	struct vrdma_dev *vdev = to_vdev(ibmr->device);
+	struct vrdma_mr *mr = to_vmr(ibmr);
+	struct vrdma_cmd_dereg_mr *cmd;
+	struct scatterlist in;
+	int rc;
+
+	/* Allocate command buffer */
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	/* Prepare command */
+	cmd->mrn = mr->mr_handle;
+	sg_init_one(&in, cmd, sizeof(*cmd));
+
+	/* Notify hardware to release MR context */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_DEREG_MR, &in, NULL);
+	if (rc) {
+		dev_err(&vdev->vdev->dev,
+			"VIRTIO_RDMA_CMD_DEREG_MR failed for mrn=0x%x, err=%d\n",
+			mr->mr_handle, rc);
+		rc = -EIO;
+		goto out_free_cmd;
+	}
+
+	/* Free two-level page table used for S/G entries */
+	vrdma_free_page_tbl(vdev, mr->pages_k, mr->pages, mr->dma_pages, mr->max_pages);
+
+	/* Release user memory if present */
+	if (mr->umem)
+		ib_umem_release(mr->umem);
+
+	/* Success */
+	kfree(cmd);
+	return 0;
+
+out_free_cmd:
+	kfree(cmd);
+	return rc;
+}
+
+/**
+ * vrdma_add_gid - Add a GID (Global Identifier) entry to the hardware
+ * @attr:	GID attribute containing port, index, GID value, and GID type
+ * @context:	Pointer to store driver-specific context (unused in vRDMA)
+ *
+ * This callback is invoked by the RDMA core when a GID table entry is added,
+ * typically when an IP address is configured on the associated net device.
+ *
+ * The function sends VIRTIO_RDMA_CMD_ADD_GID to the backend device to register
+ * the GID at the specified index and port. This allows the device to use this
+ * GID for RoCE traffic (e.g., as source in GRH).
+ *
+ * Note: The @context parameter is unused in vRDMA drivers since no additional
+ * per-GID software state is maintained.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if kmalloc fails
+ * * -EIO if device command fails
+ */
+static int vrdma_add_gid(const struct ib_gid_attr *attr, void **context)
+{
+	struct vrdma_dev *vdev = to_vdev(attr->device);
+	struct vrdma_cmd_add_gid *cmd;
+	struct scatterlist in;
+	int rc;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	/* Fill command parameters */
+	memcpy(cmd->gid, attr->gid.raw, sizeof(cmd->gid));
+	cmd->index = attr->index;
+	cmd->port_num = attr->port_num;
+	cmd->gid_type = attr->gid_type; /* e.g., IB_GID_TYPE_ROCE or IB_GID_TYPE_ROCE_UDP_ENCAP */
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+
+	/* Send command to backend */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_ADD_GID, &in, NULL);
+	if (rc)
+		dev_err(&vdev->vdev->dev,
+			"ADD_GID failed: port=%u index=%u type=%d, err=%d\n",
+			attr->port_num, attr->index, attr->gid_type, rc);
+
+	kfree(cmd);
+	return rc ? -EIO : 0;
+}
+
+/**
+ * vrdma_del_gid - Remove a GID entry from the hardware
+ * @attr:	GID attribute specifying which GID to delete (by index/port)
+ * @context:	Driver-specific context (passed from add_gid; unused here)
+ *
+ * This callback is called when a GID is removed from the GID table.
+ * It notifies the backend device to invalidate the GID mapping at the given index.
+ *
+ * The @context pointer is ignored because vRDMA does not maintain per-GID software state.
+ *
+ * Context: Can sleep (process context).
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if allocation fails
+ * * -EIO if device command fails
+ */
+static int vrdma_del_gid(const struct ib_gid_attr *attr, void **context)
+{
+	struct vrdma_dev *vdev = to_vdev(attr->device);
+	struct vrdma_cmd_del_gid *cmd;
+	struct scatterlist in;
+	int rc;
+
+	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+	if (!cmd)
+		return -ENOMEM;
+
+	/* Only index and port are needed to identify the GID */
+	cmd->index = attr->index;
+	cmd->port_num = attr->port_num;
+
+	sg_init_one(&in, cmd, sizeof(*cmd));
+
+	/* Send command to backend */
+	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_DEL_GID, &in, NULL);
+	if (rc)
+		dev_err(&vdev->vdev->dev,
+			"DEL_GID failed: port=%u index=%u, err=%d\n",
+			attr->port_num, attr->index, rc);
+
+	kfree(cmd);
+	return rc ? -EIO : 0;
+}
+
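+/**
+ * vrdma_alloc_ucontext - Allocate a user context for the vRDMA device
+ * @uctx:	Core-allocated user context to initialize
+ * @udata:	User data (unused)
+ *
+ * Associates the new context with its owning device so that PD/CQ/MR
+ * verbs issued on this context can reach the device later. No backend
+ * command is required.
+ *
+ * Return: Always 0.
+ */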
+static int vrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
+{
+	struct vrdma_ucontext *vuc = to_vucontext(uctx);
+
+	vuc->dev = to_vdev(uctx->device);
+
+	return 0;
+}
+
+static void vrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
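+	/* No per-context resources to release beyond the core-owned object */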
+}
+
+/**
+ * vrdma_create_ah - Create an Address Handle (AH) for RoCE communication
+ * @ibah:	IB address handle to initialize
+ * @init_attr:	AH initialization attributes
+ * @udata:	User data (unused in vRDMA)
+ *
+ * This function creates a software-only Address Handle (AH) describing the
+ * remote destination used when posting UD sends. No backend command is
+ * issued; the AH context is stored locally in struct vrdma_ah for later
+ * use during packet construction.
+ *
+ * The AH must:
+ *   - Be RoCE type
+ *   - Contain GRH (Global Routing Header)
+ *   - Not be multicast (currently unsupported)
+ *
+ * Also enforces device limit on maximum number of active AHs via atomic counter.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * 0 on success
+ * * -EINVAL if attributes are invalid
+ * * -ENOMEM if AH limit exceeded
+ */
+static int vrdma_create_ah(struct ib_ah *ibah,
+			   struct rdma_ah_init_attr *init_attr,
+			   struct ib_udata *udata)
+{
+	struct vrdma_dev *vdev = to_vdev(ibah->device);
+	struct vrdma_ah *ah = to_vah(ibah);
+	const struct ib_global_route *grh;
+	u32 port_num = rdma_ah_get_port_num(init_attr->ah_attr);
+
+	/* Must have GRH enabled */
+	if (!(rdma_ah_get_ah_flags(init_attr->ah_attr) & IB_AH_GRH))
+		return -EINVAL;
+
+	grh = rdma_ah_read_grh(init_attr->ah_attr);
+
+	/* Only RoCE-type address handles with a unicast destination are supported */
+	if (init_attr->ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE)
+		return -EINVAL;
+
+	if (rdma_is_multicast_addr((struct in6_addr *)grh->dgid.raw)) {
+		dev_dbg(&vdev->vdev->dev, "Multicast GID not supported in AH\n");
+		return -EINVAL;
+	}
+
+	/* Enforce the device max_ah limit: only increment if below the cap */
+	if (!atomic_add_unless(&vdev->num_ah, 1, vdev->ib_dev.attrs.max_ah)) {
+		dev_dbg(&vdev->vdev->dev, "Exceeded max number of AHs (%u)\n",
+			vdev->ib_dev.attrs.max_ah);
+		return -ENOMEM;
+	}
+
+	/* Initialize AV (Address Vector) with relevant fields */
+	ah->av.port = port_num;
+	ah->av.pdn = to_vpd(ibah->pd)->pd_handle;         /* Protection Domain Number */
+	ah->av.gid_index = grh->sgid_index;               /* Source GID table index */
+	ah->av.hop_limit = grh->hop_limit;
+	ah->av.sl_tclass_flowlabel = (u32)(grh->traffic_class << 20) |
+				      (grh->flow_label & 0xfffff); /* tclass bits 27:20, flow label bits 19:0 */
+
+	memcpy(ah->av.dgid, grh->dgid.raw, sizeof(ah->av.dgid));        /* 128-bit Dest GID */
+	memcpy(ah->av.dmac, init_attr->ah_attr->roce.dmac, ETH_ALEN);    /* Next-hop MAC */
+
+	return 0;
+}
+
+/**
+ * vrdma_destroy_ah - Destroy an Address Handle
+ * @ibah:	The IB address handle to destroy
+ * @flags:	Destroy flags (e.g., for deferred cleanup; unused here)
+ *
+ * This callback releases the software state associated with an AH.
+ * It decrements the per-device AH counter to allow new AH creation.
+ *
+ * No hardware interaction is needed since AHs are purely software constructs
+ * in this virtio-rdma implementation.
+ *
+ * Context: Performs no sleeping operations; safe from atomic context when
+ * @flags lacks RDMA_DESTROY_AH_SLEEPABLE.
+ * Return: Always returns 0 (success).
+ */
+static int vrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
+{
+	struct vrdma_dev *vdev = to_vdev(ibah->device);
+
+	atomic_dec(&vdev->num_ah);
+
+	return 0;
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
 	.owner = THIS_MODULE,
 	.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -1101,6 +1692,15 @@ static const struct ib_device_ops vrdma_dev_ops = {
 	.dealloc_pd = vrdma_dealloc_pd,
 	.create_qp = vrdma_create_qp,
 	.destroy_qp = vrdma_destroy_qp,
+	.get_dma_mr = vrdma_get_dma_mr,
+	.alloc_mr = vrdma_alloc_mr,
+	.dereg_mr = vrdma_dereg_mr,
+	.add_gid = vrdma_add_gid,
+	.del_gid = vrdma_del_gid,
+	.alloc_ucontext = vrdma_alloc_ucontext,
+	.dealloc_ucontext = vrdma_dealloc_ucontext,
+	.create_ah = vrdma_create_ah,
+	.destroy_ah = vrdma_destroy_ah,
 };
 
 /**
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
index ba88599c8..6759c4349 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
@@ -11,6 +11,8 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/vrdma_abi.h>
 
+#include "vrdma_abi.h"
+
 #define VRDMA_COMM_TIMEOUT 1000000
 
 enum vrdma_type {
@@ -130,6 +132,11 @@ struct vrdma_ucontext {
 	struct vrdma_dev *dev;
 };
 
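+/**
+ * struct vrdma_ah - Software state of a Virtio-RDMA Address Handle (AH)
+ * @ibah:	InfiniBand core AH object
+ * @av:		Full address vector (port, PDN, GID index, DGID, DMAC) kept
+ *		for packet construction; no device command is involved
+ */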
+struct vrdma_ah {
+	struct ib_ah ibah;
+	struct vrdma_av av;
+};
+
 /**
  * struct vrdma_qp - Virtual RDMA Queue Pair (QP) private data
  *
@@ -166,6 +173,64 @@ struct vrdma_qp {
 	struct vrdma_user_mmap_entry *rq_entry; /* Mmap entry for RQ buffer */
 };
 
+/**
+ * struct vrdma_mr - Software state of a Virtio-RDMA Memory Region (MR)
+ * @ibmr:		InfiniBand core MR object (contains rkey, lkey, etc.)
+ * @umem:		User memory descriptor from ib_umem_get(), holds
+ *			page list and reference to user VMA
+ * @mr_handle:		Handle returned by backend device for this MR
+ * @iova:		I/O virtual address (start of the mapped region)
+ * @size:		Total size of the memory region in bytes
+ * @pages:		Level 1 (L1) page table - kernel mapping of the
+ *			DMA-coherent L1 page; each entry holds the DMA address
+ *			of one level 2 (L2) page table. The device walks this
+ *			layout for scatter-gather access.
+ * @pages_k:		Array of kernel virtual addresses of L2 page tables.
+ *			Used to free memory correctly during cleanup.
+ * @dma_pages:		DMA address of the L1 page table (first-level table),
+ *			to be passed to the device or written in command WQE.
+ * @npages:		Number of valid pages in the memory region
+ * @max_pages:		Maximum number of pages the current page-table
+ *			allocation can hold (set from max_num_sg at alloc time)
+ *
+ * This structure represents a registered memory region in the vRDMA driver.
+ * It supports large memory registrations using a two-level page table design:
+ *
+ *   L1 Page Table (contiguous DMA-mapped):
+ *     Contains pointers to multiple L2 tables (each L2 = one page).
+ *
+ *   L2 Page Tables:
+ *     Each stores up to N DMA addresses (physical page addresses).
+ *
+ * The layout allows efficient hardware access while keeping kernel allocations
+ * manageable for very large mappings (e.g., tens of GB).
+ *
+ * Example layout for 4K pages and 512 entries per L2 table:
+ *
+ *   L1 (dma_pages) -> [L2_0] -> [DMA_ADDR_A, ..., DMA_ADDR_Z]
+ *                     [L2_1] -> [DMA_ADDR_X, ..., DMA_ADDR_Y]
+ *                     ...
+ *
+ * Used during:
+ *   - MR allocation in vrdma_alloc_mr()
+ *   - SEND/WRITE/READ operations with remote access
+ *   - MR invalidation and cleanup in vrdma_dereg_mr()
+ */
+struct vrdma_mr {
+	struct ib_mr ibmr;
+	struct ib_umem *umem;
+
+	u32 mr_handle;
+	u64 iova;
+	u64 size;
+
+	u64 **pages;        /* L1: array of L2 table DMA address pointers */
+	u64 **pages_k;      /* L1: array of L2 table kernel virtual addresses */
+	dma_addr_t dma_pages; /* DMA address of the L1 table itself */
+
+	u32 npages;
+	u32 max_pages;
+};
+
 static inline struct vrdma_cq *to_vcq(struct ib_cq *ibcq)
 {
        return container_of(ibcq, struct vrdma_cq, ibcq);
@@ -181,6 +246,21 @@ static inline struct vrdma_qp *to_vqp(struct ib_qp *ibqp)
 	return container_of(ibqp, struct vrdma_qp, ibqp);
 }
 
+static inline struct vrdma_mr *to_vmr(struct ib_mr *ibmr)
+{
+	return container_of(ibmr, struct vrdma_mr, ibmr);
+}
+
+static inline struct vrdma_ucontext *to_vucontext(struct ib_ucontext *ibucontext)
+{
+	return container_of(ibucontext, struct vrdma_ucontext, ibucontext);
+}
+
+static inline struct vrdma_ah *to_vah(struct ib_ah *ibah)
+{
+	return container_of(ibah, struct vrdma_ah, ibah);
+}
+
 int vrdma_register_ib_device(struct vrdma_dev *vrdev);
 void vrdma_unregister_ib_device(struct vrdma_dev *vrdev);
 
-- 
2.43.0
