Message-ID: <20251218091050.55047-5-15927021679@163.com>
Date: Thu, 18 Dec 2025 17:09:44 +0800
From: Xiong Weimin <15927021679@....com>
To: "Michael S . Tsirkin" <mst@...hat.com>,
David Hildenbrand <david@...hat.com>,
Jason Wang <jasowang@...hat.com>,
Stefano Garzarella <sgarzare@...hat.com>,
Thomas Monjalon <thomas@...jalon.net>,
David Marchand <david.marchand@...hat.com>,
Luca Boccassi <bluca@...ian.org>,
Kevin Traynor <ktraynor@...hat.com>,
Christian Ehrhardt <christian.ehrhardt@...onical.com>,
Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
Eugenio Pérez <eperezma@...hat.com>,
Xueming Li <xuemingl@...dia.com>,
Maxime Coquelin <maxime.coquelin@...hat.com>,
Chenbo Xia <chenbox@...dia.com>,
Bruce Richardson <bruce.richardson@...el.com>
Cc: kvm@...r.kernel.org,
virtualization@...ts.linux.dev,
netdev@...r.kernel.org,
xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 04/10] drivers/infiniband/hw/virtio: Implement MR, GID, ucontext and AH resource management verbs
From: xiongweimin <xiongweimin@...inos.cn>
Add foundational resource management verbs to the vhost-user RDMA
driver, laying the groundwork for full RDMA operation:
1. Memory Region (MR) Management:
- DMA MR registration via GET_DMA_MR
- Two-level page table for large scatter-gather lists
- CREATE_MR/DEREG_MR backend command flow
- Atomic command execution with virtqueue
2. Global Identifier (GID) Management:
- ADD_GID/DEL_GID backend commands
- RoCE v1/v2 GID type support
- Port-based GID table operations
3. User Context (ucontext) Support:
- Allocation and deallocation hooks
- Device association for future PD/CQ/MR management
4. Address Handle (AH) Management:
- RoCE-specific AH creation/validation
- Unicast GRH enforcement
- Device-wide AH limit tracking
Key technical features:
- MRs support both DMA-direct and user-backed registrations
- Two-level page tables optimized for large scatter-gather lists
- GID operations integrate with RDMA core notifications
- AHs store full address vectors for packet construction
- Resource limits enforced via atomic counters
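
A rough usage sketch from a kernel consumer's point of view
(illustrative only; ib_alloc_mr() and ib_dereg_mr() are the standard
RDMA core entry points that reach the new verbs):

	struct ib_mr *mr;

	mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, 128); /* -> vrdma_alloc_mr() */
	if (IS_ERR(mr))
		return PTR_ERR(mr);
	/* ... ib_map_mr_sg(), post work requests ... */
	ib_dereg_mr(mr);                               /* -> vrdma_dereg_mr() */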
Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
.../infiniband/hw/virtio/vrdma_dev_api.h | 40 ++
.../drivers/infiniband/hw/virtio/vrdma_ib.c | 600 ++++++++++++++++++
.../drivers/infiniband/hw/virtio/vrdma_ib.h | 80 +++
3 files changed, 720 insertions(+)
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index d1db1bea4..da99f1f32 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -160,6 +160,46 @@ struct vrdma_cmd_destroy_qp {
__u32 qpn;
};
+struct vrdma_cmd_get_dma_mr {
+ __u32 pdn;
+ __u32 access_flags;
+};
+
+struct vrdma_rsp_get_dma_mr {
+ __u32 mrn;
+ __u32 lkey;
+ __u32 rkey;
+};
+
+struct vrdma_cmd_create_mr {
+ __u32 pdn;
+ __u32 access_flags;
+
+ __u32 max_num_sg;
+};
+
+struct vrdma_rsp_create_mr {
+ __u32 mrn;
+ __u32 lkey;
+ __u32 rkey;
+};
+
+struct vrdma_cmd_dereg_mr {
+ __u32 mrn;
+};
+
+struct vrdma_cmd_add_gid {
+ __u8 gid[16];
+ __u32 gid_type;
+ __u16 index;
+ __u32 port_num;
+};
+
+struct vrdma_cmd_del_gid {
+ __u16 index;
+ __u32 port_num;
+};
+
#define VRDMA_CTRL_OK 0
#define VRDMA_CTRL_ERR 1
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index f1f53314f..b4c16ddbb 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -1086,6 +1086,597 @@ static int vrdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
return rc;
}
+/**
+ * vrdma_get_dma_mr - Get a DMA memory region (direct-access MR without page translation)
+ * @pd: Protection Domain to associate this MR with
+ * @flags: Access permissions (IB_ACCESS_LOCAL_WRITE, IB_ACCESS_REMOTE_READ, etc.)
+ *
+ * This function creates a special type of Memory Region (MR) that refers to
+ * physically contiguous or scatter-gather DMA-capable memory, typically used
+ * for zero-copy or kernel-space registrations without user buffer backing.
+ *
+ * It issues the VIRTIO_RDMA_CMD_GET_DMA_MR command to the backend device,
+ * which returns:
+ * - An MR handle (mrn)
+ * - Local Key (lkey)
+ * - Remote Key (rkey)
+ *
+ * Unlike regular user MRs registered via ->reg_user_mr(), this MR is not
+ * backed by user-space virtual memory (i.e., no ib_umem). It is typically
+ * used for device-private buffers, scratch memory, or control structures.
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * &mr->ibmr on success
+ * * ERR_PTR(-ENOMEM) if memory allocation fails
+ * * ERR_PTR(-EIO) if device communication fails
+ */
+static struct ib_mr *vrdma_get_dma_mr(struct ib_pd *pd, int flags)
+{
+ struct vrdma_dev *vdev = to_vdev(pd->device);
+ struct vrdma_mr *mr;
+ struct vrdma_cmd_get_dma_mr *cmd;
+ struct vrdma_rsp_get_dma_mr *rsp;
+ struct scatterlist in, out;
+ int rc;
+
+ /* Allocate software MR structure */
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd) {
+ rc = -ENOMEM;
+ goto err_free_mr;
+ }
+
+ rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+ if (!rsp) {
+ rc = -ENOMEM;
+ goto err_free_cmd;
+ }
+
+ /* Prepare command parameters */
+ cmd->pdn = to_vpd(pd)->pd_handle;
+ cmd->access_flags = flags;
+
+ sg_init_one(&in, cmd, sizeof(*cmd));
+ sg_init_one(&out, rsp, sizeof(*rsp));
+
+ /* Send GET_DMA_MR command to device */
+ rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_GET_DMA_MR, &in, &out);
+ if (rc) {
+ dev_err(&vdev->vdev->dev,
+ "GET_DMA_MR command failed: %d\n", rc);
+ goto err_free_rsp;
+ }
+
+ /* Initialize MR fields from response */
+ mr->mr_handle = rsp->mrn;
+ mr->ibmr.lkey = rsp->lkey;
+ mr->ibmr.rkey = rsp->rkey;
+ mr->ibmr.pd = pd;
+ mr->ibmr.device = pd->device;
+	mr->ibmr.type = IB_MR_TYPE_DMA; /* DMA MR: no page translation */
+
+ /* No backing user memory */
+ mr->umem = NULL;
+ mr->iova = 0;
+ mr->size = 0;
+ mr->pages = NULL;
+ mr->pages_k = NULL;
+ mr->dma_pages = 0;
+ mr->npages = 0;
+ mr->max_pages = 0;
+
+ /* Cleanup command/response buffers */
+ kfree(cmd);
+ kfree(rsp);
+
+ return &mr->ibmr;
+
+err_free_rsp:
+ kfree(rsp);
+
+err_free_cmd:
+ kfree(cmd);
+
+err_free_mr:
+ kfree(mr);
+ return ERR_PTR(rc);
+}
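+
+/*
+ * Note (for context, not introduced by this patch): for kernel PDs, the
+ * RDMA core calls ->get_dma_mr() from ib_alloc_pd() to back
+ * pd->local_dma_lkey when the device does not advertise a device-wide
+ * local DMA lkey; kernel ULPs then use that lkey for local buffers
+ * without per-buffer registration.
+ */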
+
+/**
+ * vrdma_init_page_tbl - Initialize a two-level page table for MR management
+ * @dev: vRDMA device pointer
+ * @npages: Maximum number of data pages this table can map
+ * @pages_dma: Output: L1 table with entries pointing to DMA addresses of L2 tables
+ * @dma_pages_p: Output: DMA address of the L1 table itself
+ *
+ * This function sets up a two-level page table structure used in Memory Region (MR)
+ * registration to support scatter-gather I/O. The layout is:
+ *
+ * L1 (Level 1): Single page, DMA-coherent, holds pointers to L2 tables.
+ * Will be passed to hardware via WQE or command.
+ *
+ * L2 (Level 2): Array of pages, each holding up to 512 x 8-byte DMA addresses
+ * (for 4KB page size). Each L2 table maps part of the S/G list.
+ *
+ * Example:
+ * npages = 1024 => needs 1024 / 512 = 2 L2 tables
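+ *   Data page i's DMA address then belongs in L2 table i / 512, slot
+ *   i % 512, i.e. pages_k[i / 512][i % 512] on the kernel side (a
+ *   sketch of the indexing implied by this layout; population happens
+ *   at map time, not here).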
+ *
+ * Return:
+ * Pointer to kernel virtual address of L1 table (pages_k), which stores
+ * virtual addresses of L2 tables for cleanup.
+ * On failure, returns NULL and cleans up all allocated memory.
+ */
+static uint64_t **vrdma_init_page_tbl(struct vrdma_dev *dev,
+ unsigned int npages,
+ uint64_t ***pages_dma,
+ dma_addr_t *dma_pages_p)
+{
+	unsigned int nl2 = DIV_ROUND_UP(npages, 512); /* 512 entries per L2 table */
+ uint64_t **l1_table; /* L1: stores DMA addrs of L2s (device-readable) */
+ uint64_t **l1_table_k; /* L1: stores kernel vaddrs of L2s (for free) */
+ dma_addr_t l1_dma_addr;
+ dma_addr_t l2_dma_addr;
+ int i;
+
+ /* Allocate L1 table: must be DMA-coherent because device reads it */
+ l1_table = dma_alloc_coherent(dev->vdev->dev.parent, PAGE_SIZE, &l1_dma_addr, GFP_KERNEL);
+ if (!l1_table)
+ return NULL;
+
+ /* Allocate kernel-space array to track L2 virtual addresses */
+ l1_table_k = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!l1_table_k)
+ goto err_free_l1_table;
+
+ /* Allocate each L2 table (DMA-coherent, one per 512 entries) */
+ for (i = 0; i < nl2; i++) {
+ l1_table_k[i] = dma_alloc_coherent(dev->vdev->dev.parent, PAGE_SIZE, &l2_dma_addr, GFP_KERNEL);
+ if (!l1_table_k[i])
+ goto err_free_l2_tables;
+
+ l1_table[i] = (uint64_t *)l2_dma_addr; /* Device sees DMA address */
+ }
+
+ /* Output parameters */
+ *pages_dma = l1_table; /* Device-visible L1 (with DMA pointers) */
+ *dma_pages_p = l1_dma_addr; /* DMA address of L1 table */
+
+ return l1_table_k; /* Return kernel view for later cleanup */
+
+err_free_l2_tables:
+ /* Roll back any successfully allocated L2 tables */
+ while (--i >= 0) {
+ dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, l1_table_k[i], (dma_addr_t)l1_table[i]);
+ }
+ kfree(l1_table_k);
+
+err_free_l1_table:
+ dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, l1_table, l1_dma_addr);
+
+ return NULL;
+}
+
+/**
+ * vrdma_free_page_tbl - Free a two-level page table
+ * @dev: vRDMA device
+ * @pages_k: Return value from vrdma_init_page_tbl (kernel L2 pointers)
+ * @pages: L1 table with DMA addresses (output of pages_dma)
+ * @dma_pages: DMA address of L1 table
+ * @npages: Number of pages that were to be supported
+ *
+ * Frees both L1 and all L2 page tables allocated by vrdma_init_page_tbl.
+ */
+static void vrdma_free_page_tbl(struct vrdma_dev *dev,
+ uint64_t **pages_k,
+ uint64_t **pages,
+ dma_addr_t dma_pages,
+ unsigned int npages)
+{
+	unsigned int nl2 = DIV_ROUND_UP(npages, 512);
+ int i;
+
+ if (!pages_k || !pages)
+ return;
+
+	/* Free all L2 tables; pages[i] stores each L2 table's DMA address */
+	for (i = 0; i < nl2; i++) {
+		if (pages_k[i])
+			dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, pages_k[i],
+					  (dma_addr_t)pages[i]);
+	}
+
+ /* Free L1 tracking array */
+ kfree(pages_k);
+
+ /* Free L1 DMA table */
+ dma_free_coherent(dev->vdev->dev.parent, PAGE_SIZE, pages, dma_pages);
+}
+
+/**
+ * vrdma_alloc_mr - Allocate a multi-segment Memory Region (MR) with page tables
+ * @pd: Protection Domain to associate the MR with
+ * @mr_type: Type of MR (must be IB_MR_TYPE_MEM_REG)
+ * @max_num_sg: Maximum number of scatter/gather entries supported by this MR
+ *
+ * This function allocates a software MR structure and reserves a hardware MR
+ * context on the backend vRDMA device. It prepares a two-level page table
+ * (L1/L2) to support up to @max_num_sg pages, which is filled in later
+ * during memory registration (e.g., via ib_map_mr_sg()).
+ *
+ * The allocated MR is not yet backed by any actual memory - it serves as a
+ * container for future page population (used primarily by fast-register
+ * (FRWR) mechanisms).
+ *
+ * Command flow:
+ * - Sends VIRTIO_RDMA_CMD_CREATE_MR to device
+ * - Receives mr_handle, lkey, rkey from response
+ *
+ * Context: Called in process context. May sleep.
+ * Return:
+ * * &mr->ibmr on success
+ * * ERR_PTR(-EINVAL) if unsupported MR type
+ * * ERR_PTR(-ENOMEM) if memory allocation fails
+ * * ERR_PTR(-EIO) if device command fails
+ */
+static struct ib_mr *vrdma_alloc_mr(struct ib_pd *pd,
+ enum ib_mr_type mr_type,
+ u32 max_num_sg)
+{
+ struct vrdma_dev *vdev = to_vdev(pd->device);
+ struct vrdma_mr *mr;
+ struct vrdma_cmd_create_mr *cmd;
+ struct vrdma_rsp_create_mr *rsp;
+ struct scatterlist in, out;
+ int rc;
+
+ /* Only support standard memory registration */
+ if (mr_type != IB_MR_TYPE_MEM_REG)
+ return ERR_PTR(-EINVAL);
+
+ /* Allocate software MR structure */
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd) {
+ rc = -ENOMEM;
+ goto err_free_mr;
+ }
+
+ rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+ if (!rsp) {
+ rc = -ENOMEM;
+ goto err_free_cmd;
+ }
+
+ /*
+ * Allocate two-level page table for S/G support.
+ * Each L2 table holds PAGE_SIZE / sizeof(u64) entries.
+ * L1 table points to multiple L2s.
+ */
+ mr->pages_k = vrdma_init_page_tbl(vdev, max_num_sg,
+ &mr->pages, &mr->dma_pages);
+ if (!mr->pages_k) {
+ dev_err(&vdev->vdev->dev,
+ "Failed to allocate page table for %u S/G entries\n",
+ max_num_sg);
+ rc = -ENOMEM;
+ goto err_free_rsp;
+ }
+
+ mr->max_pages = max_num_sg;
+ mr->npages = 0;
+ mr->umem = NULL; /* No user memory backing at this stage */
+ mr->iova = 0;
+ mr->size = 0;
+
+ /* Prepare command */
+ cmd->pdn = to_vpd(pd)->pd_handle;
+ cmd->max_num_sg = max_num_sg;
+
+ sg_init_one(&in, cmd, sizeof(*cmd));
+ sg_init_one(&out, rsp, sizeof(*rsp));
+
+ /* Send CREATE_MR command to backend device */
+ rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_CREATE_MR, &in, &out);
+ if (rc) {
+ dev_err(&vdev->vdev->dev, "CREATE_MR failed: %d\n", rc);
+ goto err_free_page_tbl;
+ }
+
+ /* Initialize MR metadata from response */
+ mr->mr_handle = rsp->mrn;
+ mr->ibmr.lkey = rsp->lkey;
+ mr->ibmr.rkey = rsp->rkey;
+ mr->ibmr.pd = pd;
+ mr->ibmr.device = &vdev->ib_dev;
+ mr->ibmr.type = IB_MR_TYPE_MEM_REG;
+
+ /* Clean up command/response buffers */
+ kfree(cmd);
+ kfree(rsp);
+
+ return &mr->ibmr;
+
+err_free_page_tbl:
+ vrdma_free_page_tbl(vdev, mr->pages_k, mr->pages, mr->dma_pages,
+ max_num_sg);
+err_free_rsp:
+ kfree(rsp);
+err_free_cmd:
+ kfree(cmd);
+err_free_mr:
+ kfree(mr);
+ return ERR_PTR(rc);
+}
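+
+/*
+ * Illustrative only (not wired up in this patch): a ->map_mr_sg()
+ * set_page callback could populate the two-level table along these
+ * lines, assuming 512 entries per L2 page:
+ *
+ *	static int vrdma_set_page(struct ib_mr *ibmr, u64 addr)
+ *	{
+ *		struct vrdma_mr *mr = to_vmr(ibmr);
+ *
+ *		if (mr->npages == mr->max_pages)
+ *			return -ENOMEM;
+ *		mr->pages_k[mr->npages / 512][mr->npages % 512] = addr;
+ *		mr->npages++;
+ *		return 0;
+ *	}
+ */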
+
+/**
+ * vrdma_dereg_mr - Deregister and destroy a Memory Region (MR)
+ * @ibmr: The IB memory region to deregister
+ * @udata: User data (optional, for user-space MRs)
+ *
+ * This function unregisters a previously allocated MR from the vRDMA device.
+ * It performs the following steps:
+ * 1. Sends VIRTIO_RDMA_CMD_DEREG_MR command to the backend device
+ * 2. Frees software page tables (L1/L2) used for scatter-gather mapping
+ * 3. Releases user memory (if any) via ib_umem_release()
+ * 4. Frees local metadata (struct vrdma_mr)
+ *
+ * Context: Can be called in process context. May sleep.
+ * Return:
+ * * 0 on success
+ * * -EIO if device communication fails
+ * * -ENOMEM if the command buffer cannot be allocated
+ */
+static int vrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+ struct vrdma_dev *vdev = to_vdev(ibmr->device);
+ struct vrdma_mr *mr = to_vmr(ibmr);
+ struct vrdma_cmd_dereg_mr *cmd;
+ struct scatterlist in;
+ int rc;
+
+ /* Allocate command buffer */
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ /* Prepare command */
+ cmd->mrn = mr->mr_handle;
+ sg_init_one(&in, cmd, sizeof(*cmd));
+
+ /* Notify hardware to release MR context */
+ rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_DEREG_MR, &in, NULL);
+ if (rc) {
+ dev_err(&vdev->vdev->dev,
+ "VIRTIO_RDMA_CMD_DEREG_MR failed for mrn=0x%x, err=%d\n",
+ mr->mr_handle, rc);
+ rc = -EIO;
+ goto out_free_cmd;
+ }
+
+ /* Free two-level page table used for S/G entries */
+ vrdma_free_page_tbl(vdev, mr->pages_k, mr->pages, mr->dma_pages, mr->max_pages);
+
+ /* Release user memory if present */
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+
+	/* Success: free the command buffer and the software MR structure */
+	kfree(cmd);
+	kfree(mr);
+	return 0;
+
+out_free_cmd:
+ kfree(cmd);
+ return rc;
+}
+
+/**
+ * vrdma_add_gid - Add a GID (Global Identifier) entry to the hardware
+ * @attr: GID attribute containing port, index, GID value, and GID type
+ * @context: Pointer to store driver-specific context (unused in vRDMA)
+ *
+ * This callback is invoked by the RDMA core when a GID table entry is added,
+ * typically when an IP address is assigned to the associated net device.
+ *
+ * The function sends VIRTIO_RDMA_CMD_ADD_GID to the backend device to register
+ * the GID at the specified index and port. This allows the device to use this
+ * GID for RoCE traffic (e.g., as source in GRH).
+ *
+ * Note: The @context parameter is unused in vRDMA drivers since no additional
+ * per-GID software state is maintained.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if allocation fails
+ * * -EIO if device command fails
+ */
+static int vrdma_add_gid(const struct ib_gid_attr *attr, void **context)
+{
+ struct vrdma_dev *vdev = to_vdev(attr->device);
+ struct vrdma_cmd_add_gid *cmd;
+ struct scatterlist in;
+ int rc;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ /* Fill command parameters */
+ memcpy(cmd->gid, attr->gid.raw, sizeof(cmd->gid));
+ cmd->index = attr->index;
+ cmd->port_num = attr->port_num;
+ cmd->gid_type = attr->gid_type; /* e.g., IB_GID_TYPE_ROCE or IB_GID_TYPE_ROCE_UDP_ENCAP */
+
+ sg_init_one(&in, cmd, sizeof(*cmd));
+
+ /* Send command to backend */
+ rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_ADD_GID, &in, NULL);
+ if (rc)
+ dev_err(&vdev->vdev->dev,
+ "ADD_GID failed: port=%u index=%u type=%d, err=%d\n",
+ attr->port_num, attr->index, attr->gid_type, rc);
+
+ kfree(cmd);
+ return rc ? -EIO : 0;
+}
+
+/**
+ * vrdma_del_gid - Remove a GID entry from the hardware
+ * @attr: GID attribute specifying which GID to delete (by index/port)
+ * @context: Driver-specific context (passed from add_gid; unused here)
+ *
+ * This callback is called when a GID is removed from the GID table.
+ * It notifies the backend device to invalidate the GID mapping at the given index.
+ *
+ * The @context pointer is ignored because vRDMA does not maintain per-GID software state.
+ *
+ * Context: Can sleep (process context).
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if allocation fails
+ * * -EIO if device command fails
+ */
+static int vrdma_del_gid(const struct ib_gid_attr *attr, void **context)
+{
+ struct vrdma_dev *vdev = to_vdev(attr->device);
+ struct vrdma_cmd_del_gid *cmd;
+ struct scatterlist in;
+ int rc;
+
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ /* Only index and port are needed to identify the GID */
+ cmd->index = attr->index;
+ cmd->port_num = attr->port_num;
+
+ sg_init_one(&in, cmd, sizeof(*cmd));
+
+ /* Send command to backend */
+ rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_DEL_GID, &in, NULL);
+ if (rc)
+ dev_err(&vdev->vdev->dev,
+ "DEL_GID failed: port=%u index=%u, err=%d\n",
+ attr->port_num, attr->index, rc);
+
+ kfree(cmd);
+ return rc ? -EIO : 0;
+}
+
+static int vrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata)
+{
+ struct vrdma_ucontext *vuc = to_vucontext(uctx);
+
+ vuc->dev = to_vdev(uctx->device);
+
+ return 0;
+}
+
+static void vrdma_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+}
+
+/**
+ * vrdma_create_ah - Create an Address Handle (AH) for RoCE communication
+ * @ibah: IB address handle to initialize
+ * @init_attr: AH initialization attributes
+ * @udata: User data (unused in vRDMA)
+ *
+ * This function creates a software-only Address Handle (AH), which represents
+ * a remote destination for UD or RC QP sends. Since this is a virtualized driver,
+ * no hardware command is sent; instead, the AH context is stored locally in
+ * struct vrdma_ah for later use during packet construction.
+ *
+ * The AH must:
+ * - Be RoCE type
+ * - Contain GRH (Global Routing Header)
+ * - Not be multicast (currently unsupported)
+ *
+ * Also enforces device limit on maximum number of active AHs via atomic counter.
+ *
+ * Context: Can sleep (called in process context).
+ * Return:
+ * * 0 on success
+ * * -EINVAL if attributes are invalid
+ * * -ENOMEM if AH limit exceeded
+ */
+static int vrdma_create_ah(struct ib_ah *ibah,
+ struct rdma_ah_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct vrdma_dev *vdev = to_vdev(ibah->device);
+ struct vrdma_ah *ah = to_vah(ibah);
+ const struct ib_global_route *grh;
+ u32 port_num = rdma_ah_get_port_num(init_attr->ah_attr);
+
+ /* Must have GRH enabled */
+ if (!(rdma_ah_get_ah_flags(init_attr->ah_attr) & IB_AH_GRH))
+ return -EINVAL;
+
+ grh = rdma_ah_read_grh(init_attr->ah_attr);
+
+	/* Only support RoCE AHs with a unicast destination GID */
+ if (init_attr->ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE)
+ return -EINVAL;
+
+ if (rdma_is_multicast_addr((struct in6_addr *)grh->dgid.raw)) {
+ dev_dbg(&vdev->vdev->dev, "Multicast GID not supported in AH\n");
+ return -EINVAL;
+ }
+
+	/* Enforce the device max_ah limit via atomic_add_unless() */
+ if (!atomic_add_unless(&vdev->num_ah, 1, vdev->ib_dev.attrs.max_ah)) {
+ dev_dbg(&vdev->vdev->dev, "Exceeded max number of AHs (%u)\n",
+ vdev->ib_dev.attrs.max_ah);
+ return -ENOMEM;
+ }
+
+ /* Initialize AV (Address Vector) with relevant fields */
+ ah->av.port = port_num;
+ ah->av.pdn = to_vpd(ibah->pd)->pd_handle; /* Protection Domain Number */
+ ah->av.gid_index = grh->sgid_index; /* Source GID table index */
+ ah->av.hop_limit = grh->hop_limit;
+	ah->av.sl_tclass_flowlabel = (u32)(grh->traffic_class << 20) |
+				     (grh->flow_label & 0xfffff); /* tclass bits 27:20, flow label bits 19:0 */
+
+ memcpy(ah->av.dgid, grh->dgid.raw, sizeof(ah->av.dgid)); /* 128-bit Dest GID */
+ memcpy(ah->av.dmac, init_attr->ah_attr->roce.dmac, ETH_ALEN); /* Next-hop MAC */
+
+ return 0;
+}
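+
+/*
+ * Example packing (illustrative): traffic_class = 0x12 and
+ * flow_label = 0x34567 give sl_tclass_flowlabel =
+ * (0x12 << 20) | 0x34567 = 0x01234567.
+ */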
+
+/**
+ * vrdma_destroy_ah - Destroy an Address Handle
+ * @ibah: The IB address handle to destroy
+ * @flags: Destroy flags (e.g., RDMA_DESTROY_AH_SLEEPABLE; unused here)
+ *
+ * This callback releases the software state associated with an AH.
+ * It decrements the per-device AH counter to allow new AH creation.
+ *
+ * No hardware interaction is needed since AHs are purely software constructs
+ * in this virtio-rdma implementation.
+ *
+ * Context: Must not sleep unless @flags contains RDMA_DESTROY_AH_SLEEPABLE.
+ * Return: Always returns 0 (success).
+ */
+static int vrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
+{
+ struct vrdma_dev *vdev = to_vdev(ibah->device);
+
+ atomic_dec(&vdev->num_ah);
+
+ return 0;
+}
+
static const struct ib_device_ops vrdma_dev_ops = {
.owner = THIS_MODULE,
.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -1101,6 +1692,15 @@ static const struct ib_device_ops vrdma_dev_ops = {
.dealloc_pd = vrdma_dealloc_pd,
.create_qp = vrdma_create_qp,
.destroy_qp = vrdma_destroy_qp,
+ .get_dma_mr = vrdma_get_dma_mr,
+ .alloc_mr = vrdma_alloc_mr,
+ .dereg_mr = vrdma_dereg_mr,
+ .add_gid = vrdma_add_gid,
+ .del_gid = vrdma_del_gid,
+ .alloc_ucontext = vrdma_alloc_ucontext,
+ .dealloc_ucontext = vrdma_dealloc_ucontext,
+ .create_ah = vrdma_create_ah,
+ .destroy_ah = vrdma_destroy_ah,
};
/**
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
index ba88599c8..6759c4349 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
@@ -11,6 +11,8 @@
#include <rdma/ib_verbs.h>
#include <rdma/vrdma_abi.h>
+#include "vrdma_abi.h"
+
#define VRDMA_COMM_TIMEOUT 1000000
enum vrdma_type {
@@ -130,6 +132,11 @@ struct vrdma_ucontext {
struct vrdma_dev *dev;
};
+struct vrdma_ah {
+ struct ib_ah ibah;
+ struct vrdma_av av;
+};
+
/**
* struct vrdma_qp - Virtual RDMA Queue Pair (QP) private data
*
@@ -166,6 +173,64 @@ struct vrdma_qp {
struct vrdma_user_mmap_entry *rq_entry; /* Mmap entry for RQ buffer */
};
+/**
+ * struct vrdma_mr - Software state of a Virtio-RDMA Memory Region (MR)
+ * @ibmr: InfiniBand core MR object (contains rkey, lkey, etc.)
+ * @umem: User memory descriptor from ib_umem_get(), holds
+ * page list and reference to user VMA
+ * @mr_handle: Handle returned by backend device for this MR
+ * @iova: I/O virtual address (start of the mapped region)
+ * @size: Total size of the memory region in bytes
+ * @pages:       Level 1 (L1) page table - device-visible array holding the
+ *               DMA addresses of the level 2 (L2) page tables, which in turn
+ *               hold the DMA addresses of the data pages.
+ * @pages_k: Array of kernel virtual addresses of L2 page tables.
+ * Used to free memory correctly during cleanup.
+ * @dma_pages: DMA address of the L1 page table (first-level table),
+ * to be passed to the device or written in command WQE.
+ * @npages: Number of valid pages in the memory region
+ * @max_pages: Maximum number of pages that can be held in current
+ * page table allocation (based on initial mapping size)
+ *
+ * This structure represents a registered memory region in the vRDMA driver.
+ * It supports large memory registrations using a two-level page table design:
+ *
+ * L1 Page Table (contiguous DMA-mapped):
+ * Contains pointers to multiple L2 tables (each L2 = one page).
+ *
+ * L2 Page Tables:
+ * Each stores up to N DMA addresses (physical page addresses).
+ *
+ * The layout allows efficient hardware access while keeping every kernel
+ * allocation to a single page, even for large mappings.
+ *
+ * Example layout for 4K pages and 512 entries per L2 table:
+ *
+ * L1 (dma_pages) -> [L2_0] -> [DMA_ADDR_A, ..., DMA_ADDR_Z]
+ * [L2_1] -> [DMA_ADDR_X, ..., DMA_ADDR_Y]
+ * ...
+ *
+ * Used during:
+ * - ib_reg_mr()
+ * - SEND/WRITE/READ operations with remote access
+ * - MR invalidation and cleanup in vrdma_dereg_mr()
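+ *
+ * Capacity sketch, assuming a single-page L1 table and 4 KiB pages:
+ *   512 L1 entries x 512 entries per L2 table x 4 KiB = 1 GiB per MR.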
+ */
+struct vrdma_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+
+ u32 mr_handle;
+ u64 iova;
+ u64 size;
+
+ u64 **pages; /* L1: array of L2 table DMA address pointers */
+ u64 **pages_k; /* L1: array of L2 table kernel virtual addresses */
+ dma_addr_t dma_pages; /* DMA address of the L1 table itself */
+
+ u32 npages;
+ u32 max_pages;
+};
+
static inline struct vrdma_cq *to_vcq(struct ib_cq *ibcq)
{
return container_of(ibcq, struct vrdma_cq, ibcq);
@@ -181,6 +246,21 @@ static inline struct vrdma_qp *to_vqp(struct ib_qp *ibqp)
return container_of(ibqp, struct vrdma_qp, ibqp);
}
+static inline struct vrdma_mr *to_vmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct vrdma_mr, ibmr);
+}
+
+static inline struct vrdma_ucontext *to_vucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct vrdma_ucontext, ibucontext);
+}
+
+static inline struct vrdma_ah *to_vah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct vrdma_ah, ibah);
+}
+
int vrdma_register_ib_device(struct vrdma_dev *vrdev);
void vrdma_unregister_ib_device(struct vrdma_dev *vrdev);
--
2.43.0