[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251218091050.55047-10-15927021679@163.com>
Date: Thu, 18 Dec 2025 17:09:49 +0800
From: Xiong Weimin <15927021679@....com>
To: "Michael S . Tsirkin" <mst@...hat.com>,
David Hildenbrand <david@...hat.com>,
Jason Wang <jasowang@...hat.com>,
Stefano Garzarella <sgarzare@...hat.com>,
Thomas Monjalon <thomas@...jalon.net>,
David Marchand <david.marchand@...hat.com>,
Luca Boccassi <bluca@...ian.org>,
Kevin Traynor <ktraynor@...hat.com>,
Christian Ehrhardt <christian.ehrhardt@...onical.com>,
Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
Eugenio Pérez <eperezma@...hat.com>,
Xueming Li <xuemingl@...dia.com>,
Maxime Coquelin <maxime.coquelin@...hat.com>,
Chenbo Xia <chenbox@...dia.com>,
Bruce Richardson <bruce.richardson@...el.com>
Cc: kvm@...r.kernel.org,
virtualization@...ts.linux.dev,
netdev@...r.kernel.org,
xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 09/10] drivers/infiniband/hw/virtio: Implement P_key, QP query and user MR resource management verbs
From: xiongweimin <xiongweimin@...inos.cn>
This commit adds support for essential RDMA resource management verbs:
1. P_Key Table Query:
- Implements IB_QUERY_PKEY verb for partition key retrieval
- Handles endianness conversion for cross-platform compatibility
- Provides complete error handling for device communication failures
2. QP Attribute Query:
- Full QP state retrieval including capabilities and AH attributes
- Byte order handling for all struct fields
- Init attribute preservation for consistency checks
- Detailed error logging for debugging
3. User Memory Registration:
- Memory pinning via ib_umem_get() with access flag enforcement
- DMA-safe page table construction and bulk transfer to device
- Multi-architecture DMA address handling
- Strict memory boundary validation
- Resource cleanup guarantees on all error paths
Key enhancements:
- Unified virtqueue command infrastructure
- Cross-architecture endianness handling
- Atomic page table transfer for registered memory regions
- Protection domain integration for memory access control
- Error injection points for robust resource recovery
Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
.../infiniband/hw/virtio/vrdma_dev_api.h | 35 ++
.../drivers/infiniband/hw/virtio/vrdma_ib.c | 333 +++++++++++++++++-
2 files changed, 367 insertions(+), 1 deletion(-)
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index d0ce02601..86b5ecade 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -225,6 +225,41 @@ struct vrdma_rsp_modify_qp {
__u32 qpn;
};
+struct vrdma_cmd_query_pkey {
+ __u32 port;
+ __u16 index;
+};
+
+struct vrdma_rsp_query_pkey {
+ __u16 pkey;
+};
+
+struct vrdma_cmd_query_qp {
+ __u32 qpn;
+ __u32 attr_mask;
+};
+
+struct vrdma_rsp_query_qp {
+ struct vrdma_qp_attr attr;
+};
+
+struct vrdma_cmd_reg_user_mr {
+ __u32 pdn;
+ __u32 access_flags;
+ __u64 start;
+ __u64 length;
+ __u64 virt_addr;
+
+ __u64 pages;
+ __u32 npages;
+};
+
+struct vrdma_rsp_reg_user_mr {
+ __u32 mrn;
+ __u32 lkey;
+ __u32 rkey;
+};
+
#define VRDMA_CTRL_OK 0
#define VRDMA_CTRL_ERR 1
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index f9b129774..b1429e072 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -23,6 +23,7 @@
#include "vrdma_queue.h"
#define VRTIO_RDMA_PAGE_PER_TBL 512
/* Upper bound on pages per MR: 512 page tables x 512 pages per table. */
#define VRDMA_MAX_PAGES (512 * 512)
/**
* cmd_str - String representation of virtio RDMA control commands
@@ -86,6 +87,36 @@ static void rdma_ah_attr_to_vrdma(struct vrdma_ah_attr *dst,
memcpy(&dst->roce, &src->roce, sizeof(struct roce_ah_attr));
}
+static void vrdma_to_ib_global_route(struct ib_global_route *dst,
+ const struct vrdma_global_route *src)
+{
+ dst->dgid = src->dgid;
+ dst->flow_label = src->flow_label;
+ dst->sgid_index = src->sgid_index;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+}
+
+static void vrdma_to_ib_qp_cap(struct ib_qp_cap *dst, const struct vrdma_qp_cap *src)
+{
+ dst->max_send_wr = src->max_send_wr;
+ dst->max_recv_wr = src->max_recv_wr;
+ dst->max_send_sge = src->max_send_sge;
+ dst->max_recv_sge = src->max_recv_sge;
+ dst->max_inline_data = src->max_inline_data;
+}
+
+static void vrdma_to_rdma_ah_attr(struct rdma_ah_attr *dst,
+ const struct vrdma_ah_attr *src)
+{
+ vrdma_to_ib_global_route(rdma_ah_retrieve_grh(dst), &src->grh);
+ rdma_ah_set_sl(dst, src->sl);
+ rdma_ah_set_static_rate(dst, src->static_rate);
+ rdma_ah_set_port_num(dst, src->port_num);
+ rdma_ah_set_ah_flags(dst, src->ah_flags);
+ memcpy(&dst->roce, &src->roce, sizeof(struct roce_ah_attr));
+}
+
/**
* vrdma_exec_verbs_cmd - Execute a verbs command via control virtqueue
* @vrdev: VRDMA device
@@ -2521,6 +2552,303 @@ static int vrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
return rc;
}
+/**
+ * vrdma_query_pkey - Query Partition Key (P_Key) at given index
+ * @ibdev: Verbs device (vRDMA virtual device)
+ * @port: Port number (1-indexed)
+ * @index: P_Key table index
+ * @pkey: Output buffer to store the P_Key value
+ *
+ * Queries the P_Key from the backend via virtqueue command.
+ * Only meaningful for IB-style ports (not RoCE).
+ *
+ * Context: Process context (may sleep). Can be called from user IOCTL path.
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if command allocation fails
+ * * -EIO or other negative errno on communication failure
+ */
+static int vrdma_query_pkey(struct ib_device *ibdev, u32 port, u16 index, u16 *pkey)
+{
+ struct vrdma_dev *vdev = to_vdev(ibdev);
+ struct vrdma_cmd_query_pkey *cmd;
+ struct vrdma_rsp_query_pkey *rsp;
+ struct scatterlist in, out;
+ int rc;
+
+ /* Allocate command and response buffers */
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ if (!cmd)
+ return -ENOMEM;
+
+ rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+ if (!rsp) {
+ kfree(cmd);
+ return -ENOMEM;
+ }
+
+ /* Fill input parameters */
+ cmd->port = cpu_to_le32(port);
+ cmd->index = cpu_to_le16(index);
+
+ /* Prepare scatterlists for virtqueue I/O */
+ sg_init_one(&in, cmd, sizeof(*cmd));
+ sg_init_one(&out, rsp, sizeof(*rsp));
+
+ /* Execute command */
+ rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_QUERY_PKEY, &in, &out);
+ if (rc) {
+ dev_err(&vdev->vdev->dev,
+ "VIRTIO_RDMA_CMD_QUERY_PKEY failed: port=%u idx=%u err=%d\n",
+ port, index, rc);
+ goto out_free;
+ }
+
+ /* Copy result to user */
+ *pkey = le16_to_cpu(rsp->pkey);
+
+out_free:
+ kfree(rsp);
+ kfree(cmd);
+ return rc;
+}
+
/**
 * vrdma_query_qp - Query QP attributes from the backend
 * @ibqp: Queue pair to query
 * @attr: Output structure for QP attributes
 * @attr_mask: Which fields are requested (passed through to the device;
 *             the backend may ignore it and return everything)
 * @init_attr: Output structure for init-time attributes
 *
 * Sends VIRTIO_RDMA_CMD_QUERY_QP over the control virtqueue and converts
 * the little-endian response into host byte order.  @init_attr is filled
 * from state cached in the driver's QP object, not from the device.
 * This is a synchronous operation.
 *
 * Context: Process context (can sleep)
 * Return:
 * * 0 on success
 * * -ENOMEM if allocation fails
 * * -EIO or other negative errno on communication failure
 */
static int vrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
			  int attr_mask, struct ib_qp_init_attr *init_attr)
{
	struct vrdma_qp *vqp = to_vqp(ibqp);
	struct vrdma_dev *vdev = to_vdev(ibqp->device);
	struct vrdma_cmd_query_qp *cmd;
	struct vrdma_rsp_query_qp *rsp;
	struct scatterlist in, out;
	int rc;

	/* Allocate command and response buffers (virtqueue needs kmalloc'd memory) */
	cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
	if (!cmd)
		return -ENOMEM;

	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
	if (!rsp) {
		kfree(cmd);
		return -ENOMEM;
	}

	/* Fill input parameters; device expects little-endian on the wire */
	cmd->qpn = cpu_to_le32(vqp->qp_handle);
	cmd->attr_mask = cpu_to_le32(attr_mask); /* Optional optimization */

	sg_init_one(&in, cmd, sizeof(*cmd));
	sg_init_one(&out, rsp, sizeof(*rsp));

	/* Execute command over control virtqueue */
	rc = vrdma_exec_verbs_cmd(vdev, VIRTIO_RDMA_CMD_QUERY_QP, &in, &out);
	if (rc) {
		dev_err(&vdev->vdev->dev,
			"VIRTIO_RDMA_CMD_QUERY_QP failed: qpn=0x%x err=%d\n",
			vqp->qp_handle, rc);
		goto out_free;
	}

	/* Only copy results on success; single-byte fields need no swap */
	attr->qp_state = rsp->attr.qp_state;
	attr->cur_qp_state = rsp->attr.cur_qp_state;
	attr->path_mtu = rsp->attr.path_mtu;
	attr->path_mig_state = rsp->attr.path_mig_state;
	attr->qkey = le32_to_cpu(rsp->attr.qkey);
	attr->rq_psn = le32_to_cpu(rsp->attr.rq_psn);
	attr->sq_psn = le32_to_cpu(rsp->attr.sq_psn);
	attr->dest_qp_num = le32_to_cpu(rsp->attr.dest_qp_num);
	attr->qp_access_flags = le32_to_cpu(rsp->attr.qp_access_flags);
	attr->pkey_index = le16_to_cpu(rsp->attr.pkey_index);
	attr->alt_pkey_index = le16_to_cpu(rsp->attr.alt_pkey_index);
	attr->en_sqd_async_notify = rsp->attr.en_sqd_async_notify;
	attr->sq_draining = rsp->attr.sq_draining;
	attr->max_rd_atomic = rsp->attr.max_rd_atomic;
	attr->max_dest_rd_atomic = rsp->attr.max_dest_rd_atomic;
	attr->min_rnr_timer = rsp->attr.min_rnr_timer;
	attr->port_num = rsp->attr.port_num;
	attr->timeout = rsp->attr.timeout;
	attr->retry_cnt = rsp->attr.retry_cnt;
	attr->rnr_retry = rsp->attr.rnr_retry;
	attr->alt_port_num = rsp->attr.alt_port_num;
	attr->alt_timeout = rsp->attr.alt_timeout;
	attr->rate_limit = le32_to_cpu(rsp->attr.rate_limit);

	/* Copy capabilities */
	vrdma_to_ib_qp_cap(&attr->cap, &rsp->attr.cap);

	/* Convert AH attributes (contains GRH + DIP) */
	vrdma_to_rdma_ah_attr(&attr->ah_attr, &rsp->attr.ah_attr);
	vrdma_to_rdma_ah_attr(&attr->alt_ah_attr, &rsp->attr.alt_ah_attr);

	/* Fill init attributes from driver-cached state (mostly static) */
	init_attr->event_handler = vqp->ibqp.event_handler;
	init_attr->qp_context = vqp->ibqp.qp_context;
	init_attr->send_cq = vqp->ibqp.send_cq;
	init_attr->recv_cq = vqp->ibqp.recv_cq;
	init_attr->srq = vqp->ibqp.srq;
	init_attr->xrcd = NULL; /* Not supported in vRDMA */
	init_attr->cap = attr->cap;
	/* NOTE(review): hard-coded; confirm it matches the signalling type
	 * actually requested at QP creation time. */
	init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
	init_attr->qp_type = vqp->ibqp.qp_type;
	init_attr->create_flags = 0;
	init_attr->port_num = vqp->port;

out_free:
	kfree(rsp);
	kfree(cmd);
	return rc;
}
+
+/**
+ * vrdma_reg_user_mr - Register a user memory region
+ * @pd: Protection domain
+ * @start: User virtual address of memory to register
+ * @length: Length of memory region
+ * virt_addr: Optional virtual address for rkey access (often same as start)
+ * @access_flags: Access permissions (IB_ACCESS_xxx)
+ * @udata: User data (optional, unused here)
+ *
+ * Locks down user pages, builds page table, and registers MR with backend.
+ * Returns pointer to ib_mr or ERR_PTR on failure.
+ *
+ * Context: Process context (may sleep during ib_umem_get)
+ * Return:
+ * * Pointer to &mr->ibmr on success
+ * * ERR_PTR(-errno) on failure
+ */
+static struct ib_mr *vrdma_reg_user_mr(struct ib_pd *pd, u64 start,
+ u64 length, u64 virt_addr,
+ int access_flags,
+ struct ib_udata *udata)
+{
+ struct vrdma_dev *dev = to_vdev(pd->device);
+ struct vrdma_cmd_reg_user_mr *cmd;
+ struct vrdma_rsp_reg_user_mr *rsp;
+ struct vrdma_mr *mr;
+ struct ib_umem *umem;
+ struct sg_dma_page_iter sg_iter;
+ struct scatterlist in, out;
+ int rc = 0;
+ unsigned npages;
+ dma_addr_t *pages_flat = NULL;
+
+ /* Step 1: Pin user memory pages */
+ umem = ib_umem_get(pd->device, start, length, access_flags);
+ if (IS_ERR(umem)) {
+ dev_err(&dev->vdev->dev, "Failed to pin user memory: va=0x%llx len=%llu\n",
+ start, length);
+ return ERR_CAST(umem);
+ }
+
+ npages = ib_umem_num_pages(umem);
+ if (npages == 0 || npages > VRDMA_MAX_PAGES) { // e.g., VRDMA_MAX_PAGES = 512*512
+ dev_err(&dev->vdev->dev, "Invalid number of pages: %u\n", npages);
+ rc = -EINVAL;
+ goto err_umem;
+ }
+
+ /* Allocate command/response structures (GFP_KERNEL ok in process context) */
+ cmd = kzalloc(sizeof(*cmd), GFP_KERNEL);
+ rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!cmd || !rsp || !mr) {
+ rc = -ENOMEM;
+ goto err_alloc;
+ }
+
+ /* Initialize MR structure */
+ mr->umem = umem;
+ mr->size = length;
+ mr->iova = virt_addr;
+ mr->max_pages = npages;
+
+ /* Allocate contiguous DMA-mapped array for page addresses */
+ pages_flat = dma_alloc_coherent(&dev->vdev->dev,
+ npages * sizeof(dma_addr_t),
+ &mr->dma_pages, GFP_KERNEL);
+ if (!pages_flat) {
+ dev_err(&dev->vdev->dev, "Failed to allocate DMA memory for page table\n");
+ rc = -ENOMEM;
+ goto err_alloc;
+ }
+ mr->pages_k = &pages_flat; /* Treat as 2D: [i/512][i%512] */
+
+ /* Fill page table from ib_umem scatterlist */
+ mr->npages = 0;
+ for_each_sg_dma_page(umem->sgt_append.sgt.sgl, &sg_iter, umem->sgt_append.sgt.nents, 0) {
+ dma_addr_t addr = sg_page_iter_dma_address(&sg_iter);
+ pages_flat[mr->npages++] = addr;
+ }
+
+ /* Sanity check: should match ib_umem_num_pages() */
+ WARN_ON(mr->npages != npages);
+
+ /* Prepare command */
+ cmd->pdn = cpu_to_le32(to_vpd(pd)->pd_handle);
+ cmd->start = cpu_to_le64(start);
+ cmd->length = cpu_to_le64(length);
+ cmd->virt_addr = cpu_to_le64(virt_addr);
+ cmd->access_flags = cpu_to_le32(access_flags);
+ cmd->pages = cpu_to_le64(mr->dma_pages); /* DMA address of page array */
+ cmd->npages = cpu_to_le32(npages);
+
+ sg_init_one(&in, cmd, sizeof(*cmd));
+ sg_init_one(&out, rsp, sizeof(*rsp));
+
+ /* Send command to backend */
+ rc = vrdma_exec_verbs_cmd(dev, VIRTIO_RDMA_CMD_REG_USER_MR, &in, &out);
+ if (rc) {
+ dev_err(&dev->vdev->dev, "Backend failed to register MR: %d\n", rc);
+ goto err_cmd;
+ }
+
+ /* Copy results from response */
+ mr->mr_handle = le32_to_cpu(rsp->mrn);
+ mr->ibmr.lkey = le32_to_cpu(rsp->lkey);
+ mr->ibmr.rkey = le32_to_cpu(rsp->rkey);
+
+ /* Cleanup temporary allocations */
+ kfree(cmd);
+ kfree(rsp);
+
+ /* Link MR to PD if needed, initialize other fields */
+ mr->ibmr.pd = pd;
+ mr->ibmr.device = pd->device;
+ mr->ibmr.type = IB_MR_TYPE_MEM_REG;
+ mr->ibmr.length = length;
+
+ return &mr->ibmr;
+
+err_cmd:
+ dma_free_coherent(&dev->vdev->dev, npages * sizeof(dma_addr_t),
+ pages_flat, mr->dma_pages);
+err_alloc:
+ kfree(mr);
+ kfree(rsp);
+ kfree(cmd);
+err_umem:
+ ib_umem_release(umem);
+ return ERR_PTR(rc);
+}
+
static const struct ib_device_ops vrdma_dev_ops = {
.owner = THIS_MODULE,
.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -2554,7 +2882,10 @@ static const struct ib_device_ops vrdma_dev_ops = {
.modify_qp = vrdma_modify_qp,
.poll_cq = vrdma_poll_cq,
.post_recv = vrdma_post_recv,
- .post_send = vrdma_post_send,
+ .post_send = vrdma_post_send,
+ .query_pkey = vrdma_query_pkey,
+ .query_qp = vrdma_query_qp,
+ .reg_user_mr = vrdma_reg_user_mr,
};
/**
--
2.43.0
Powered by blists - more mailing lists