Message-ID: <20251218091050.55047-9-15927021679@163.com>
Date: Thu, 18 Dec 2025 17:09:48 +0800
From: Xiong Weimin <15927021679@....com>
To: "Michael S . Tsirkin" <mst@...hat.com>,
David Hildenbrand <david@...hat.com>,
Jason Wang <jasowang@...hat.com>,
Stefano Garzarella <sgarzare@...hat.com>,
Thomas Monjalon <thomas@...jalon.net>,
David Marchand <david.marchand@...hat.com>,
Luca Boccassi <bluca@...ian.org>,
Kevin Traynor <ktraynor@...hat.com>,
Christian Ehrhardt <christian.ehrhardt@...onical.com>,
Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
Eugenio Pérez <eperezma@...hat.com>,
Xueming Li <xuemingl@...dia.com>,
Maxime Coquelin <maxime.coquelin@...hat.com>,
Chenbo Xia <chenbox@...dia.com>,
Bruce Richardson <bruce.richardson@...el.com>
Cc: kvm@...r.kernel.org,
virtualization@...ts.linux.dev,
netdev@...r.kernel.org,
xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 08/10] drivers/infiniband/hw/virtio: Implement send/receive verb support
From: xiongweimin <xiongweimin@...inos.cn>
Add core RDMA verb implementations for the virtio RDMA driver:
1. Post Receive Support:
- Full handling of recv_wr chains with SGE conversion
- SMI QP rejection and user-space QP fast path
- Atomic buffer allocation with GFP_ATOMIC
2. Post Send Support:
- Send, RDMA read/write, atomic, and MR registration opcodes
- Inline data handling via contiguous copy
- Detailed error handling with bad_wr tracking
- Memory registration support integration
Key features:
- Support for the ten IB_WR opcodes accepted on kernel QPs
- Specialized handling for UD/RC/GSI QP types
- Kernel-space WR processing with virtio command conversion
- Virtqueue batching optimizations
- Strict concurrency control with QP locks
Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
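Note for reviewers: a rough sketch of how a kernel ULP would exercise these
verbs through the standard ib_post_recv()/ib_post_send() entry points. This
is illustrative only and not part of the patch; vrdma_demo_post() is a
hypothetical caller, and the QP, keys, lengths and DMA addresses are assumed
to come from the caller's own setup.

#include <rdma/ib_verbs.h>

static int vrdma_demo_post(struct ib_qp *qp, u64 recv_dma, u64 send_dma,
                           u32 lkey, u64 remote_addr, u32 rkey, u32 len)
{
        struct ib_sge rsge = { .addr = recv_dma, .length = len, .lkey = lkey };
        struct ib_recv_wr rwr = { .wr_id = 1, .sg_list = &rsge, .num_sge = 1 };
        struct ib_sge ssge = { .addr = send_dma, .length = len, .lkey = lkey };
        struct ib_rdma_wr swr = {
                .wr = {
                        .wr_id      = 2,
                        .sg_list    = &ssge,
                        .num_sge    = 1,
                        .opcode     = IB_WR_RDMA_WRITE,
                        .send_flags = IB_SEND_SIGNALED,
                },
                .remote_addr = remote_addr,
                .rkey        = rkey,
        };
        const struct ib_recv_wr *bad_rwr;
        const struct ib_send_wr *bad_swr;
        int rc;

        /* Lands in vrdma_post_recv(): one command per WR on the RQ virtqueue */
        rc = ib_post_recv(qp, &rwr, &bad_rwr);
        if (rc)
                return rc;

        /* Lands in vrdma_post_send(): RDMA WRITE with wr.rdma.remote_addr/rkey */
        return ib_post_send(qp, &swr.wr, &bad_swr);
}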
.../drivers/infiniband/hw/virtio/vrdma_abi.h | 99 ++++--
.../drivers/infiniband/hw/virtio/vrdma_ib.c | 310 +++++++++++++++++-
2 files changed, 372 insertions(+), 37 deletions(-)
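Both commands are a fixed header followed by a variable-length area (the SGE
array, or the raw inline payload for VIRTIO_IB_SEND_INLINE sends), so each WR
becomes one contiguous buffer on the virtqueue. A minimal sizing sketch,
assuming the vrdma_abi.h definitions below with the trailing flexible
sge_list[] member; vrdma_alloc_recv_cmd() is a hypothetical helper and not
part of this patch:

#include <linux/overflow.h>
#include <linux/slab.h>

/* Hypothetical helper: allocate a post-recv command with room for n SGEs */
static struct vrdma_cmd_post_recv *vrdma_alloc_recv_cmd(u32 num_sge, gfp_t gfp)
{
        struct vrdma_cmd_post_recv *cmd;

        /* Fixed header plus num_sge trailing vrdma_sge entries */
        cmd = kzalloc(struct_size(cmd, sge_list, num_sge), gfp);
        if (cmd)
                cmd->num_sge = num_sge;
        return cmd;
}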
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
index 7cdc4e488..0a9404057 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
@@ -222,6 +222,19 @@ struct vrdma_av {
__u8 reserved[6]; /* Reserved for future use / alignment padding */
};
+struct vrdma_sge {
+ __u64 addr;
+ __u32 length;
+ __u32 lkey;
+};
+
+struct vrdma_cmd_post_recv {
+ __u32 qpn;
+ __u32 num_sge;
+ __u64 wr_id;
+ /* Scatter/gather list, carried inline after the header */
+ struct vrdma_sge sge_list[];
+};
+
/**
* struct vrdma_cmd_post_send - User-space command to post a Send WQE
*
@@ -232,48 +245,62 @@ struct vrdma_av {
* All fields use fixed-size types for ABI stability across architectures.
*/
struct vrdma_cmd_post_send {
- __u32 num_sge; /* Number of scatter-gather elements in this WQE */
-
- __u32 send_flags; /* IBV_SEND_xxx flags (e.g., signaled, inline, fence) */
- __u32 opcode; /* Operation code: RDMA_WRITE, SEND, ATOMIC, etc. */
- __u64 wr_id; /* Work Request ID returned in CQE */
-
union {
- __be32 imm_data; /* Immediate data for RC/UC QPs */
- __u32 invalidate_rkey; /* rkey to invalidate (on SEND_WITH_INV) */
- } ex;
-
- union wr_data {
+ /* Number of entries in sg_list */
+ __le32 num_sge;
+ /* Length of inline data */
+ __le16 inline_len;
+ };
+#define VIRTIO_IB_SEND_FENCE (1 << 0)
+#define VIRTIO_IB_SEND_SIGNALED (1 << 1)
+#define VIRTIO_IB_SEND_SOLICITED (1 << 2)
+#define VIRTIO_IB_SEND_INLINE (1 << 3)
+ /* Flags of the WR properties (VIRTIO_IB_SEND_*) */
+ __u8 send_flags;
+ /* WR opcode, enum virtio_ib_wr_opcode */
+ __u32 opcode;
+ /* User-defined WR ID, returned in the completion */
+ __le64 wr_id;
+ union {
+ /* Immediate data (in network byte order) to send */
+ __le32 imm_data;
+ /* rkey to invalidate (SEND_WITH_INV / LOCAL_INV) */
+ __u32 invalidate_rkey;
+ } ex;
+ union {
struct {
- __u64 remote_addr; /* Target virtual address for RDMA op */
- __u32 rkey; /* Remote key for memory access */
+ /* Start address of remote memory buffer */
+ __le64 remote_addr;
+ /* Key of the remote MR */
+ __le32 rkey;
} rdma;
-
- struct {
- __u64 remote_addr; /* Address of atomic variable */
- __u64 compare_add; /* Value to compare */
- __u64 swap; /* Value to swap (or add) */
- __u32 rkey; /* Remote memory key */
- } atomic;
-
+ struct {
+ __u64 remote_addr;
+ __u64 compare_add;
+ __u64 swap;
+ __u32 rkey;
+ } atomic;
struct {
- __u32 remote_qpn; /* Destination QP number */
- __u32 remote_qkey; /* Q_Key for UD packet validation */
- struct vrdma_av av; /* Address vector (L2/L3 info) */
+ /* Index of the destination QP */
+ __le32 remote_qpn;
+ /* Q_Key of the destination QP */
+ __le32 remote_qkey;
+ struct vrdma_av av;
} ud;
-
- struct {
- __u32 mrn; /* Memory Region Number (MR handle) */
- __u32 key; /* Staging rkey for MR registration */
- __u32 access; /* Access flags (IB_ACCESS_xxx) */
- } reg;
+ struct {
+ __u32 mrn;
+ __u32 key;
+ __u32 access;
+ } reg;
+ /* Reserved for future */
+ __le64 reserved[4];
} wr;
-};
-
-struct vrdma_sge {
- __u64 addr;
- __u32 length;
- __u32 lkey;
+ /* Reserved for future */
+ __le32 reserved2[3];
+ /*
+ * Scatter/gather list. When VIRTIO_IB_SEND_INLINE is set, this area
+ * instead carries inline_len bytes of inline payload.
+ */
+ struct vrdma_sge sg_list[];
};
#endif
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index 705d18b55..f9b129774 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -2215,6 +2215,312 @@ static int vrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
return i; /* Return number of polled completions */
}
+/**
+ * vrdma_post_recv - Post a list of receive work requests
+ * @ibqp: Queue pair
+ * @wr: List of receive work requests
+ * @bad_wr: Out parameter pointing to first failed WR on error
+ *
+ * Submits receive buffers to the backend via virtqueue.
+ * Each WR is serialized into a command structure and passed to the host.
+ *
+ * Context: Any context. Takes the RQ lock with IRQs disabled and uses
+ *          GFP_ATOMIC allocations, so it never sleeps.
+ * Return:
+ * * 0 on success
+ * * negative errno on failure (e.g., -ENOMEM, -EOPNOTSUPP)
+ */
+static int vrdma_post_recv(struct ib_qp *ibqp,
+ const struct ib_recv_wr *wr,
+ const struct ib_recv_wr **bad_wr)
+{
+ struct vrdma_qp *vqp = to_vqp(ibqp);
+ struct vrdma_cmd_post_recv *cmd;
+ unsigned int sgl_size;
+ int rc = 0;
+ struct scatterlist hdr;
+ struct scatterlist *sgs[1];
+ unsigned long flags;
+
+ /* SMI QPs are not supported */
+ if (ibqp->qp_type == IB_QPT_SMI) {
+ *bad_wr = wr;
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * For user-space QPs, we assume recv posting is handled differently
+ * (e.g., through mmap'ed rings). Skip kernel-side posting.
+ */
+ if (vqp->type == VIRTIO_RDMA_TYPE_USER)
+ goto kick_and_return;
+
+ /* Serialize access to RQ */
+ spin_lock_irqsave(&vqp->rq->lock, flags);
+
+ while (wr) {
+ /* Validate required fields */
+ if (unlikely(!wr->num_sge)) {
+ rc = -EINVAL;
+ goto out_bad_wr;
+ }
+
+ /* Calculate size of SGE array to copy */
+ sgl_size = sizeof(struct vrdma_sge) * wr->num_sge;
+ cmd = kzalloc(sizeof(*cmd) + sgl_size, GFP_ATOMIC);
+ if (!cmd) {
+ rc = -ENOMEM;
+ goto out_bad_wr;
+ }
+
+ /* Fill command */
+ cmd->qpn = vqp->qp_handle;
+ cmd->wr_id = (ibqp->qp_type == IB_QPT_GSI) ? 0 : wr->wr_id;
+ cmd->num_sge = wr->num_sge;
+
+ /* Copy SGEs from the WR into the trailing sge_list[] area */
+ memcpy(cmd->sge_list, wr->sg_list, sgl_size);
+
+ /* Prepare scatterlist for virtqueue */
+ sg_init_one(&hdr, cmd, sizeof(*cmd) + sgl_size);
+ sgs[0] = &hdr;
+
+ /* Add to virtqueue */
+ rc = virtqueue_add_sgs(vqp->rq->vq, sgs, 1, 0, cmd, GFP_ATOMIC);
+ if (rc) {
+ kfree(cmd);
+ goto out_bad_wr;
+ }
+
+ wr = wr->next;
+ }
+
+ spin_unlock_irqrestore(&vqp->rq->lock, flags);
+
+kick_and_return:
+ virtqueue_kick(vqp->rq->vq);
+ return 0;
+
+out_bad_wr:
+ *bad_wr = wr;
+ spin_unlock_irqrestore(&vqp->rq->lock, flags);
+ virtqueue_kick(vqp->rq->vq); /* Still kick so backend knows partial update */
+ return rc;
+}
+
+/**
+ * vrdma_copy_inline_data_to_wqe - Copy inline data from SGEs into WQE buffer
+ * @wqe: Pointer to the vrdma_cmd_post_send command structure
+ * @ibwr: IB send work request containing SGEs with inline data
+ *
+ * Copies all data referenced by SGEs into a contiguous area immediately
+ * following the WQE header, typically used when IB_SEND_INLINE is set.
+ *
+ * Assumes:
+ * - Memory at sge->addr is accessible (kernel virtual address)
+ * - Total size <= device max_inline_data
+ * - wqe has enough tailroom for all data
+ *
+ * Context: Called with the SQ lock held (atomic context); does not sleep.
+ */
+static void vrdma_copy_inline_data_to_wqe(struct vrdma_cmd_post_send *wqe,
+ const struct ib_send_wr *ibwr)
+{
+ const struct ib_sge *sge;
+ char *dst = (char *)wqe + sizeof(*wqe); /* Start after header */
+ int i;
+
+ for (i = 0; i < ibwr->num_sge; i++) {
+ sge = &ibwr->sg_list[i];
+
+ /* Skip zero-length segments */
+ if (sge->length == 0)
+ continue;
+
+ /*
+ * Only kernel QPs reach this path, so sge->addr is treated as a
+ * directly dereferenceable kernel virtual address. The caller must
+ * own the buffer and keep it mapped for the duration of the copy.
+ */
+
+ memcpy(dst, (void *)(uintptr_t)sge->addr, sge->length);
+ dst += sge->length;
+ }
+}
+
+/**
+ * vrdma_post_send - Post a list of send work requests to the SQ
+ * @ibqp: Queue pair
+ * @wr: List of work requests
+ * @bad_wr: Out parameter pointing to failing WR on error
+ *
+ * Converts each ib_send_wr into a vrdma_cmd_post_send and submits it
+ * via the send virtqueue. Supports both kernel and user QPs.
+ *
+ * Context: Takes the SQ spinlock and allocates with GFP_ATOMIC; never sleeps.
+ * Return:
+ * * 0 on success
+ * * negative errno on failure (e.g., -EINVAL, -ENOMEM)
+ * * @bad_wr set to first failed WR
+ */
+static int vrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+ const struct ib_send_wr **bad_wr)
+{
+ struct vrdma_qp *vqp = to_vqp(ibqp);
+ struct vrdma_cmd_post_send *cmd;
+ unsigned int sgl_size;
+ int rc = 0;
+ struct scatterlist hdr;
+ struct scatterlist *sgs[1];
+
+ /* Fast path for user-space QP: defer to userspace */
+ if (vqp->type == VIRTIO_RDMA_TYPE_USER) {
+ virtqueue_kick(vqp->sq->vq);
+ return 0;
+ }
+
+ spin_lock(&vqp->sq->lock);
+
+ while (wr) {
+ *bad_wr = wr; /* In case of error */
+
+ /* Validate opcode support in kernel QP */
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_SEND_WITH_INV:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ case IB_WR_RDMA_READ:
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_LOCAL_INV:
+ case IB_WR_REG_MR:
+ break;
+ default:
+ pr_warn("vRDMA: unsupported opcode %d for kernel QP\n",
+ wr->opcode);
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ /*
+ * Size the trailing payload: the total inline byte count for
+ * INLINE sends, otherwise the SGE array.
+ */
+ if (wr->send_flags & IB_SEND_INLINE) {
+ int i;
+
+ sgl_size = 0;
+ for (i = 0; i < wr->num_sge; i++)
+ sgl_size += wr->sg_list[i].length;
+ /* TODO: Check against device max_inline_data limit */
+ } else {
+ sgl_size = wr->num_sge * sizeof(struct vrdma_sge);
+ }
+
+ cmd = kzalloc(sizeof(*cmd) + sgl_size, GFP_ATOMIC);
+ if (!cmd) {
+ rc = -ENOMEM;
+ goto out_unlock;
+ }
+
+ /* Fill common fields */
+ cmd->wr_id = wr->wr_id;
+ cmd->send_flags = wr->send_flags;
+ cmd->opcode = wr->opcode;
+
+ /* Copy the inline payload or the SGE list after the header */
+ if (wr->send_flags & IB_SEND_INLINE) {
+ cmd->inline_len = sgl_size;
+ vrdma_copy_inline_data_to_wqe(cmd, wr);
+ } else {
+ cmd->num_sge = wr->num_sge;
+ memcpy(cmd->sg_list, wr->sg_list, sgl_size);
+ }
+
+ /* Handle immediate data (SEND_WITH_IMM, WRITE_WITH_IMM) */
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ cmd->ex.imm_data = wr->ex.imm_data;
+
+ /* Handle invalidate key (SEND_WITH_INV, LOCAL_INV) */
+ if (wr->opcode == IB_WR_SEND_WITH_INV ||
+ wr->opcode == IB_WR_LOCAL_INV)
+ cmd->ex.invalidate_rkey = wr->ex.invalidate_rkey;
+
+ /* RDMA and Atomic specific fields */
+ switch (ibqp->qp_type) {
+ case IB_QPT_RC:
+ switch (wr->opcode) {
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ cmd->wr.rdma.remote_addr = rdma_wr(wr)->remote_addr;
+ cmd->wr.rdma.rkey = rdma_wr(wr)->rkey;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ cmd->wr.atomic.remote_addr = atomic_wr(wr)->remote_addr;
+ cmd->wr.atomic.rkey = atomic_wr(wr)->rkey;
+ cmd->wr.atomic.compare_add = atomic_wr(wr)->compare_add;
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP)
+ cmd->wr.atomic.swap = atomic_wr(wr)->swap;
+ break;
+
+ case IB_WR_REG_MR: {
+ const struct ib_reg_wr *reg = reg_wr(wr);
+ struct vrdma_mr *vmr = to_vmr(reg->mr);
+
+ cmd->wr.reg.mrn = vmr->mr_handle;
+ cmd->wr.reg.key = reg->key;
+ cmd->wr.reg.access = reg->access;
+ break;
+ }
+ default:
+ break;
+ }
+ break;
+
+ case IB_QPT_UD:
+ case IB_QPT_GSI: {
+ if (!ud_wr(wr)->ah) {
+ pr_warn("vRDMA: invalid address handle in UD WR\n");
+ kfree(cmd);
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+ cmd->wr.ud.remote_qpn = ud_wr(wr)->remote_qpn;
+ cmd->wr.ud.remote_qkey = ud_wr(wr)->remote_qkey;
+ cmd->wr.ud.av = to_vah(ud_wr(wr)->ah)->av;
+ break;
+ }
+
+ default:
+ pr_err("vRDMA: unsupported QP type %d\n", ibqp->qp_type);
+ kfree(cmd);
+ rc = -EINVAL;
+ goto out_unlock;
+ }
+
+ /* Prepare scatterlist for virtqueue */
+ sg_init_one(&hdr, cmd, sizeof(*cmd) + sgl_size);
+ sgs[0] = &hdr;
+
+ rc = virtqueue_add_sgs(vqp->sq->vq, sgs, 1, 0, cmd, GFP_ATOMIC);
+ if (rc) {
+ dev_err(&vqp->sq->vq->vdev->dev,
+ "vRDMA: failed to add send WR to vq: %d\n", rc);
+ kfree(cmd);
+ goto out_unlock;
+ }
+
+ /* Advance to next WR */
+ wr = wr->next;
+ }
+
+out_unlock:
+ spin_unlock(&vqp->sq->lock);
+
+ /* Kick even on partial failure so already-queued WRs are processed */
+ virtqueue_kick(vqp->sq->vq);
+ return rc;
+}
+
static const struct ib_device_ops vrdma_dev_ops = {
.owner = THIS_MODULE,
.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -2246,7 +2552,9 @@ static const struct ib_device_ops vrdma_dev_ops = {
.mmap_free = vrdma_mmap_free,
.modify_port = vrdma_modify_port,
.modify_qp = vrdma_modify_qp,
- .poll_cq = vrdma_poll_cq,
+ .poll_cq = vrdma_poll_cq,
+ .post_recv = vrdma_post_recv,
+ .post_send = vrdma_post_send,
};
/**
--
2.43.0