Message-ID: <20251218091050.55047-9-15927021679@163.com>
Date: Thu, 18 Dec 2025 17:09:48 +0800
From: Xiong Weimin <15927021679@....com>
To: "Michael S . Tsirkin" <mst@...hat.com>,
	David Hildenbrand <david@...hat.com>,
	Jason Wang <jasowang@...hat.com>,
	Stefano Garzarella <sgarzare@...hat.com>,
	Thomas Monjalon <thomas@...jalon.net>,
	David Marchand <david.marchand@...hat.com>,
	Luca Boccassi <bluca@...ian.org>,
	Kevin Traynor <ktraynor@...hat.com>,
	Christian Ehrhardt <christian.ehrhardt@...onical.com>,
	Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
	Eugenio Pérez <eperezma@...hat.com>,
	Xueming Li <xuemingl@...dia.com>,
	Maxime Coquelin <maxime.coquelin@...hat.com>,
	Chenbo Xia <chenbox@...dia.com>,
	Bruce Richardson <bruce.richardson@...el.com>
Cc: kvm@...r.kernel.org,
	virtualization@...ts.linux.dev,
	netdev@...r.kernel.org,
	xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 08/10] drivers/infiniband/hw/virtio: Implement send/receive verb support

From: Xiong Weimin <xiongweimin@...inos.cn>

Add the core RDMA verb implementations for the virtio RDMA driver:

1. Post Receive Support:
   - Handling of chained recv_wr lists with SGE conversion
   - SMI QP rejection and a fast path for user-space QPs
   - GFP_ATOMIC command allocation so posting is safe in atomic context

2. Post Send Support:
   - Opcode coverage for SEND, RDMA, atomic, LOCAL_INV and REG_MR
     work requests
   - Inline data handling via a contiguous copy into the command buffer
   - Error handling with bad_wr reporting
   - Memory registration (IB_WR_REG_MR) integration

Key features:
- Support for the commonly used IB_WR opcodes (send, RDMA read/write,
  atomics, invalidate, MR registration)
- Specialized handling for RC, UD and GSI QP types
- Kernel-space WR processing with conversion to virtio commands
- One virtqueue kick per post call to batch chained WRs
- Per-SQ/RQ spinlocks for concurrency control

Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
 .../drivers/infiniband/hw/virtio/vrdma_abi.h  |  99 ++++--
 .../drivers/infiniband/hw/virtio/vrdma_ib.c   | 310 +++++++++++++++++-
 2 files changed, 372 insertions(+), 37 deletions(-)
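
Note for reviewers: below is a minimal, hypothetical sketch (not part of
the patch) of how a kernel ULP would exercise these verbs through the
standard ib_post_recv()/ib_post_send() entry points, which dispatch to
vrdma_post_recv()/vrdma_post_send() on a vRDMA QP. The identifiers
dma_addr, len, lkey, raddr and rkey are placeholders for values obtained
from a registered MR and from the remote peer.

#include <rdma/ib_verbs.h>

/* Post one receive buffer, then issue a signaled RDMA WRITE. */
static int example_post(struct ib_qp *qp, u64 dma_addr, u32 len, u32 lkey,
			u64 raddr, u32 rkey)
{
	/* Single SGE describing a registered, DMA-mapped buffer */
	struct ib_sge sge = {
		.addr   = dma_addr,
		.length = len,
		.lkey   = lkey,
	};
	struct ib_recv_wr rwr = {
		.wr_id   = 1,
		.sg_list = &sge,
		.num_sge = 1,
	};
	struct ib_rdma_wr wwr = {
		.wr = {
			.wr_id      = 2,
			.sg_list    = &sge,
			.num_sge    = 1,
			.opcode     = IB_WR_RDMA_WRITE,
			.send_flags = IB_SEND_SIGNALED,
		},
		.remote_addr = raddr,
		.rkey        = rkey,
	};
	const struct ib_recv_wr *bad_rwr;
	const struct ib_send_wr *bad_swr;
	int ret;

	/* Ends up in vrdma_post_recv() */
	ret = ib_post_recv(qp, &rwr, &bad_rwr);
	if (ret)
		return ret;

	/* Ends up in vrdma_post_send() */
	return ib_post_send(qp, &wwr.wr, &bad_swr);
}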

diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
index 7cdc4e488..0a9404057 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_abi.h
@@ -222,6 +222,19 @@ struct vrdma_av {
 	__u8 reserved[6];		/* Reserved for future use / alignment padding */
 };
 
+struct vrdma_sge {
+	__u64 addr;
+	__u32 length;
+	__u32 lkey;
+};
+
+struct vrdma_cmd_post_recv {
+	__u32 qpn;
+	__u32 num_sge;
+	__u64 wr_id;
+	struct vrdma_sge sge_list[];	/* SGEs follow the header inline */
+};
+
 /**
  * struct vrdma_cmd_post_send - User-space command to post a Send WQE
  *
@@ -232,48 +245,62 @@ struct vrdma_av {
  * All fields use fixed-size types for ABI stability across architectures.
  */
 struct vrdma_cmd_post_send {
-	__u32 num_sge;		/* Number of scatter-gather elements in this WQE */
-
-	__u32 send_flags;	/* IBV_SEND_xxx flags (e.g., signaled, inline, fence) */
-	__u32 opcode;		/* Operation code: RDMA_WRITE, SEND, ATOMIC, etc. */
-	__u64 wr_id;		/* Work Request ID returned in CQE */
-
 	union {
-		__be32	imm_data;		/* Immediate data for RC/UC QPs */
-		__u32	invalidate_rkey;	/* rkey to invalidate (on SEND_WITH_INV) */
-	} ex;
-
-	union wr_data {
+		/* Length of sg_list */
+		__le32 num_sge;
+		/* Length of inline data */
+		__le16 inline_len;
+	};
+#define VIRTIO_IB_SEND_FENCE        (1 << 0)
+#define VIRTIO_IB_SEND_SIGNALED     (1 << 1)
+#define VIRTIO_IB_SEND_SOLICITED    (1 << 2)
+#define VIRTIO_IB_SEND_INLINE       (1 << 3)
+	/* Flags of the WR properties, VIRTIO_IB_SEND_* */
+	__u8 send_flags;
+	/* WR opcode, enum virtio_ib_wr_opcode */
+	__u32 opcode;
+	/* User defined WR ID */
+	__le64 wr_id;
+	union {
+		/* Immediate data (in network byte order) to send */
+		__be32 imm_data;
+		/* Key of the remote MR to invalidate */
+		__u32 invalidate_rkey;
+	} ex;
+	union {
 		struct {
-			__u64 remote_addr;	/* Target virtual address for RDMA op */
-			__u32 rkey;		/* Remote key for memory access */
+			/* Start address of remote memory buffer */
+			__le64 remote_addr;
+			/* Key of the remote MR */
+			__le32 rkey;
 		} rdma;
-
-		struct {
-			__u64 remote_addr;	/* Address of atomic variable */
-			__u64 compare_add;	/* Value to compare */
-			__u64 swap;		/* Value to swap (or add) */
-			__u32 rkey;		/* Remote memory key */
-		} atomic;
-
+		struct {
+			__u64 remote_addr;
+			__u64 compare_add;
+			__u64 swap;
+			__u32 rkey;
+		} atomic;
 		struct {
-			__u32 remote_qpn;	/* Destination QP number */
-			__u32 remote_qkey;	/* Q_Key for UD packet validation */
-			struct vrdma_av av;	/* Address vector (L2/L3 info) */
+			/* Index of the destination QP */
+			__le32 remote_qpn;
+			/* Q_Key of the destination QP */
+			__le32 remote_qkey;
+			struct vrdma_av av;
 		} ud;
-
-		struct {
-			__u32 mrn;		/* Memory Region Number (MR handle) */
-			__u32 key;		/* Staging rkey for MR registration */
-			__u32 access;		/* Access flags (IB_ACCESS_xxx) */
-		} reg;
+		struct {
+			__u32 mrn;
+			__u32 key;
+			__u32 access;
+		} reg;
+		/* Reserved for future */
+		__le64 reserved[4];
 	} wr;
-};
-
-struct vrdma_sge {
-    __u64 addr;
-    __u32 length;
-    __u32 lkey;
+	/* Reserved for future */
+	__le32 reserved2[3];
+	/* Scatter/gather list; carries the inline payload
+	 * when VIRTIO_IB_SEND_INLINE is set.
+	 */
+	struct vrdma_sge sg_list[];
 };
 
 #endif
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index 705d18b55..f9b129774 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -2215,6 +2215,312 @@ static int vrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
 	return i; /* Return number of polled completions */
 }
 
+/**
+ * vrdma_post_recv - Post a list of receive work requests
+ * @ibqp:	Queue pair
+ * @wr:		List of receive work requests
+ * @bad_wr:	Out parameter pointing to first failed WR on error
+ *
+ * Submits receive buffers to the backend via virtqueue.
+ * Each WR is serialized into a command structure and passed to the host.
+ *
+ * Context: Any context; uses spin_lock_irqsave() and GFP_ATOMIC allocations.
+ * Return:
+ * * 0 on success
+ * * negative errno on failure (e.g., -ENOMEM, -EOPNOTSUPP)
+ */
+static int vrdma_post_recv(struct ib_qp *ibqp,
+				 const struct ib_recv_wr *wr,
+				 const struct ib_recv_wr **bad_wr)
+{
+	struct vrdma_qp *vqp = to_vqp(ibqp);
+	struct vrdma_cmd_post_recv *cmd;
+	unsigned int sgl_size;
+	int rc = 0;
+	struct scatterlist hdr;
+	struct scatterlist *sgs[1];
+	unsigned long flags;
+
+	/* SMI QPs are not supported */
+	if (ibqp->qp_type == IB_QPT_SMI) {
+		*bad_wr = wr;
+		return -EOPNOTSUPP;
+	}
+
+	/*
+	 * For user-space QPs, we assume recv posting is handled differently
+	 * (e.g., through mmap'ed rings). Skip kernel-side posting.
+	 */
+	if (vqp->type == VIRTIO_RDMA_TYPE_USER)
+		goto kick_and_return;
+
+	/* Serialize access to RQ */
+	spin_lock_irqsave(&vqp->rq->lock, flags);
+
+	while (wr) {
+		/* Validate required fields */
+		if (unlikely(!wr->num_sge)) {
+			rc = -EINVAL;
+			goto out_bad_wr;
+		}
+
+		/*
+		 * Calculate the size of the SGE array and allocate the
+		 * command with tailroom for it; the header and SGEs are
+		 * handed to the device as one contiguous buffer, so no
+		 * separate sge_list allocation is needed.
+		 */
+		sgl_size = sizeof(struct vrdma_sge) * wr->num_sge;
+		cmd = kzalloc(sizeof(*cmd) + sgl_size, GFP_ATOMIC);
+		if (!cmd) {
+			rc = -ENOMEM;
+			goto out_bad_wr;
+		}
+
+
+		/* Fill command */
+		cmd->qpn = vqp->qp_handle;
+		cmd->wr_id = (ibqp->qp_type == IB_QPT_GSI) ? 0 : wr->wr_id;
+		cmd->num_sge = wr->num_sge;
+
+		/* ib_sge and vrdma_sge share a layout; copy caller's SGEs */
+		memcpy(cmd->sge_list, wr->sg_list, sgl_size);
+
+		/* Prepare scatterlist for virtqueue */
+		sg_init_one(&hdr, cmd, sizeof(*cmd) + sgl_size);
+		sgs[0] = &hdr;
+
+		/* Add to virtqueue */
+		rc = virtqueue_add_sgs(vqp->rq->vq, sgs, 1, 0, cmd, GFP_ATOMIC);
+		if (rc) {
+			kfree(cmd);
+			goto out_bad_wr;
+		}
+
+		wr = wr->next;
+	}
+
+	spin_unlock_irqrestore(&vqp->rq->lock, flags);
+
+kick_and_return:
+	virtqueue_kick(vqp->rq->vq);
+	return 0;
+
+out_bad_wr:
+	*bad_wr = wr;
+	spin_unlock_irqrestore(&vqp->rq->lock, flags);
+	virtqueue_kick(vqp->rq->vq); /* Still kick so backend knows partial update */
+	return rc;
+}
+
+/**
+ * vrdma_copy_inline_data_to_wqe - Copy inline data from SGEs into WQE buffer
+ * @wqe:	Pointer to the vrdma_cmd_post_send command structure
+ * @ibwr:	IB send work request containing SGEs with inline data
+ *
+ * Copies all data referenced by SGEs into a contiguous area immediately
+ * following the WQE header, typically used when IB_SEND_INLINE is set.
+ *
+ * Assumes:
+ * - Memory at sge->addr is accessible (kernel virtual address)
+ * - Total size <= device max_inline_data
+ * - wqe has enough tailroom for all data
+ *
+ * Context: Called under the SQ lock, in atomic context.
+ */
+static void vrdma_copy_inline_data_to_wqe(struct vrdma_cmd_post_send *wqe,
+				    const struct ib_send_wr *ibwr)
+{
+	const struct ib_sge *sge;
+	char *dst = (char *)wqe + sizeof(*wqe); /* Start after header */
+	int i;
+
+	for (i = 0; i < ibwr->num_sge; i++) {
+		sge = &ibwr->sg_list[i];
+
+		/* Skip zero-length segments */
+		if (sge->length == 0)
+			continue;
+
+		/*
+		 * WARNING: sge->addr is a user-space or kernel virtual address.
+		 * Using (void *)(uintptr_t)sge->addr assumes it's directly dereferenceable.
+		 * This is only valid if:
+		 *   - The QP is KERNEL type AND
+		 *   - The memory was registered and we trust its mapping
+		 */
+
+		memcpy(dst, (void *)(uintptr_t)sge->addr, sge->length);
+		dst += sge->length;
+	}
+}
+
+/**
+ * vrdma_post_send - Post a list of send work requests to the SQ
+ * @ibqp:	Queue pair
+ * @wr:		List of work requests
+ * @bad_wr:	Out parameter pointing to failing WR on error
+ *
+ * Converts each ib_send_wr into a vrdma_cmd_post_send and submits it
+ * via the send virtqueue. Supports both kernel and user QPs.
+ *
+ * Context: Does not sleep; the SQ lock is held while WRs are queued.
+ * Return:
+ * * 0 on success
+ * * negative errno on failure (e.g., -EINVAL, -ENOMEM)
+ * * @bad_wr set to first failed WR
+ */
+static int vrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
+			   const struct ib_send_wr **bad_wr)
+{
+	struct vrdma_qp *vqp = to_vqp(ibqp);
+	struct vrdma_cmd_post_send *cmd;
+	unsigned int sgl_size;
+	int rc = 0;
+	struct scatterlist hdr;
+	struct scatterlist *sgs[1];
+
+	/* Fast path for user-space QP: defer to userspace */
+	if (vqp->type == VIRTIO_RDMA_TYPE_USER) {
+		virtqueue_kick(vqp->sq->vq);
+		return 0;
+	}
+
+	spin_lock(&vqp->sq->lock);
+
+	while (wr) {
+		*bad_wr = wr; /* In case of error */
+
+		/* Validate opcode support in kernel QP */
+		switch (wr->opcode) {
+		case IB_WR_SEND:
+		case IB_WR_SEND_WITH_IMM:
+		case IB_WR_SEND_WITH_INV:
+		case IB_WR_RDMA_WRITE:
+		case IB_WR_RDMA_WRITE_WITH_IMM:
+		case IB_WR_RDMA_READ:
+		case IB_WR_ATOMIC_CMP_AND_SWP:
+		case IB_WR_ATOMIC_FETCH_AND_ADD:
+		case IB_WR_LOCAL_INV:
+		case IB_WR_REG_MR:
+			break;
+		default:
+			pr_warn("vRDMA: unsupported opcode %d for kernel QP\n",
+				wr->opcode);
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* Allocate command buffer including space for SGEs */
+		sgl_size = wr->num_sge * sizeof(struct vrdma_sge);
+		cmd = kzalloc(sizeof(*cmd) + sgl_size, GFP_ATOMIC);
+		if (!cmd) {
+			rc = -ENOMEM;
+			goto out_unlock;
+		}
+
+		/* Fill common fields */
+		cmd->wr_id = wr->wr_id;
+		cmd->num_sge = wr->num_sge;
+		cmd->send_flags = wr->send_flags;
+		cmd->opcode = wr->opcode;
+
+		/* Payload: inline data or scatter/gather list */
+		if (wr->send_flags & IB_SEND_INLINE) {
+			/* TODO: bound length by max_inline_data / tailroom */
+			vrdma_copy_inline_data_to_wqe(cmd, wr);
+		} else {
+			memcpy(cmd->sg_list, wr->sg_list, sgl_size);
+		}
+
+		/* Handle immediate data (SEND_WITH_IMM, WRITE_WITH_IMM) */
+		if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+		    wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+			cmd->ex.imm_data = wr->ex.imm_data;
+
+		/* Handle invalidate key (SEND_WITH_INV, LOCAL_INV) */
+		if (wr->opcode == IB_WR_SEND_WITH_INV ||
+		    wr->opcode == IB_WR_LOCAL_INV)
+			cmd->ex.invalidate_rkey = wr->ex.invalidate_rkey;
+
+		/* RDMA and Atomic specific fields */
+		switch (ibqp->qp_type) {
+		case IB_QPT_RC:
+			switch (wr->opcode) {
+			case IB_WR_RDMA_READ:
+			case IB_WR_RDMA_WRITE:
+			case IB_WR_RDMA_WRITE_WITH_IMM:
+				cmd->wr.rdma.remote_addr = rdma_wr(wr)->remote_addr;
+				cmd->wr.rdma.rkey = rdma_wr(wr)->rkey;
+				break;
+
+			case IB_WR_ATOMIC_CMP_AND_SWP:
+			case IB_WR_ATOMIC_FETCH_AND_ADD:
+				cmd->wr.atomic.remote_addr = atomic_wr(wr)->remote_addr;
+				cmd->wr.atomic.rkey = atomic_wr(wr)->rkey;
+				cmd->wr.atomic.compare_add = atomic_wr(wr)->compare_add;
+				if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP)
+					cmd->wr.atomic.swap = atomic_wr(wr)->swap;
+				break;
+
+			case IB_WR_REG_MR: {
+				const struct ib_reg_wr *reg = reg_wr(wr);
+				struct vrdma_mr *vmr = to_vmr(reg->mr);
+				cmd->wr.reg.mrn = vmr->mr_handle;
+				cmd->wr.reg.key = reg->key;
+				cmd->wr.reg.access = reg->access;
+				break;
+			}
+			default:
+				break;
+			}
+			break;
+
+		case IB_QPT_UD:
+		case IB_QPT_GSI: {
+			if (!ud_wr(wr)->ah) {
+				pr_warn("vRDMA: invalid address handle in UD WR\n");
+				kfree(cmd);
+				rc = -EINVAL;
+				goto out_unlock;
+			}
+			cmd->wr.ud.remote_qpn = ud_wr(wr)->remote_qpn;
+			cmd->wr.ud.remote_qkey = ud_wr(wr)->remote_qkey;
+			cmd->wr.ud.av = to_vah(ud_wr(wr)->ah)->av;
+			break;
+		}
+
+		default:
+			pr_err("vRDMA: unsupported QP type %d\n", ibqp->qp_type);
+			kfree(cmd);
+			rc = -EINVAL;
+			goto out_unlock;
+		}
+
+		/* Prepare scatterlist for virtqueue */
+		sg_init_one(&hdr, cmd, sizeof(*cmd) + sgl_size);
+		sgs[0] = &hdr;
+
+		rc = virtqueue_add_sgs(vqp->sq->vq, sgs, 1, 0, cmd, GFP_ATOMIC);
+		if (rc) {
+			dev_err(&vqp->sq->vq->vdev->dev,
+				"vRDMA: failed to add send WR to vq: %d\n", rc);
+			kfree(cmd);
+			goto out_unlock;
+		}
+
+		/* Advance to next WR */
+		wr = wr->next;
+	}
+
+out_unlock:
+	spin_unlock(&vqp->sq->lock);
+
+	/* Kick even on failure so any WRs already queued are processed */
+	virtqueue_kick(vqp->sq->vq);
+	return rc;
+}
+
 static const struct ib_device_ops vrdma_dev_ops = {
 	.owner = THIS_MODULE,
 	.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
@@ -2246,7 +2552,9 @@ static const struct ib_device_ops vrdma_dev_ops = {
 	.mmap_free = vrdma_mmap_free,
 	.modify_port = vrdma_modify_port,
 	.modify_qp = vrdma_modify_qp,
-	.poll_cq = vrdma_poll_cq,			
+	.poll_cq = vrdma_poll_cq,
+	.post_recv = vrdma_post_recv,
+	.post_send = vrdma_post_send,
 };
 
 /**
-- 
2.43.0
