Message-ID: <20251217080605.38473-9-15927021679@163.com>
Date: Wed, 17 Dec 2025 16:05:18 +0800
From: 15927021679@....com
To: Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
"David S . Miller" <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
Jesper Dangaard Brouer <hawk@...nel.org>,
John Fastabend <john.fastabend@...il.com>,
Stanislav Fomichev <sdf@...ichev.me>
Cc: linux-kernel@...r.kernel.org,
netdev@...r.kernel.org,
xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 07/14] examples/vhost_user_rdma: Implement high-performance requester engine with advanced flow control
From: xiongweimin <xiongweimin@...inos.cn>
Add the core requester engine for RDMA operations:
1. Work Queue Element (WQE) processing state machine
2. Flow control with window-based congestion avoidance
3. MTU-aware packet segmentation
4. Error handling with automatic retry mechanisms
5. Atomic operation support and resource management
Key features:
- PSN-based flow control for reliable connections (RC); see the excerpt below
- UD MTU handling with simulated success for oversize packets
- Work request state management (DONE, ERROR, RETRY)
- Packet construction and transmission pipeline
- Memory buffer (mbuf) accounting for congestion control
- Atomic reference counting for safe resource handling
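For illustration, the RC flow-control window reduces to a PSN comparison
(simplified excerpt from vhost_rdma_requester()):
    if (qp->type == VHOST_RDMA_IB_QPT_RC &&
        psn_compare(qp->req.psn, qp->comp.psn + VHOST_MAX_UNACKED_PSNS) > 0) {
        qp->req.wait_psn = 1;   /* pause until ACKs advance qp->comp.psn */
        goto exit;
    }
The requester stops once more than VHOST_MAX_UNACKED_PSNS (128) packets are
outstanding and is expected to resume when completion processing moves
qp->comp.psn forward.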
Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
Change-Id: Ib0873f3d56ff71ed9f51e47edfa972054145f226
---
examples/vhost_user_rdma/meson.build | 2 +
examples/vhost_user_rdma/vhost_rdma.h | 9 +
examples/vhost_user_rdma/vhost_rdma_crc.c | 163 ++++
examples/vhost_user_rdma/vhost_rdma_opcode.c | 141 +++-
examples/vhost_user_rdma/vhost_rdma_opcode.h | 335 ++++++--
examples/vhost_user_rdma/vhost_rdma_pkt.c | 221 +++++
examples/vhost_user_rdma/vhost_rdma_pkt.h | 31 +-
examples/vhost_user_rdma/vhost_rdma_queue.c | 826 ++++++++++++++++++-
examples/vhost_user_rdma/vhost_rdma_queue.h | 221 ++++-
9 files changed, 1855 insertions(+), 94 deletions(-)
create mode 100644 examples/vhost_user_rdma/vhost_rdma_crc.c
create mode 100644 examples/vhost_user_rdma/vhost_rdma_pkt.c
diff --git a/examples/vhost_user_rdma/meson.build b/examples/vhost_user_rdma/meson.build
index a032a27767..2a0a6ffc15 100644
--- a/examples/vhost_user_rdma/meson.build
+++ b/examples/vhost_user_rdma/meson.build
@@ -43,5 +43,7 @@ sources = files(
'vhost_rdma_ib.c',
'vhost_rdma_queue.c',
'vhost_rdma_opcode.c',
+ 'vhost_rdma_pkt.c',
+ 'vhost_rdma_crc.c',
)
diff --git a/examples/vhost_user_rdma/vhost_rdma.h b/examples/vhost_user_rdma/vhost_rdma.h
index 980bb74beb..bf772283b8 100644
--- a/examples/vhost_user_rdma/vhost_rdma.h
+++ b/examples/vhost_user_rdma/vhost_rdma.h
@@ -72,6 +72,8 @@ extern "C" {
#define VHOST_NET_RXQ 0
#define VHOST_NET_TXQ 1
+#define ROCE_V2_UDP_DPORT 4791
+
/* VIRTIO_F_EVENT_IDX is NOT supported now */
#define VHOST_RDMA_FEATURE ((1ULL << VIRTIO_F_VERSION_1) |\
(1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
@@ -457,6 +459,13 @@ static inline enum vhost_rdma_network_type rdma_gid_attr_network_type(const stru
return VHOST_RDMA_NETWORK_IPV6;
}
+static __rte_always_inline void
+vhost_rdma_counter_inc(struct vhost_rdma_device *dev,
+ enum vhost_rdma_counters index)
+{
+ rte_atomic64_inc(&dev->stats_counters[index]);
+}
+
int vhost_rdma_construct(struct vhost_rdma_device *dev, const char *path, int idx);
void vhost_rdma_net_construct(struct vhost_user_queue *queues, int idx);
void vs_vhost_rdma_net_setup(int vid);
diff --git a/examples/vhost_user_rdma/vhost_rdma_crc.c b/examples/vhost_user_rdma/vhost_rdma_crc.c
new file mode 100644
index 0000000000..7802bc61e1
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_crc.c
@@ -0,0 +1,163 @@
+/*
+ * Vhost-user RDMA device: calculating the CRC of data packets
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_ip.h>
+#include <rte_udp.h>
+
+#include "vhost_rdma_opcode.h"
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma_queue.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_pkt.h"
+
+const uint32_t crc_table[256] = {
+ 0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
+ 0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
+ 0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
+ 0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
+ 0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
+ 0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
+ 0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
+ 0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
+ 0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
+ 0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
+ 0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
+ 0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
+ 0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
+ 0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
+ 0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
+ 0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
+ 0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
+ 0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
+ 0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
+ 0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
+ 0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
+ 0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
+ 0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
+ 0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
+ 0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
+ 0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
+ 0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
+ 0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
+ 0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
+ 0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
+ 0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
+ 0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
+ 0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
+ 0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
+ 0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
+ 0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
+ 0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
+ 0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
+ 0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
+ 0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
+ 0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
+ 0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
+ 0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
+ 0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
+ 0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
+ 0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
+ 0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
+ 0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
+ 0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
+ 0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
+ 0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
+ 0x2d02ef8dL
+};
+
+#define DO1(buf) crc = crc_table[((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8);
+#define DO2(buf) DO1(buf); DO1(buf);
+#define DO4(buf) DO2(buf); DO2(buf);
+#define DO8(buf) DO4(buf); DO4(buf);
+
+#define CSUM_MANGLED_0 0xffff
+
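+/* Table-driven CRC-32 (reflected polynomial 0xEDB88320), eight bytes per
+ * loop iteration; the caller supplies the seed and applies the final inversion.
+ */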
+uint32_t
+crc32(uint32_t crc, void* buf, uint32_t len)
+{
+ char* bufc = buf;
+ while (len >= 8)
+ {
+ DO8(bufc);
+ len -= 8;
+ }
+ if (len) do {
+ DO1(bufc);
+ } while (--len);
+ return crc;
+}
+
+uint32_t
+vhost_rdma_icrc_hdr(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *mbuf)
+{
+ unsigned int bth_offset = 0;
+ struct rte_ipv4_hdr *ip4h = NULL;
+ struct rte_ipv6_hdr *ip6h = NULL;
+ struct rte_udp_hdr *udph;
+ struct vhost_bth *bth;
+ int crc;
+ int length;
+ int hdr_size = sizeof(struct rte_udp_hdr) +
+ (mbuf->l3_type == VHOST_NETWORK_TYPE_IPV4 ?
+ sizeof(struct rte_ipv4_hdr) : sizeof(struct rte_ipv6_hdr));
+ /* pseudo header buffer size is calculated using the ipv6 header size
+ * since it is bigger than ipv4
+ */
+ uint8_t pshdr[sizeof(struct rte_udp_hdr) +
+ sizeof(struct rte_ipv6_hdr) +
+ VHOST_BTH_BYTES];
+
+ /* This seed is the result of computing a CRC with a seed of
+ * 0xfffffff and 8 bytes of 0xff representing a masked LRH.
+ */
+ crc = 0xdebb20e3;
+
+ if (mbuf->l3_type == VHOST_NETWORK_TYPE_IPV4) { /* IPv4 */
+ rte_memcpy(pshdr, ip_hdr(pkt), hdr_size);
+ ip4h = (struct rte_ipv4_hdr *)pshdr;
+ udph = (struct rte_udp_hdr *)(ip4h + 1);
+
+ ip4h->time_to_live = 0xff;
+ ip4h->hdr_checksum = CSUM_MANGLED_0;
+ ip4h->type_of_service = 0xff;
+ } else { /* IPv6 */
+ rte_memcpy(pshdr, ipv6_hdr(pkt), hdr_size);
+ ip6h = (struct rte_ipv6_hdr *)pshdr;
+ udph = (struct rte_udp_hdr *)(ip6h + 1);
+
+ // memset(ip6h->flow_lbl, 0xff, sizeof(ip6h->flow_lbl));
+ // ip6h->priority = 0xf;
+ ip6h->vtc_flow = rte_cpu_to_be_32(RTE_IPV6_HDR_FL_MASK | RTE_IPV6_HDR_TC_MASK);
+ ip6h->hop_limits = 0xff;
+ }
+ udph->dgram_cksum = CSUM_MANGLED_0;
+
+ bth_offset += hdr_size;
+
+ rte_memcpy(&pshdr[bth_offset], pkt->hdr, VHOST_BTH_BYTES);
+ bth = (struct vhost_bth *)&pshdr[bth_offset];
+
+ /* exclude bth.resv8a */
+ bth->qpn |= rte_cpu_to_be_32(~VHOST_RDMA_QPN_MASK);
+
+ length = hdr_size + VHOST_BTH_BYTES;
+ crc = crc32(crc, pshdr, length);
+
+ /* And finish to compute the CRC on the remainder of the headers. */
+ crc = crc32(crc, pkt->hdr + VHOST_BTH_BYTES,
+ vhost_rdma_opcode[pkt->opcode].length - VHOST_BTH_BYTES);
+ return crc;
+}
+
diff --git a/examples/vhost_user_rdma/vhost_rdma_opcode.c b/examples/vhost_user_rdma/vhost_rdma_opcode.c
index 4284a405f5..fbbed5b0e2 100644
--- a/examples/vhost_user_rdma/vhost_rdma_opcode.c
+++ b/examples/vhost_user_rdma/vhost_rdma_opcode.c
@@ -891,4 +891,143 @@ struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE] = {
}
},
-};
\ No newline at end of file
+};
+
+static int
+next_opcode_rc(struct vhost_rdma_qp *qp, uint32_t opcode, int fits)
+{
+ switch (opcode) {
+ case VHOST_RDMA_IB_WR_RDMA_WRITE:
+ if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_LAST :
+ IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_ONLY :
+ IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+ case VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_RC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_RC_RDMA_WRITE_FIRST;
+
+ case VHOST_RDMA_IB_WR_SEND:
+ if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_SEND_LAST :
+ IB_OPCODE_RC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_SEND_ONLY :
+ IB_OPCODE_RC_SEND_FIRST;
+
+ case VHOST_RDMA_IB_WR_SEND_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_RC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_RC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_RC_SEND_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_RC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_RC_SEND_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_RC_SEND_FIRST;
+
+ case VHOST_RDMA_IB_WR_RDMA_READ:
+ return IB_OPCODE_RC_RDMA_READ_REQUEST;
+ }
+
+ return -EINVAL;
+}
+
+static int
+next_opcode_uc(struct vhost_rdma_qp *qp, uint32_t opcode, int fits)
+{
+ switch (opcode) {
+ case VHOST_RDMA_IB_WR_RDMA_WRITE:
+ if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_LAST :
+ IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_ONLY :
+ IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+ case VHOST_RDMA_IB_WR_RDMA_WRITE_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_RDMA_WRITE_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_UC_RDMA_WRITE_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_RDMA_WRITE_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_UC_RDMA_WRITE_FIRST;
+
+ case VHOST_RDMA_IB_WR_SEND:
+ if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_SEND_LAST :
+ IB_OPCODE_UC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_SEND_ONLY :
+ IB_OPCODE_UC_SEND_FIRST;
+
+ case VHOST_RDMA_IB_WR_SEND_WITH_IMM:
+ if (qp->req.opcode == IB_OPCODE_UC_SEND_FIRST ||
+ qp->req.opcode == IB_OPCODE_UC_SEND_MIDDLE)
+ return fits ?
+ IB_OPCODE_UC_SEND_LAST_WITH_IMMEDIATE :
+ IB_OPCODE_UC_SEND_MIDDLE;
+ else
+ return fits ?
+ IB_OPCODE_UC_SEND_ONLY_WITH_IMMEDIATE :
+ IB_OPCODE_UC_SEND_FIRST;
+ }
+
+ return -EINVAL;
+}
+
+int vhost_rdma_next_opcode(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *wqe,
+ uint32_t opcode)
+{
+ int fits = (wqe->dma.resid <= qp->mtu);
+
+ switch (qp->type) {
+ case VHOST_RDMA_IB_QPT_RC:
+ return next_opcode_rc(qp, opcode, fits);
+
+ case VHOST_RDMA_IB_QPT_UC:
+ return next_opcode_uc(qp, opcode, fits);
+
+ case VHOST_RDMA_IB_QPT_SMI:
+ case VHOST_RDMA_IB_QPT_UD:
+ case VHOST_RDMA_IB_QPT_GSI:
+ switch (opcode) {
+ case VHOST_RDMA_IB_WR_SEND:
+ return IB_OPCODE_UD_SEND_ONLY;
+
+ case VHOST_RDMA_IB_WR_SEND_WITH_IMM:
+ return IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_opcode.h b/examples/vhost_user_rdma/vhost_rdma_opcode.h
index b8f48bcdf5..6c3660f36b 100644
--- a/examples/vhost_user_rdma/vhost_rdma_opcode.h
+++ b/examples/vhost_user_rdma/vhost_rdma_opcode.h
@@ -24,6 +24,7 @@
#include <rte_interrupts.h>
#include "vhost_rdma_ib.h"
+#include "vhost_rdma_pkt.h"
/** Maximum number of QP types supported for WR mask dispatching */
#define WR_MAX_QPT 8
@@ -38,6 +39,92 @@
/* Invalid opcode marker */
#define OPCODE_NONE (-1)
+#define VHOST_RDMA_SE_MASK (0x80)
+#define VHOST_RDMA_MIG_MASK (0x40)
+#define VHOST_RDMA_PAD_MASK (0x30)
+#define VHOST_RDMA_TVER_MASK (0x0f)
+#define VHOST_RDMA_FECN_MASK (0x80000000)
+#define VHOST_RDMA_BECN_MASK (0x40000000)
+#define VHOST_RDMA_RESV6A_MASK (0x3f000000)
+#define VHOST_RDMA_QPN_MASK (0x00ffffff)
+#define VHOST_RDMA_ACK_MASK (0x80000000)
+#define VHOST_RDMA_RESV7_MASK (0x7f000000)
+#define VHOST_RDMA_PSN_MASK (0x00ffffff)
+
+/**
+ * @defgroup hdr_types Header Types (for offset tracking)
+ * @{
+ */
+enum vhost_rdma_hdr_type {
+ VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */
+ VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
+ VHOST_RDMA_BTH, /**< Base Transport Header */
+ VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
+ VHOST_RDMA_AETH, /**< Acknowledge/Error Header */
+ VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */
+ VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */
+ VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */
+ VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
+ VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */
+ VHOST_RDMA_IMMDT, /**< Immediate Data Header */
+ VHOST_RDMA_PAYLOAD, /**< Payload section */
+ NUM_HDR_TYPES /**< Number of known header types */
+};
+
+/**
+ * @defgroup hdr_masks Header Presence and Semantic Flags
+ * @{
+ */
+enum vhost_rdma_hdr_mask {
+ VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
+ VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
+ VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
+ VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
+ VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
+ VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
+ VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
+ VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
+ VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
+ VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
+ VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
+ VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
+
+ /* Semantic packet type flags */
+ VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
+ VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
+ VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
+ VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
+ VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
+ VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
+
+ /* Packet fragmentation flags */
+ VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */
+ VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
+
+ VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
+ VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
+ VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
+
+ VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
+
+ /* Composite masks */
+ VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
+ VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
+};
+
+/**
+ * @brief Per-opcode metadata for parsing and validation
+ */
+struct vhost_rdma_opcode_info {
+ const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
+ int length; /**< Fixed payload length (if any) */
+ int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
+ enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
+};
+
+/* Global opcode info table (indexed by IB opcode byte) */
+extern struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE];
+
struct vhost_bth {
uint8_t opcode;
uint8_t flags;
@@ -46,21 +133,192 @@ struct vhost_bth {
rte_be32_t apsn;
};
+static inline uint8_t __bth_pad(void *arg)
+{
+ struct vhost_bth *bth = arg;
+
+ return (VHOST_RDMA_PAD_MASK & bth->flags) >> 4;
+}
+
+static inline uint8_t bth_pad(struct vhost_rdma_pkt_info *pkt)
+{
+ return __bth_pad(pkt->hdr);
+}
+
struct vhost_deth {
rte_be32_t qkey;
rte_be32_t sqp;
};
+#define GSI_QKEY (0x80010000)
+#define DETH_SQP_MASK (0x00ffffff)
+
+static inline uint32_t __deth_qkey(void *arg)
+{
+ struct vhost_deth *deth = arg;
+
+ return rte_be_to_cpu_32(deth->qkey);
+}
+
+static inline void __deth_set_qkey(void *arg, uint32_t qkey)
+{
+ struct vhost_deth *deth = arg;
+
+ deth->qkey = rte_cpu_to_be_32(qkey);
+}
+
+static inline uint32_t __deth_sqp(void *arg)
+{
+ struct vhost_deth *deth = arg;
+
+ return DETH_SQP_MASK & rte_be_to_cpu_32(deth->sqp);
+}
+
+static inline void __deth_set_sqp(void *arg, uint32_t sqp)
+{
+ struct vhost_deth *deth = arg;
+
+ deth->sqp = rte_cpu_to_be_32(DETH_SQP_MASK & sqp);
+}
+
+static inline uint32_t deth_qkey(struct vhost_rdma_pkt_info *pkt)
+{
+ return __deth_qkey(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_DETH]);
+}
+
+static inline void deth_set_qkey(struct vhost_rdma_pkt_info *pkt, uint32_t qkey)
+{
+ __deth_set_qkey(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_DETH], qkey);
+}
+
+static inline uint32_t deth_sqp(struct vhost_rdma_pkt_info *pkt)
+{
+ return __deth_sqp(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_DETH]);
+}
+
+static inline void deth_set_sqp(struct vhost_rdma_pkt_info *pkt, uint32_t sqp)
+{
+ __deth_set_sqp(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_DETH], sqp);
+}
+
struct vhost_immdt {
rte_be32_t imm;
};
+static inline rte_be32_t __immdt_imm(void *arg)
+{
+ struct vhost_immdt *immdt = arg;
+
+ return immdt->imm;
+}
+
+static inline void __immdt_set_imm(void *arg, rte_be32_t imm)
+{
+ struct vhost_immdt *immdt = arg;
+
+ immdt->imm = imm;
+}
+
+static inline rte_be32_t immdt_imm(struct vhost_rdma_pkt_info *pkt)
+{
+ return __immdt_imm(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_IMMDT]);
+}
+
+static inline void immdt_set_imm(struct vhost_rdma_pkt_info *pkt, rte_be32_t imm)
+{
+ __immdt_set_imm(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_IMMDT], imm);
+}
+
struct vhost_reth {
rte_be64_t va;
rte_be32_t rkey;
rte_be32_t len;
};
+static inline uint64_t __reth_va(void *arg)
+{
+ struct vhost_reth *reth = arg;
+
+ return rte_be_to_cpu_64(reth->va);
+}
+
+static inline void __reth_set_va(void *arg, uint64_t va)
+{
+ struct vhost_reth *reth = arg;
+
+ reth->va = rte_cpu_to_be_64(va);
+}
+
+static inline uint32_t __reth_rkey(void *arg)
+{
+ struct vhost_reth *reth = arg;
+
+ return rte_be_to_cpu_32(reth->rkey);
+}
+
+static inline void __reth_set_rkey(void *arg, uint32_t rkey)
+{
+ struct vhost_reth *reth = arg;
+
+ reth->rkey = rte_cpu_to_be_32(rkey);
+}
+
+static inline uint32_t __reth_len(void *arg)
+{
+ struct vhost_reth *reth = arg;
+
+ return rte_be_to_cpu_32(reth->len);
+}
+
+static inline void __reth_set_len(void *arg, uint32_t len)
+{
+ struct vhost_reth *reth = arg;
+
+ reth->len = rte_cpu_to_be_32(len);
+}
+
+static inline uint64_t reth_va(struct vhost_rdma_pkt_info *pkt)
+{
+ return __reth_va(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH]);
+}
+
+static inline void reth_set_va(struct vhost_rdma_pkt_info *pkt, uint64_t va)
+{
+ __reth_set_va(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH], va);
+}
+
+static inline uint32_t reth_rkey(struct vhost_rdma_pkt_info *pkt)
+{
+ return __reth_rkey(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH]);
+}
+
+static inline void reth_set_rkey(struct vhost_rdma_pkt_info *pkt, uint32_t rkey)
+{
+ __reth_set_rkey(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH], rkey);
+}
+
+static inline uint32_t reth_len(struct vhost_rdma_pkt_info *pkt)
+{
+ return __reth_len(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH]);
+}
+
+static inline void reth_set_len(struct vhost_rdma_pkt_info *pkt, uint32_t len)
+{
+ __reth_set_len(pkt->hdr +
+ vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_RETH], len);
+}
+
struct vhost_aeth {
rte_be32_t smsn;
};
@@ -252,79 +510,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct vhost_rdma_qp *qp)
return vhost_rdma_wr_opcode_info[opcode].mask[qp->type];
}
-/**
- * @defgroup hdr_types Header Types (for offset tracking)
- * @{
- */
-enum vhost_rdma_hdr_type {
- VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */
- VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
- VHOST_RDMA_BTH, /**< Base Transport Header */
- VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
- VHOST_RDMA_AETH, /**< Acknowledge/Error Header */
- VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */
- VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */
- VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */
- VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
- VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */
- VHOST_RDMA_IMMDT, /**< Immediate Data Header */
- VHOST_RDMA_PAYLOAD, /**< Payload section */
- NUM_HDR_TYPES /**< Number of known header types */
-};
-
-/**
- * @defgroup hdr_masks Header Presence and Semantic Flags
- * @{
- */
-enum vhost_rdma_hdr_mask {
- VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
- VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
- VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
- VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
- VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
- VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
- VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
- VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
- VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
- VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
- VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
- VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
-
- /* Semantic packet type flags */
- VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
- VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
- VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
- VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
- VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
- VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
-
- /* Packet fragmentation flags */
- VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< RDMA with Immediate + Invalidate */
- VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
-
- VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
- VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
- VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
-
- VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
-
- /* Composite masks */
- VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
- VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
-};
-/** @} */
-
-/**
- * @brief Per-opcode metadata for parsing and validation
- */
-struct vhost_rdma_opcode_info {
- const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
- int length; /**< Fixed payload length (if any) */
- int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
- enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
-};
-
-/* Global opcode info table (indexed by IB opcode byte) */
-extern struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE];
+int vhost_rdma_next_opcode(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *wqe,
+ uint32_t opcode);
#endif
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_pkt.c b/examples/vhost_user_rdma/vhost_rdma_pkt.c
new file mode 100644
index 0000000000..27f7dd0647
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_pkt.c
@@ -0,0 +1,221 @@
+/*
+ * Vhost-user RDMA device: building Ethernet/IP/UDP headers and transmitting packets
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <rte_mbuf.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_udp.h>
+
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_opcode.h"
+#include "vhost_rdma_queue.h"
+
+static __rte_always_inline
+void default_gid_to_mac(struct vhost_rdma_device *dev, char *mac)
+{
+ struct vhost_rdma_gid *gid = &dev->gid_tbl[0];
+
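+ /* Recover the device MAC from the interface-ID bytes of GID 0 (bytes 8-10 and 13-15). */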
+ mac[0] = gid->gid[8];
+ mac[1] = gid->gid[9];
+ mac[2] = gid->gid[10];
+ mac[3] = gid->gid[13];
+ mac[4] = gid->gid[14];
+ mac[5] = gid->gid[15];
+}
+
+static void prepare_udp_hdr(struct rte_mbuf *m,
+ rte_be16_t src_port,
+ rte_be16_t dst_port)
+{
+ struct rte_udp_hdr *udph;
+
+ udph = (struct rte_udp_hdr *)rte_pktmbuf_prepend(m, sizeof(*udph));
+
+ udph->dst_port = dst_port;
+ udph->src_port = src_port;
+ udph->dgram_len = rte_cpu_to_be_16(m->data_len);
+ udph->dgram_cksum = 0;
+}
+
+static void prepare_ipv4_hdr(struct rte_mbuf *m,
+ rte_be32_t saddr,
+ rte_be32_t daddr,
+ uint8_t proto,
+ uint8_t tos,
+ uint8_t ttl,
+ rte_be16_t df)
+{
+ struct rte_ipv4_hdr *iph;
+
+ iph = (struct rte_ipv4_hdr *)rte_pktmbuf_prepend(m, sizeof(*iph));
+
+ iph->version_ihl = RTE_IPV4_VHL_DEF;
+ iph->total_length = rte_cpu_to_be_16(m->data_len);
+ iph->fragment_offset = df;
+ iph->next_proto_id = proto;
+ iph->type_of_service = tos;
+ iph->dst_addr = daddr;
+ iph->src_addr = saddr;
+ iph->time_to_live = ttl;
+}
+
+static inline void ip6_flow_hdr(struct rte_ipv6_hdr *hdr, unsigned int tclass,
+ rte_be32_t flowlabel)
+{
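+ /* Pack IP version 6, traffic class and flow label into the first 32-bit word. */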
+ *(rte_be32_t *)hdr = rte_cpu_to_be_32(0x60000000 | (tclass << 20)) | flowlabel;
+}
+
+static void
+prepare_ipv6_hdr(struct rte_mbuf *m,
+ struct in6_addr *saddr,
+ struct in6_addr *daddr,
+ uint8_t proto,
+ uint8_t prio,
+ uint8_t ttl)
+{
+ struct rte_ipv6_hdr *ip6h;
+
+ ip6h = (struct rte_ipv6_hdr *)rte_pktmbuf_prepend(m, sizeof(*ip6h));
+
+ ip6_flow_hdr(ip6h, prio, rte_cpu_to_be_32(0));
+ ip6h->proto = proto;
+ ip6h->hop_limits = ttl;
+ rte_memcpy(ip6h->dst_addr, daddr, sizeof(*daddr));
+ rte_memcpy(ip6h->src_addr, saddr, sizeof(*saddr));
+ ip6h->payload_len = rte_cpu_to_be_16(m->data_len - sizeof(*ip6h));
+}
+
+static int
+prepare4(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *m)
+{
+ struct vhost_rdma_qp *qp = pkt->qp;
+ struct vhost_rdma_av *av = vhost_rdma_get_av(pkt);
+ struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr;
+ struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr;
+ rte_be16_t df = rte_cpu_to_be_16(RTE_IPV4_HDR_DF_FLAG);
+
+ prepare_udp_hdr(m, rte_cpu_to_be_16(qp->src_port),
+ rte_cpu_to_be_16(ROCE_V2_UDP_DPORT));
+
+ // FIXME: check addr
+ prepare_ipv4_hdr(m, saddr->s_addr, daddr->s_addr, IPPROTO_UDP,
+ av->grh.traffic_class, av->grh.hop_limit, df);
+
+ return 0;
+}
+
+static int
+prepare6(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *m)
+{
+ struct vhost_rdma_qp *qp = pkt->qp;
+ struct vhost_rdma_av *av = vhost_rdma_get_av(pkt);
+ struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr;
+ struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr;
+
+ prepare_udp_hdr(m, rte_cpu_to_be_16(qp->src_port),
+ rte_cpu_to_be_16(ROCE_V2_UDP_DPORT));
+
+ prepare_ipv6_hdr(m, saddr, daddr, IPPROTO_UDP,
+ av->grh.traffic_class,
+ av->grh.hop_limit);
+
+ return 0;
+}
+
+int
+vhost_rdma_prepare(struct vhost_rdma_pkt_info *pkt,
+ struct rte_mbuf *m,
+ uint32_t *crc)
+{
+ int err = 0;
+ char dev_mac[6];
+
+ if (m->l3_type == VHOST_NETWORK_TYPE_IPV4)
+ err = prepare4(pkt, m);
+ else if (m->l3_type == VHOST_NETWORK_TYPE_IPV6)
+ err = prepare6(pkt, m);
+
+ *crc = vhost_rdma_icrc_hdr(pkt, m);
+
+ default_gid_to_mac(pkt->dev, dev_mac);
+
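+ /* A destination MAC equal to our own means the packet loops back within this host. */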
+ if (memcmp(dev_mac, vhost_rdma_get_av(pkt)->dmac, 6) == 0) {
+ pkt->mask |= VHOST_LOOPBACK_MASK;
+ }
+
+ return err;
+}
+
+static int
+ip_out(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf* mbuf, uint16_t type)
+{
+ struct rte_ether_hdr *ether;
+
+ ether = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(*ether));
+
+ ether->ether_type = rte_cpu_to_be_16(type);
+ default_gid_to_mac(pkt->dev, (char*)ðer->src_addr.addr_bytes[0]);
+ rte_memcpy(ðer->dst_addr.addr_bytes[0], vhost_rdma_get_av(pkt)->dmac, 6);
+
+ // IP checksum offload
+ mbuf->ol_flags = RTE_MBUF_F_TX_IP_CKSUM;
+ if (type == RTE_ETHER_TYPE_IPV4) {
+ mbuf->ol_flags |= RTE_MBUF_F_TX_IPV4;
+ mbuf->l3_len = sizeof(struct rte_ipv4_hdr);
+ } else {
+ mbuf->ol_flags |= RTE_MBUF_F_TX_IPV6;
+ mbuf->l3_len = sizeof(struct rte_ipv6_hdr);
+ }
+ mbuf->l4_len = sizeof(struct rte_udp_hdr);
+ mbuf->l2_len = sizeof(struct rte_ether_hdr);
+
+ rte_ring_enqueue(pkt->dev->tx_ring, mbuf);
+
+ return 0;
+}
+
+int vhost_rdma_send(struct vhost_rdma_pkt_info *pkt,
+ struct rte_mbuf *mbuf)
+{
+ int err;
+ int mbuf_out;
+ struct vhost_rdma_qp *qp = pkt->qp;
+
+ vhost_rdma_add_ref(qp);
+ rte_atomic32_inc(&pkt->qp->mbuf_out);
+
+ if (mbuf->l3_type == VHOST_NETWORK_TYPE_IPV4) {
+ err = ip_out(pkt, mbuf, RTE_ETHER_TYPE_IPV4);
+ } else if (mbuf->l3_type == VHOST_NETWORK_TYPE_IPV6) {
+ err = ip_out(pkt, mbuf, RTE_ETHER_TYPE_IPV6);
+ } else {
+ RDMA_LOG_ERR("Unknown layer 3 protocol: %u\n", mbuf->l3_type);
+ vhost_rdma_drop_ref(qp, qp->dev, qp);
+ rte_pktmbuf_free(mbuf);
+ return -EINVAL;
+ }
+
+ mbuf_out = rte_atomic32_sub_return(&pkt->qp->mbuf_out, 1);
+ if (unlikely(pkt->qp->need_req_mbuf &&
+ mbuf_out < VHOST_INFLIGHT_SKBS_PER_QP_LOW))
+ vhost_rdma_run_task(&pkt->qp->req.task, 1);
+
+ vhost_rdma_drop_ref(qp, qp->dev, qp);
+
+ if (unlikely(err)) {
+ RDMA_LOG_ERR("ip out failed");
+ return -EAGAIN;
+ }
+
+ return 0;
+}
diff --git a/examples/vhost_user_rdma/vhost_rdma_pkt.h b/examples/vhost_user_rdma/vhost_rdma_pkt.h
index e6a605f574..f012edd8ec 100644
--- a/examples/vhost_user_rdma/vhost_rdma_pkt.h
+++ b/examples/vhost_user_rdma/vhost_rdma_pkt.h
@@ -22,9 +22,13 @@
#include <stdint.h>
#include <stddef.h>
+#include <netinet/in.h>
#include <rte_byteorder.h>
#include <rte_mbuf.h> /* For struct rte_mbuf if needed later */
+#include "vhost_rdma.h"
+#include "vhost_rdma_ib.h"
+
/* Forward declarations */
struct vhost_rdma_dev;
struct vhost_rdma_qp;
@@ -34,16 +38,23 @@ struct vhost_rdma_send_wqe;
#define BIT(x) (1U << (x)) /**< Generate bitmask from bit index */
#endif
+#define ip_hdr(p) ((struct rte_ipv4_hdr*) \
+ (RTE_PTR_SUB(p->hdr, \
+ sizeof(struct rte_udp_hdr) + sizeof(struct rte_ipv4_hdr))))
+#define ipv6_hdr(p) ((struct rte_ipv6_hdr*) \
+ (RTE_PTR_SUB(p->hdr, \
+ sizeof(struct rte_udp_hdr) + sizeof(struct rte_ipv6_hdr))))
+
/**
- * @defgroup constants Constants & Limits
- * @{
- */
+* @defgroup constants Constants & Limits
+* @{
+*/
/**
- * @brief Runtime packet context used during processing
- */
+* @brief Runtime packet context used during processing
+*/
struct vhost_rdma_pkt_info {
- struct vhost_rdma_dev *dev; /**< Owning device */
+ struct vhost_rdma_device *dev; /**< Owning device */
struct vhost_rdma_qp *qp; /**< Associated QP */
struct vhost_rdma_send_wqe *wqe; /**< Corresponding send WQE (if applicable) */
uint8_t *hdr; /**< Pointer to BTH (Base Transport Header) */
@@ -55,4 +66,12 @@ struct vhost_rdma_pkt_info {
uint8_t opcode; /**< BTH opcode field */
};
+int vhost_rdma_prepare(struct vhost_rdma_pkt_info *pkt,
+ struct rte_mbuf *m,
+ uint32_t *crc);
+
+uint32_t vhost_rdma_icrc_hdr(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *mbuf);
+
+uint32_t crc32(uint32_t crc, void* buf, uint32_t len);
+
#endif /* __VHOST_RDMA_PKT_H__ */
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.c b/examples/vhost_user_rdma/vhost_rdma_queue.c
index abce651fa5..7d0c45592c 100644
--- a/examples/vhost_user_rdma/vhost_rdma_queue.c
+++ b/examples/vhost_user_rdma/vhost_rdma_queue.c
@@ -13,6 +13,11 @@
#include <rte_interrupts.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
+#include <rte_mbuf.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_udp.h>
+#include <rte_timer.h>
#include "vhost_rdma_queue.h"
#include "vhost_rdma_pkt.h"
@@ -560,12 +565,829 @@ vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queu
queue->data = NULL;
}
-int vhost_rdma_requester(void *arg)
+int
+vhost_rdma_advance_dma_data(struct vhost_rdma_dma_info *dma, unsigned int length)
{
- //TODO: handle request
+ struct vhost_rdma_sge *sge = &dma->sge[dma->cur_sge];
+ uint32_t offset = dma->sge_offset;
+ int resid = dma->resid;
+
+ while (length) {
+ unsigned int bytes;
+
+ if (offset >= sge->length) {
+ sge++;
+ dma->cur_sge++;
+ offset = 0;
+ if (dma->cur_sge >= dma->num_sge)
+ return -ENOSPC;
+ }
+
+ bytes = length;
+
+ if (bytes > sge->length - offset)
+ bytes = sge->length - offset;
+
+ offset += bytes;
+ resid -= bytes;
+ length -= bytes;
+ }
+
+ dma->sge_offset = offset;
+ dma->resid = resid;
+
return 0;
}
+static __rte_always_inline void
+vhost_rdma_retry_first_write_send(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *wqe,
+ unsigned int mask, int npsn)
+{
+ int i;
+
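+ /* Re-advance the DMA state past the npsn packets that were already acked,
+ * so retransmission resumes where the responder left off.
+ */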
+ for (i = 0; i < npsn; i++) {
+ int to_send = (wqe->dma.resid > qp->mtu) ?
+ qp->mtu : wqe->dma.resid;
+
+ qp->req.opcode = vhost_rdma_next_opcode(qp,
+ wqe,
+ wqe->wr->opcode);
+
+ if (wqe->wr->send_flags & VHOST_RDMA_IB_SEND_INLINE) {
+ wqe->dma.resid -= to_send;
+ wqe->dma.sge_offset += to_send;
+ } else {
+ vhost_rdma_advance_dma_data(&wqe->dma, to_send);
+ }
+ if (mask & WR_WRITE_MASK)
+ wqe->iova += qp->mtu;
+ }
+}
+
+static void vhost_rdma_req_retry(struct vhost_rdma_qp *qp)
+{
+ struct vhost_rdma_send_wqe *wqe;
+ struct vhost_rdma_queue *q = &qp->sq.queue;
+ unsigned int cons;
+ unsigned int prod;
+ unsigned int wqe_index;
+ unsigned int mask;
+ int npsn;
+ int first = 1;
+
+ cons = q->consumer_index;
+ prod = q->producer_index;
+
+ qp->req.wqe_index = cons;
+ qp->req.psn = qp->comp.psn;
+ qp->req.opcode = -1;
+
+ for (wqe_index = cons; wqe_index != prod; wqe_index++) {
+ wqe = addr_from_index(&qp->sq.queue, wqe_index);
+ mask = wr_opcode_mask(wqe->wr->opcode, qp);
+
+ if (wqe->state == WQE_STATE_POSTED)
+ break;
+
+ if (wqe->state == WQE_STATE_DONE)
+ continue;
+
+ wqe->iova = (mask & WR_READ_OR_WRITE_MASK) ?
+ wqe->wr->rdma.remote_addr : 0;
+
+ if (!first || (mask & WR_READ_MASK) == 0) {
+ wqe->dma.resid = wqe->dma.length;
+ wqe->dma.cur_sge = 0;
+ wqe->dma.sge_offset = 0;
+ }
+
+ if (first) {
+ first = 0;
+
+ if (mask & WR_WRITE_OR_SEND_MASK) {
+ npsn = (qp->comp.psn - wqe->first_psn) & VHOST_RDMA_PSN_MASK;
+ vhost_rdma_retry_first_write_send(qp, wqe, mask, npsn);
+ }
+
+ if (mask & WR_READ_MASK) {
+ npsn = (wqe->dma.length - wqe->dma.resid) / qp->mtu;
+ wqe->iova += npsn * qp->mtu;
+ }
+ }
+ wqe->state = WQE_STATE_POSTED;
+ }
+}
+
+static struct vhost_rdma_send_wqe* vhost_rdma_req_next_wqe(struct vhost_rdma_qp *qp)
+{
+ struct vhost_rdma_send_wqe *wqe;
+ struct vhost_rdma_queue *q = &qp->sq.queue;
+ unsigned int index = qp->req.wqe_index;
+ unsigned int cons;
+ unsigned int prod;
+
+ wqe = queue_head(q);
+ cons = q->consumer_index;
+ prod = q->producer_index;
+
+ if (unlikely(qp->req.state == QP_STATE_DRAIN)) {
+ rte_spinlock_lock(&qp->state_lock);
+ do {
+ if (qp->req.state != QP_STATE_DRAIN) {
+ /* comp just finished */
+ rte_spinlock_unlock(&qp->state_lock);
+ break;
+ }
+
+ if (wqe && ((index != cons) ||
+ (wqe->state != WQE_STATE_POSTED))) {
+ /* comp not done yet */
+ rte_spinlock_unlock(&qp->state_lock);
+ break;
+ }
+
+ qp->req.state = QP_STATE_DRAINED;
+ rte_spinlock_unlock(&qp->state_lock);
+ } while (0);
+ }
+
+ if (index == prod)
+ return NULL;
+
+ wqe = addr_from_index(q, index);
+
+ if (unlikely((qp->req.state == QP_STATE_DRAIN ||
+ qp->req.state == QP_STATE_DRAINED) &&
+ (wqe->state != WQE_STATE_PROCESSING)))
+ return NULL;
+
+ if (unlikely((wqe->wr->send_flags & VHOST_RDMA_IB_SEND_FENCE) &&
+ (index != cons))) {
+ qp->req.wait_fence = 1;
+ return NULL;
+ }
+
+ wqe->mask = wr_opcode_mask(wqe->wr->opcode, qp);
+ return wqe;
+}
+
+struct vhost_rdma_av *vhost_rdma_get_av(struct vhost_rdma_pkt_info *pkt)
+{
+ if (!pkt || !pkt->qp)
+ return NULL;
+
+ if (pkt->qp->type == VHOST_RDMA_IB_QPT_RC ||
+ pkt->qp->type == VHOST_RDMA_IB_QPT_UC)
+ return &pkt->qp->av;
+
+ return (pkt->wqe) ? &pkt->wqe->av : NULL;
+}
+
+struct rte_mbuf *vhost_rdma_init_packet(struct vhost_rdma_device *dev,
+ struct vhost_rdma_av *av,
+ int paylen,
+ struct vhost_rdma_pkt_info *pkt)
+{
+ const struct vhost_rdma_gid *attr;
+ unsigned int hdr_len;
+ struct rte_mbuf *mbuf = NULL;
+ const int port_num = 1;
+ uint16_t data_room;
+
+ attr = &dev->gid_tbl[av->grh.sgid_index];
+
+ if (attr->type == VHOST_RDMA_GID_TYPE_ILLIGAL)
+ return NULL;
+
+ if (av->network_type == VHOST_NETWORK_TYPE_IPV4)
+ hdr_len = ETH_HLEN + sizeof(struct rte_udp_hdr) +
+ sizeof(struct rte_ipv4_hdr);
+ else
+ hdr_len = ETH_HLEN + sizeof(struct rte_udp_hdr) +
+ sizeof(struct rte_ipv6_hdr);
+
+ hdr_len += sizeof(struct rte_ether_hdr);
+
+ mbuf = rte_pktmbuf_alloc(dev->mbuf_pool);
+
+ if (unlikely(mbuf == NULL)) {
+ goto out;
+ }
+
+ if (unlikely(hdr_len > rte_pktmbuf_headroom(mbuf))) {
+ RDMA_LOG_ERR("not enough headroom %u > %u", hdr_len, rte_pktmbuf_headroom(mbuf));
+ rte_pktmbuf_free(mbuf);
+ return NULL;
+ }
+
+ data_room = mbuf->buf_len - rte_pktmbuf_headroom(mbuf);
+ if (unlikely(paylen > data_room)) {
+ RDMA_LOG_ERR("not enough data room %u > %u", paylen, data_room);
+ rte_pktmbuf_free(mbuf);
+ return NULL;
+ }
+
+ if (av->network_type == VHOST_NETWORK_TYPE_IPV4)
+ mbuf->l3_type = VHOST_NETWORK_TYPE_IPV4;
+ else
+ mbuf->l3_type = VHOST_NETWORK_TYPE_IPV6;
+
+ pkt->dev = dev;
+ pkt->port_num = port_num;
+ pkt->hdr = (uint8_t *)rte_pktmbuf_adj(mbuf, 0);
+ pkt->mask |= VHOST_GRH_MASK;
+
+ rte_pktmbuf_data_len(mbuf) = paylen;
+
+out:
+ return mbuf;
+}
+
+
+static struct rte_mbuf* vhost_rdma_init_req_packet(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *wqe,
+ int opcode,
+ int payload,
+ struct vhost_rdma_pkt_info *pkt)
+{
+ struct vhost_rdma_device *dev = qp->dev;
+ struct rte_mbuf *mbuf;
+ struct vhost_rdma_sq_req *wr = wqe->wr;
+ struct vhost_rdma_av *av;
+ int pad = (-payload) & 0x3;
+ int paylen;
+ int solicited;
+ uint16_t pkey;
+ uint32_t qp_num;
+ int ack_req;
+
+ /* length from start of bth to end of icrc */
+ paylen = vhost_rdma_opcode[opcode].length + payload + pad + VHOST_ICRC_SIZE;
+
+ /* pkt->hdr, dev, port_num and mask are initialized in
+ * vhost_rdma_init_packet()
+ */
+ pkt->opcode = opcode;
+ pkt->qp = qp;
+ pkt->psn = qp->req.psn;
+ pkt->mask = vhost_rdma_opcode[opcode].mask;
+ pkt->paylen = paylen;
+ pkt->wqe = wqe;
+
+ /* init mbuf */
+ av = vhost_rdma_get_av(pkt);
+ mbuf = vhost_rdma_init_packet(dev, av, paylen, pkt);
+ if (unlikely(!mbuf))
+ return NULL;
+
+ /* init bth */
+ solicited = (wr->send_flags & VHOST_RDMA_IB_SEND_SOLICITED) &&
+ (pkt->mask & VHOST_END_MASK) &&
+ ((pkt->mask & (VHOST_SEND_MASK)) ||
+ (pkt->mask & (VHOST_WRITE_MASK | VHOST_IMMDT_MASK)) ==
+ (VHOST_WRITE_MASK | VHOST_IMMDT_MASK));
+
+ pkey = IB_DEFAULT_PKEY_FULL;
+
+ qp_num = (pkt->mask & VHOST_DETH_MASK) ? wr->ud.remote_qpn :
+ qp->attr.dest_qp_num;
+
+ ack_req = ((pkt->mask & VHOST_END_MASK) ||
+ (qp->req.noack_pkts++ > VHOST_MAX_PKT_PER_ACK));
+ if (ack_req)
+ qp->req.noack_pkts = 0;
+
+ bth_init(pkt, pkt->opcode, solicited, 0,
+ pad, pkey, qp_num,
+ ack_req, pkt->psn);
+
+ /* init optional headers */
+ if (pkt->mask & VHOST_RETH_MASK) {
+ reth_set_rkey(pkt, wr->rdma.rkey);
+ reth_set_va(pkt, wqe->iova);
+ reth_set_len(pkt, wqe->dma.resid);
+ }
+
+ if (pkt->mask & VHOST_IMMDT_MASK)
+ immdt_set_imm(pkt, wr->imm_data);
+ if (pkt->mask & VHOST_DETH_MASK) {
+ if (qp->qpn == 1)
+ deth_set_qkey(pkt, GSI_QKEY);
+ else
+ deth_set_qkey(pkt, wr->ud.remote_qkey);
+ deth_set_sqp(pkt, qp->qpn);
+ }
+
+ return mbuf;
+}
+
+struct vhost_rdma_mr* lookup_mr(struct vhost_rdma_pd *pd,
+ int access,
+ uint32_t key,
+ enum vhost_rdma_mr_lookup_type type)
+{
+ struct vhost_rdma_mr *mr;
+ int index = key >> 8;
+
+ mr = vhost_rdma_pool_get(&pd->dev->mr_pool, index);
+ if (!mr)
+ return NULL;
+ vhost_rdma_add_ref(mr);
+
+ if (unlikely((type == VHOST_LOOKUP_LOCAL && mr->lkey != key) ||
+ (type == VHOST_LOOKUP_REMOTE && mr->rkey != key) ||
+ mr->pd != pd || (access && !(access & mr->access)) ||
+ mr->state != VHOST_MR_STATE_VALID)) {
+ vhost_rdma_drop_ref(mr, pd->dev, mr);
+ mr = NULL;
+ }
+
+ return mr;
+}
+
+int
+mr_check_range(struct vhost_rdma_mr *mr, uint64_t iova, size_t length)
+{
+ switch (mr->type) {
+ case VHOST_MR_TYPE_DMA:
+ return 0;
+
+ case VHOST_MR_TYPE_MR:
+ if (iova < mr->iova || length > mr->length ||
+ iova > mr->iova + mr->length - length)
+ return -EFAULT;
+ return 0;
+
+ default:
+ return -EFAULT;
+ }
+}
+
+static __rte_always_inline uint64_t
+lookup_iova(struct vhost_rdma_mr *mr, uint64_t iova)
+{
+ size_t offset, index;
+
+ index = (iova - mr->iova) / USER_MMAP_TARGET_PAGE_SIZE;
+ offset = (iova - mr->iova) & ~USER_MMAP_PAGE_MASK;
+
+ return mr->pages[index] + offset;
+}
+
+int
+vhost_rdma_mr_copy(struct rte_vhost_memory *mem,
+ struct vhost_rdma_mr *mr,
+ uint64_t iova,
+ void *addr,
+ uint64_t length,
+ enum vhost_rdma_mr_copy_dir dir,
+ uint32_t *crcp)
+{
+ int err;
+ uint64_t bytes;
+ uint8_t *va;
+ uint32_t crc = crcp ? (*crcp) : 0;
+
+ if (length == 0)
+ return 0;
+
+ if (mr->type == VHOST_MR_TYPE_DMA) {
+ uint8_t *src, *dest;
+ // for dma addr, need to translate
+ iova = gpa_to_vva(mem, iova, &length);
+
+ src = (dir == VHOST_RDMA_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova);
+
+ dest = (dir == VHOST_RDMA_TO_MR_OBJ) ? ((void *)(uintptr_t)iova) : addr;
+
+ rte_memcpy(dest, src, length);
+
+ if (crcp)
+ *crcp = crc32(*crcp, dest, length);
+
+ return 0;
+ }
+
+ err = mr_check_range(mr, iova, length);
+ if (err) {
+ err = -EFAULT;
+ goto err1;
+ }
+
+ while (length > 0) {
+ uint8_t *src, *dest;
+
+ va = (uint8_t *)lookup_iova(mr, iova);
+ src = (dir == VHOST_RDMA_TO_MR_OBJ) ? addr : va;
+ dest = (dir == VHOST_RDMA_TO_MR_OBJ) ? va : addr;
+
+ bytes = USER_MMAP_TARGET_PAGE_SIZE - ((uint64_t)va & ~USER_MMAP_PAGE_MASK);
+
+ if (bytes > length)
+ bytes = length;
+
+ RDMA_LOG_DEBUG_DP("copy %p <- %p %lu", dest, src, bytes);
+ rte_memcpy(dest, src, bytes);
+
+ if (crcp)
+ crc = crc32(crc, dest, bytes);
+
+ length -= bytes;
+ addr += bytes;
+ iova += bytes;
+ }
+
+ if (crcp)
+ *crcp = crc;
+
+ return 0;
+
+err1:
+ return err;
+}
+
+int
+copy_data(struct vhost_rdma_pd *pd, int access,
+ struct vhost_rdma_dma_info *dma, void *addr,
+ int length, enum vhost_rdma_mr_copy_dir dir, uint32_t *crcp)
+{
+ uint32_t bytes;
+ struct vhost_rdma_sge *sge = &dma->sge[dma->cur_sge];
+ uint32_t offset = dma->sge_offset;
+ int resid = dma->resid;
+ struct vhost_rdma_mr *mr = NULL;
+ uint64_t iova;
+ int err;
+
+ if (length == 0)
+ return 0;
+
+ if (length > resid) {
+ err = -EINVAL;
+ goto err2;
+ }
+
+ RDMA_LOG_DEBUG("sge %llx %u offset %u %d", sge->addr, sge->length, offset, length);
+ if (sge->length && (offset < sge->length)) {
+ mr = lookup_mr(pd, access, sge->lkey, VHOST_LOOKUP_LOCAL);
+ if (!mr) {
+ err = -EINVAL;
+ goto err1;
+ }
+ }
+
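+ /* Walk the SGE list: look up the MR covering the current SGE and copy at most one SGE's worth per iteration. */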
+ while (length > 0) {
+ bytes = length;
+
+ if (offset >= sge->length) {
+ if (mr) {
+ vhost_rdma_drop_ref(mr, pd->dev, mr);
+ mr = NULL;
+ }
+ sge++;
+ dma->cur_sge++;
+ offset = 0;
+
+ if (dma->cur_sge >= dma->num_sge) {
+ err = -ENOSPC;
+ goto err2;
+ }
+
+ if (sge->length) {
+ mr = lookup_mr(pd, access, sge->lkey, VHOST_LOOKUP_LOCAL);
+ if (!mr) {
+ err = -EINVAL;
+ goto err1;
+ }
+ } else {
+ continue;
+ }
+ }
+
+ if (bytes > sge->length - offset)
+ bytes = sge->length - offset;
+
+ if (bytes > 0) {
+ iova = sge->addr + offset;
+
+ err = vhost_rdma_mr_copy(pd->dev->mem, mr, iova, addr, bytes, dir, crcp);
+ if (err)
+ goto err2;
+
+ offset += bytes;
+ resid -= bytes;
+ length -= bytes;
+ addr += bytes;
+ }
+ }
+
+ dma->sge_offset = offset;
+ dma->resid = resid;
+
+ if (mr)
+ vhost_rdma_drop_ref(mr, pd->dev, mr);
+
+ return 0;
+
+err2:
+ if (mr)
+ vhost_rdma_drop_ref(mr, pd->dev, mr);
+err1:
+ return err;
+}
+
+static int
+vhost_rdma_finish_packet(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *wqe,
+ struct vhost_rdma_pkt_info *pkt,
+ struct rte_mbuf *skb, int paylen)
+{
+ uint32_t crc = 0;
+ uint32_t *p;
+ int err;
+
+ err = vhost_rdma_prepare(pkt, skb, &crc);
+ if (err)
+ return err;
+
+ if (pkt->mask & VHOST_WRITE_OR_SEND) {
+ if (wqe->wr->send_flags & VHOST_RDMA_IB_SEND_INLINE) {
+ uint8_t *tmp = &wqe->dma.inline_data[wqe->dma.sge_offset];
+
+ crc = crc32(crc, tmp, paylen);
+ memcpy(payload_addr(pkt), tmp, paylen);
+
+ wqe->dma.resid -= paylen;
+ wqe->dma.sge_offset += paylen;
+ } else {
+ err = copy_data(qp->pd, 0, &wqe->dma,
+ payload_addr(pkt), paylen,
+ VHOST_RDMA_TO_MR_OBJ,
+ &crc);
+ if (err)
+ return err;
+ }
+ if (bth_pad(pkt)) {
+ uint8_t *pad = payload_addr(pkt) + paylen;
+
+ memset(pad, 0, bth_pad(pkt));
+ crc = crc32(crc, pad, bth_pad(pkt));
+ }
+ }
+ p = payload_addr(pkt) + paylen + bth_pad(pkt);
+
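+ /* Store the ICRC trailer (bit-inverted CRC) right after the payload and pad bytes. */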
+ *p = ~crc;
+
+ return 0;
+}
+
+static void
+save_state(struct vhost_rdma_send_wqe *wqe,
+ struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *rollback_wqe,
+ uint32_t *rollback_psn)
+{
+ rollback_wqe->state = wqe->state;
+ rollback_wqe->first_psn = wqe->first_psn;
+ rollback_wqe->last_psn = wqe->last_psn;
+ *rollback_psn = qp->req.psn;
+}
+
+static void
+rollback_state(struct vhost_rdma_send_wqe *wqe,
+ struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *rollback_wqe,
+ uint32_t rollback_psn)
+{
+ wqe->state = rollback_wqe->state;
+ wqe->first_psn = rollback_wqe->first_psn;
+ wqe->last_psn = rollback_wqe->last_psn;
+ qp->req.psn = rollback_psn;
+}
+
+void
+retransmit_timer(__rte_unused struct rte_timer *timer, void* arg)
+{
+ struct vhost_rdma_qp *qp = arg;
+
+ if (qp->valid) {
+ qp->comp.timeout = 1;
+ vhost_rdma_run_task(&qp->comp.task, 1);
+ }
+}
+
+static void
+update_state(struct vhost_rdma_qp *qp, struct vhost_rdma_pkt_info *pkt)
+{
+ qp->req.opcode = pkt->opcode;
+
+ if (pkt->mask & VHOST_END_MASK)
+ qp->req.wqe_index += 1;
+
+ qp->need_req_mbuf = 0;
+
+ if (qp->qp_timeout_ticks && !rte_timer_pending(&qp->retrans_timer))
+ rte_timer_reset(&qp->retrans_timer, qp->qp_timeout_ticks, SINGLE,
+ rte_lcore_id(), retransmit_timer, qp);
+}
+
+static __rte_always_inline void
+update_wqe_state(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_send_wqe *wqe,
+ struct vhost_rdma_pkt_info *pkt)
+{
+ if (pkt->mask & VHOST_END_MASK) {
+ if (qp->type == VHOST_RDMA_IB_QPT_RC)
+ wqe->state = WQE_STATE_PENDING;
+ } else {
+ wqe->state = WQE_STATE_PROCESSING;
+ }
+}
+
+static __rte_always_inline void
+update_wqe_psn(struct vhost_rdma_qp *qp, struct vhost_rdma_send_wqe *wqe,
+ struct vhost_rdma_pkt_info *pkt, int payload)
+{
+ /* number of packets left to send including current one */
+ int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu;
+
+ /* handle zero length packet case */
+ if (num_pkt == 0)
+ num_pkt = 1;
+
+ if (pkt->mask & VHOST_START_MASK) {
+ wqe->first_psn = qp->req.psn;
+ wqe->last_psn = (qp->req.psn + num_pkt - 1) & VHOST_RDMA_PSN_MASK;
+ }
+
+ if (pkt->mask & VHOST_READ_MASK)
+ qp->req.psn = (wqe->first_psn + num_pkt) & VHOST_RDMA_PSN_MASK;
+ else
+ qp->req.psn = (qp->req.psn + 1) & VHOST_RDMA_PSN_MASK;
+}
+
+int vhost_rdma_requester(void *arg)
+{
+ struct vhost_rdma_qp *qp = (struct vhost_rdma_qp *)arg;
+ struct vhost_rdma_pkt_info pkt;
+ struct rte_mbuf *mbuf;
+ struct vhost_rdma_send_wqe *wqe;
+ enum vhost_rdma_hdr_mask mask;
+ struct vhost_rdma_send_wqe rollback_wqe;
+ struct vhost_rdma_queue *q = &qp->sq.queue;
+ uint32_t rollback_psn;
+ int payload;
+ int mtu;
+ int opcode;
+ int ret;
+
+ vhost_rdma_add_ref(qp);
+
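+ /* Requester loop: fetch the next send WQE, segment it into MTU-sized packets,
+ * build headers and transmit until the queue drains or flow control defers work.
+ */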
+next_wqe:
+ if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
+ goto exit;
+
+ if (unlikely(qp->req.state == QP_STATE_RESET)) {
+ qp->req.wqe_index = q->consumer_index;
+ qp->req.opcode = -1;
+ qp->req.need_rd_atomic = 0;
+ qp->req.wait_psn = 0;
+ qp->req.need_retry = 0;
+ goto exit;
+ }
+
+ if (unlikely(qp->req.need_retry)) {
+ vhost_rdma_req_retry(qp);
+ qp->req.need_retry = 0;
+ }
+
+ wqe = vhost_rdma_req_next_wqe(qp);
+ if (unlikely(!wqe))
+ goto exit;
+
+ assert(!(wqe->mask & WR_LOCAL_OP_MASK));
+
+ if (unlikely(qp->type == VHOST_RDMA_IB_QPT_RC &&
+ psn_compare(qp->req.psn, (qp->comp.psn +
+ VHOST_MAX_UNACKED_PSNS)) > 0)) {
+ qp->req.wait_psn = 1;
+ goto exit;
+ }
+
+ /* Limit the number of inflight SKBs per QP */
+ if (unlikely(rte_atomic32_read(&qp->mbuf_out) >
+ VHOST_INFLIGHT_SKBS_PER_QP_HIGH)) {
+ qp->need_req_mbuf = 1;
+ goto exit;
+ }
+
+ opcode = vhost_rdma_next_opcode(qp, wqe, wqe->wr->opcode);
+ if (unlikely(opcode < 0)) {
+ wqe->status = VHOST_RDMA_IB_WC_LOC_QP_OP_ERR;
+ goto exit;
+ }
+
+ mask = vhost_rdma_opcode[opcode].mask;
+ if (unlikely(mask & VHOST_READ_OR_ATOMIC)) {
+ if (check_init_depth(qp, wqe))
+ goto exit;
+ }
+
+ mtu = get_mtu(qp);
+ payload = (mask & VHOST_WRITE_OR_SEND) ? wqe->dma.resid : 0;
+
+ if (payload > mtu) {
+ if (qp->type == VHOST_RDMA_IB_QPT_UD) {
+ /* C10-93.1.1: If the total sum of all the buffer lengths specified for a
+ * UD message exceeds the MTU of the port as returned by QueryHCA, the CI
+ * shall not emit any packets for this message. Further, the CI shall not
+ * generate an error due to this condition.
+ */
+
+ /* fake a successful UD send */
+ wqe->first_psn = qp->req.psn;
+ wqe->last_psn = qp->req.psn;
+ qp->req.psn = (qp->req.psn + 1) & VHOST_RDMA_PSN_MASK;
+ qp->req.opcode = IB_OPCODE_UD_SEND_ONLY;
+ qp->req.wqe_index += 1;
+ wqe->state = WQE_STATE_DONE;
+ wqe->status = VHOST_RDMA_IB_WC_SUCCESS;
+ __vhost_rdma_do_task(&qp->comp.task);
+ vhost_rdma_drop_ref(qp, qp->dev, qp);
+ return 0;
+ }
+ payload = mtu;
+ }
+
+ mbuf = vhost_rdma_init_req_packet(qp, wqe, opcode, payload, &pkt);
+ if (unlikely(!mbuf)) {
+ RDMA_LOG_ERR_DP("qp#%d Failed allocating mbuf", qp->qpn);
+ wqe->status = VHOST_RDMA_IB_WC_LOC_QP_OP_ERR;
+ goto err;
+ }
+
+ ret = vhost_rdma_finish_packet(qp, wqe, &pkt, mbuf, payload);
+ if (unlikely(ret)) {
+ RDMA_LOG_DEBUG("qp#%d Error during finish packet", qp->qpn);
+ if (ret == -EFAULT)
+ wqe->status = VHOST_RDMA_IB_WC_LOC_PROT_ERR;
+ else
+ wqe->status = VHOST_RDMA_IB_WC_LOC_QP_OP_ERR;
+ rte_pktmbuf_free(mbuf);
+ goto err;
+ }
+ /*
+ * To prevent a race on wqe access between requester and completer,
+ * wqe members state and psn need to be set before calling
+ * vhost_rdma_xmit_packet().
+ * Otherwise, completer might initiate an unjustified retry flow.
+ */
+ save_state(wqe, qp, &rollback_wqe, &rollback_psn);
+ update_wqe_state(qp, wqe, &pkt);
+ update_wqe_psn(qp, wqe, &pkt, payload);
+ ret = vhost_rdma_xmit_packet(qp, &pkt, mbuf);
+ if (ret) {
+ qp->need_req_mbuf = 1;
+
+ rollback_state(wqe, qp, &rollback_wqe, rollback_psn);
+
+ if (ret == -EAGAIN) {
+ vhost_rdma_run_task(&qp->req.task, 1);
+ goto exit;
+ }
+
+ wqe->status = VHOST_RDMA_IB_WC_LOC_QP_OP_ERR;
+ goto err;
+ }
+
+ update_state(qp, &pkt);
+
+ goto next_wqe;
+
+err:
+ wqe->state = WQE_STATE_ERROR;
+ __vhost_rdma_do_task(&qp->comp.task);
+
+exit:
+ vhost_rdma_drop_ref(qp, qp->dev, qp);
+ return -EAGAIN;
+}
+
int vhost_rdma_completer(void* arg)
{
//TODO: handle complete
diff --git a/examples/vhost_user_rdma/vhost_rdma_queue.h b/examples/vhost_user_rdma/vhost_rdma_queue.h
index 260eea51f8..fb5a90235f 100644
--- a/examples/vhost_user_rdma/vhost_rdma_queue.h
+++ b/examples/vhost_user_rdma/vhost_rdma_queue.h
@@ -19,6 +19,10 @@
#include <linux/types.h>
#include "vhost_rdma_ib.h"
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma_opcode.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_log.h"
#define QP_OPCODE_INVAILD (-1)
@@ -36,17 +40,15 @@ struct vhost_rdma_bth {
#define VHOST_RDMA_TVER (0)
#define VHOST_RDMA_DEF_PKEY (0xffff)
-#define VHOST_RDMA_SE_MASK (0x80)
-#define VHOST_RDMA_MIG_MASK (0x40)
-#define VHOST_RDMA_PAD_MASK (0x30)
-#define VHOST_RDMA_TVER_MASK (0x0f)
-#define VHOST_RDMA_FECN_MASK (0x80000000)
-#define VHOST_RDMA_BECN_MASK (0x40000000)
-#define VHOST_RDMA_RESV6A_MASK (0x3f000000)
-#define VHOST_RDMA_QPN_MASK (0x00ffffff)
-#define VHOST_RDMA_ACK_MASK (0x80000000)
-#define VHOST_RDMA_RESV7_MASK (0x7f000000)
-#define VHOST_RDMA_PSN_MASK (0x00ffffff)
+#define VHOST_MAX_UNACKED_PSNS 128
+#define VHOST_INFLIGHT_SKBS_PER_QP_HIGH 64
+#define VHOST_INFLIGHT_SKBS_PER_QP_LOW 16
+#define VHOST_MAX_PKT_PER_ACK 64
+
+#define VHOST_ICRC_SIZE (4)
+#define VHOST_MAX_HDR_LENGTH (80)
+
+#define IB_DEFAULT_PKEY_FULL 0xFFFF
/**
* @brief Operation codes for Work Completions (WC)
@@ -94,6 +96,16 @@ enum {
TASK_STATE_ARMED = 2,
};
+enum vhost_rdma_mr_copy_dir {
+ VHOST_RDMA_TO_MR_OBJ,
+ VHOST_RDMA_FROM_MR_OBJ,
+};
+
+enum vhost_rdma_mr_lookup_type {
+ VHOST_LOOKUP_LOCAL,
+ VHOST_LOOKUP_REMOTE,
+};
+
/**
* @brief Send Queue Work Request (WR) structure from userspace
*
@@ -208,10 +220,129 @@ vhost_rdma_queue_get_data(struct vhost_rdma_queue *queue, size_t idx)
return queue->data + queue->elem_size * idx;
}
+static __rte_always_inline void*
+addr_from_index(struct vhost_rdma_queue *q, unsigned int index)
+{
+ uint16_t cons;
+ uint16_t desc_idx;
+
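+ /* Map a logical send-queue index to the element referenced by the vring avail ring. */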
+ cons = index & (q->num_elems - 1);
+ desc_idx = q->vq->vring.avail->ring[cons];
+
+ return vhost_rdma_queue_get_data(q, desc_idx);
+}
+
+static __rte_always_inline bool queue_empty(struct vhost_rdma_queue *q)
+{
+ uint16_t prod;
+ uint16_t cons;
+
+ prod = q->producer_index;
+ cons = q->consumer_index;
+
+ return prod == cons;
+}
+
+static __rte_always_inline void*
+consumer_addr(struct vhost_rdma_queue *q)
+{
+ uint16_t cons;
+ uint16_t desc_idx;
+
+ cons = q->consumer_index & (q->num_elems - 1);
+ desc_idx = q->vq->vring.avail->ring[cons];
+
+ return vhost_rdma_queue_get_data(q, desc_idx);
+}
+
+static __rte_always_inline void*
+queue_head(struct vhost_rdma_queue *q)
+{
+ return queue_empty(q) ? NULL : consumer_addr(q);
+}
+
+static inline int psn_compare(uint32_t psn_a, uint32_t psn_b)
+{
+ int32_t diff;
+
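+ /* PSNs are 24 bits wide: shifting the difference into the top 24 bits of a
+ * signed 32-bit value makes the sign follow modular (wrap-around) ordering.
+ */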
+ diff = (psn_a - psn_b) << 8;
+ return diff;
+}
+
+static __rte_always_inline int
+check_init_depth(struct vhost_rdma_qp *qp, struct vhost_rdma_send_wqe *wqe)
+{
+ int depth;
+
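+ /* Each outstanding READ/atomic consumes one initiator-depth credit; back off
+ * with -EAGAIN when qp->req.rd_atomic is exhausted.
+ */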
+ if (wqe->has_rd_atomic)
+ return 0;
+
+ qp->req.need_rd_atomic = 1;
+ depth = rte_atomic32_sub_return(&qp->req.rd_atomic, 1);
+
+ if (depth >= 0) {
+ qp->req.need_rd_atomic = 0;
+ wqe->has_rd_atomic = 1;
+ return 0;
+ }
+
+ rte_atomic32_inc(&qp->req.rd_atomic);
+ return -EAGAIN;
+}
+
+static __rte_always_inline int
+get_mtu(struct vhost_rdma_qp *qp)
+{
+ struct vhost_rdma_device *dev = qp->dev;
+
+ if (qp->type == VHOST_RDMA_IB_QPT_RC || qp->type == VHOST_RDMA_IB_QPT_UC)
+ return qp->mtu;
+
+ return dev->mtu_cap;
+}
+
+static inline void bth_init(struct vhost_rdma_pkt_info *pkt, uint8_t opcode, int se,
+ int mig, int pad, uint16_t pkey, uint32_t qpn, int ack_req,
+ uint32_t psn)
+{
+ struct vhost_bth *bth = (struct vhost_bth *)(pkt->hdr);
+
+ bth->opcode = opcode;
+ bth->flags = (pad << 4) & VHOST_RDMA_PAD_MASK;
+ if (se)
+ bth->flags |= VHOST_RDMA_SE_MASK;
+ if (mig)
+ bth->flags |= VHOST_RDMA_MIG_MASK;
+ bth->pkey = rte_cpu_to_be_16(pkey);
+ bth->qpn = rte_cpu_to_be_32(qpn & VHOST_RDMA_QPN_MASK);
+ psn &= VHOST_RDMA_PSN_MASK;
+ if (ack_req)
+ psn |= VHOST_RDMA_ACK_MASK;
+ bth->apsn = rte_cpu_to_be_32(psn);
+}
+
+static inline size_t header_size(struct vhost_rdma_pkt_info *pkt)
+{
+ return vhost_rdma_opcode[pkt->opcode].length;
+}
+
+static inline void *payload_addr(struct vhost_rdma_pkt_info *pkt)
+{
+ return pkt->hdr + vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_PAYLOAD];
+}
+
+static inline size_t payload_size(struct vhost_rdma_pkt_info *pkt)
+{
+ return pkt->paylen - vhost_rdma_opcode[pkt->opcode].offset[VHOST_RDMA_PAYLOAD]
+ - bth_pad(pkt) - VHOST_ICRC_SIZE;
+}
+
/*
* Function declarations
*/
+int vhost_rdma_advance_dma_data(struct vhost_rdma_dma_info *dma, unsigned int length);
+
/**
* @brief Initialize an internal Send WQE from a user WR
*
@@ -335,4 +466,72 @@ void vhost_rdma_qp_destroy(struct vhost_rdma_qp *qp);
int vhost_rdma_av_chk_attr(struct vhost_rdma_device *dev,
struct vhost_rdma_ah_attr *attr);
+struct vhost_rdma_av *vhost_rdma_get_av(struct vhost_rdma_pkt_info *pkt);
+struct rte_mbuf* vhost_rdma_init_packet(struct vhost_rdma_device *dev,
+ struct vhost_rdma_av *av,
+ int paylen,
+ struct vhost_rdma_pkt_info *pkt);
+
+int vhost_rdma_send(struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *mbuf);
+
+int copy_data(struct vhost_rdma_pd *pd, int access,
+ struct vhost_rdma_dma_info *dma,
+ void *addr, int length,
+ enum vhost_rdma_mr_copy_dir dir, uint32_t *crcp);
+
+struct vhost_rdma_mr* lookup_mr(struct vhost_rdma_pd *pd,
+ int access,
+ uint32_t key,
+ enum vhost_rdma_mr_lookup_type type);
+
+int mr_check_range(struct vhost_rdma_mr *mr,
+ uint64_t iova,
+ size_t length);
+
+int vhost_rdma_mr_copy(struct rte_vhost_memory *mem,
+ struct vhost_rdma_mr *mr,
+ uint64_t iova,
+ void *addr,
+ uint64_t length,
+ enum vhost_rdma_mr_copy_dir dir,
+ uint32_t *crcp);
+
+void retransmit_timer(__rte_unused struct rte_timer *timer, void* arg);
+
+static __rte_always_inline int
+vhost_rdma_xmit_packet(struct vhost_rdma_qp *qp,
+ struct vhost_rdma_pkt_info *pkt, struct rte_mbuf *m)
+{
+ int err;
+ int is_request = pkt->mask & VHOST_REQ_MASK;
+ struct vhost_rdma_device *dev = qp->dev;
+
+ if ((is_request && (qp->req.state != QP_STATE_READY)) ||
+ (!is_request && (qp->resp.state != QP_STATE_READY))) {
+ RDMA_LOG_ERR("Packet dropped. QP is not in ready state\n");
+ goto drop;
+ }
+
+ err = vhost_rdma_send(pkt, m);
+ if (err) {
+ vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_SEND_ERR);
+ return err;
+ }
+
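+ /* Unreliable QPs get no ACKs, so the last packet of a message completes the WQE immediately. */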
+ if ((qp->type != VHOST_RDMA_IB_QPT_RC) &&
+ (pkt->mask & VHOST_END_MASK)) {
+ pkt->wqe->state = WQE_STATE_DONE;
+ vhost_rdma_run_task(&qp->comp.task, 1);
+ }
+
+ vhost_rdma_counter_inc(dev, VHOST_RDMA_CNT_SENT_PKTS);
+ goto done;
+
+drop:
+ rte_pktmbuf_free(m);
+ err = 0;
+done:
+ return err;
+}
+
#endif /* VHOST_RDMA_QUEUE_H_ */
\ No newline at end of file
--
2.43.0