Message-ID: <20251217080605.38473-2-15927021679@163.com>
Date: Wed, 17 Dec 2025 16:05:11 +0800
From: 15927021679@....com
To: Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
"David S . Miller" <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
Jesper Dangaard Brouer <hawk@...nel.org>,
John Fastabend <john.fastabend@...il.com>,
Stanislav Fomichev <sdf@...ichev.me>
Cc: linux-kernel@...r.kernel.org,
netdev@...r.kernel.org,
xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 01/14] examples/vhost_user_rdma: add core application initialization for the vhost-user RDMA device
From: xiongweimin <xiongweimin@...inos.cn>
This commit introduces the main initialization routine for the vhost-user
RDMA example application built on DPDK. The implementation includes:
1. DPDK EAL environment initialization with proper signal handling
2. Argument parsing for application-specific configuration
3. Creation of shared memory resources:
- Packet buffer pools with per-core caching
- Optimized ring buffers for RX/TX with SP/MC synchronization flags
4. Backend network device detection and initialization
5. Worker thread launch across available cores
6. Multi-device support with shared/dedicated resource allocation
7. vHost device construction and driver registration
Key features:
- NUMA-aware resource allocation using rte_socket_id()
- Optimized ring flags (SP_ENQ, MC_HTS_DEQ) for lockless operation
- Graceful shutdown handling through signal interception
- Resource isolation for multi-device configurations
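An example invocation (illustrative only; the binary name, EAL core/vdev
options and socket path are placeholders):

    ./dpdk-vhost_user_rdma -l 0-1 --vdev=net_tap0 -- \
        --socket-file /tmp/vhost-rdma0.sock --stats 1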
Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
examples/meson.build | 1 +
examples/vhost_user_rdma/main.c | 607 ++++++++++++++++++
examples/vhost_user_rdma/meson.build | 45 ++
examples/vhost_user_rdma/vhost_rdma.c | 697 +++++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma.h | 444 ++++++++++++++
examples/vhost_user_rdma/vhost_rdma_ib.c | 647 ++++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma_ib.h | 710 ++++++++++++++++++++++
examples/vhost_user_rdma/vhost_rdma_log.h | 52 ++
examples/vhost_user_rdma/vhost_rdma_pkt.h | 296 +++++++++
9 files changed, 3499 insertions(+)
create mode 100644 examples/vhost_user_rdma/main.c
create mode 100644 examples/vhost_user_rdma/meson.build
create mode 100644 examples/vhost_user_rdma/vhost_rdma.c
create mode 100644 examples/vhost_user_rdma/vhost_rdma.h
create mode 100644 examples/vhost_user_rdma/vhost_rdma_ib.c
create mode 100644 examples/vhost_user_rdma/vhost_rdma_ib.h
create mode 100644 examples/vhost_user_rdma/vhost_rdma_log.h
create mode 100644 examples/vhost_user_rdma/vhost_rdma_pkt.h
diff --git a/examples/meson.build b/examples/meson.build
index 8e8968a1fa..780d49d4b4 100644
--- a/examples/meson.build
+++ b/examples/meson.build
@@ -54,6 +54,7 @@ all_examples = [
'vdpa',
'vhost',
'vhost_blk',
+ 'vhost_user_rdma',
'vhost_crypto',
'vm_power_manager',
'vm_power_manager/guest_cli',
diff --git a/examples/vhost_user_rdma/main.c b/examples/vhost_user_rdma/main.c
new file mode 100644
index 0000000000..d5dda47e4e
--- /dev/null
+++ b/examples/vhost_user_rdma/main.c
@@ -0,0 +1,607 @@
+/*
+ * Vhost-user RDMA Device - Initialization and Packet Forwarding
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ * Copyright (C) 2025 KylinSoft Inc. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ */
+
+#include <signal.h>
+#include <getopt.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include <errno.h>
+
+/* DPDK headers */
+#include <rte_memory.h>
+#include <rte_launch.h>
+#include <rte_eal.h>
+#include <rte_per_lcore.h>
+#include <rte_lcore.h>
+#include <rte_debug.h>
+#include <rte_log.h>
+#include <rte_ethdev.h>
+#include <rte_mbuf.h>
+#include <rte_ring.h>
+#include <rte_malloc.h>
+#include <dev_driver.h>
+
+/* Local headers */
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma.h"
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma_log.h"
+
+/**
+ * Maximum length for Unix socket path
+ */
+#define SOCKET_PATH_MAX 64
+
+/**
+ * Default number of RX/TX descriptors
+ */
+#define MAX_NB_RXD 1024
+#define MAX_NB_TXD 1024
+
+/**
+ * Size of shared rings between vhost devices and datapath
+ */
+#define MAX_RING_COUNT 1024
+
+/**
+ * Default number of mbufs in memory pool
+ */
+#define NUM_MBUFS_DEFAULT (1UL << 16) /* 65536 */
+
+/**
+ * Cache size for per-lcore mbuf cache
+ */
+#define MBUF_CACHE_SIZE 256
+
+/**
+ * Data buffer size in each mbuf
+ */
+#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE
+
+/* Forward declarations */
+extern struct vhost_rdma_device g_vhost_rdma_dev[];
+
+/* Global configuration */
+static char *socket_path; /* Flat buffer of fixed-size socket path slots */
+static int nb_sockets = 0; /* Number of vhost sockets */
+static uint16_t pair_port_id = UINT16_MAX; /* Physical port ID to forward packets */
+static volatile bool force_quit; /* Signal to exit cleanly */
+
+/* Stats and feature flags */
+static uint32_t enable_stats; /* Stats print interval in seconds (0 = disabled) */
+static uint32_t enable_tx_csum; /* Enable TX checksum offload */
+static int total_num_mbufs = NUM_MBUFS_DEFAULT; /* Total mbufs across pools */
+
+/* Shared resources */
+static struct rte_ring *vhost_rdma_rx_ring;
+static struct rte_ring *vhost_rdma_tx_ring;
+static struct rte_mempool *vhost_rdma_mbuf_pool;
+
+/* Per-lcore info for device management */
+struct lcore_info {
+ uint32_t device_num;
+ TAILQ_HEAD(vhost_dev_tailq_list, vhost_rdma_device) vdev_list;
+};
+
+static struct lcore_info lcore_info[RTE_MAX_LCORE];
+static unsigned int lcore_ids[RTE_MAX_LCORE];
+
+/* Port configuration templates */
+static struct rte_eth_conf default_port_config;
+
+static struct rte_eth_conf offload_port_config = {
+ .txmode = {
+ .offloads = RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
+ RTE_ETH_TX_OFFLOAD_UDP_CKSUM |
+ RTE_ETH_TX_OFFLOAD_TCP_CKSUM,
+ },
+};
+
+enum {
+#define OPT_STATS "stats"
+ OPT_STATS_NUM,
+#define OPT_SOCKET_FILE "socket-file"
+ OPT_SOCKET_FILE_NUM,
+#define OPT_TX_CSUM "tx-csum"
+ OPT_TX_CSUM_NUM,
+#define OPT_NUM_MBUFS "total-num-mbufs"
+ OPT_NUM_MBUFS_NUM,
+};
+
+/**
+ * @brief Unregister all registered vhost drivers.
+ *
+ * Called during signal cleanup to ensure no stale sockets remain.
+ *
+ * @param socket_num Number of socket paths to unregister
+ */
+static void
+unregister_drivers(int socket_num)
+{
+ int i, ret;
+
+ for (i = 0; i < socket_num; i++) {
+ const char *path = socket_path + i * SOCKET_PATH_MAX;
+ ret = rte_vhost_driver_unregister(path);
+ if (ret != 0) {
+ RDMA_LOG_ERR("Failed to unregister vhost driver for socket %s\n", path);
+ } else {
+ RDMA_LOG_INFO("Unregistered socket: %s\n", path);
+ }
+ }
+}
+
+/**
+ * @brief Signal handler for graceful shutdown (SIGINT/SIGTERM).
+ *
+ * Cleans up vhost driver registrations and exits.
+ */
+static void
+vhost_rdma_signal_handler(int signum)
+{
+	if (signum != SIGINT && signum != SIGTERM)
+		return;
+
+	RDMA_LOG_INFO("Received signal %d, shutting down...\n", signum);
+	force_quit = true;
+
+	unregister_drivers(nb_sockets);
+	exit(0);
+}
+
+/**
+ * @brief Initialize an Ethernet port with given offload settings.
+ *
+ * Configures one RX/TX queue, sets up descriptor rings, starts the port.
+ *
+ * @param port_id The port identifier
+ * @param offload Whether to enable hardware offloads
+ * @return 0 on success, negative on failure
+ */
+static int
+vhost_rdma_init_port(uint16_t port_id, bool offload)
+{
+ int ret;
+ uint16_t nb_rxd = MAX_NB_RXD;
+ uint16_t nb_txd = MAX_NB_TXD;
+ struct rte_eth_dev_info dev_info;
+ struct rte_eth_conf port_conf = offload ? offload_port_config : default_port_config;
+ struct rte_eth_txconf txconf;
+ struct rte_ether_addr addr;
+ char mac_str[RTE_ETHER_ADDR_FMT_SIZE];
+
+ RDMA_LOG_INFO("Initializing port %u with %s offloads\n", port_id,
+ offload ? "enabled" : "disabled");
+
+ ret = rte_eth_dev_info_get(port_id, &dev_info);
+ if (ret < 0) {
+ RDMA_LOG_ERR("Failed to get device info for port %u\n", port_id);
+ goto out;
+ }
+
+ ret = rte_eth_dev_configure(port_id, 1, 1, &port_conf);
+ if (ret < 0) {
+ RDMA_LOG_ERR("Failed to configure port %u\n", port_id);
+ goto out;
+ }
+
+ ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd);
+ if (ret < 0) {
+ LOG_WARN("Failed to adjust number of descriptors for port %u\n", port_id);
+ }
+
+ ret = rte_eth_rx_queue_setup(port_id, 0, nb_rxd,
+ rte_eth_dev_socket_id(port_id),
+ NULL,
+ vhost_rdma_mbuf_pool);
+ if (ret < 0) {
+ RDMA_LOG_ERR("Failed to setup RX queue for port %u\n", port_id);
+ goto out;
+ }
+
+ txconf = dev_info.default_txconf;
+ txconf.offloads = port_conf.txmode.offloads;
+ ret = rte_eth_tx_queue_setup(port_id, 0, nb_txd,
+ rte_eth_dev_socket_id(port_id),
+ &txconf);
+ if (ret < 0) {
+ RDMA_LOG_ERR("Failed to setup TX queue for port %u\n", port_id);
+ goto out;
+ }
+
+ ret = rte_eth_dev_start(port_id);
+ if (ret < 0) {
+ RDMA_LOG_ERR("Failed to start port %u\n", port_id);
+ goto out;
+ }
+
+	ret = rte_eth_promiscuous_enable(port_id);
+	if (ret < 0) {
+		LOG_WARN("Failed to enable promiscuous mode on port %u\n", port_id);
+	}
+
+	ret = rte_eth_macaddr_get(port_id, &addr);
+	if (ret == 0) {
+		rte_ether_format_addr(mac_str, sizeof(mac_str), &addr);
+		RDMA_LOG_INFO("Port %u MAC address: %s\n", port_id, mac_str);
+	} else {
+		LOG_WARN("Could not read MAC address for port %u\n", port_id);
+	}
+
+	/* The warnings above are non-fatal: the port is started at this point */
+	return 0;
+
+out:
+	return ret;
+}
+
+/**
+ * @brief Print usage information.
+ */
+static void
+vhost_rdma_usage(const char *prgname)
+{
+	printf("%s [EAL options] --\n"
+	       "  --socket-file <path>   : Path to vhost-user socket (can be repeated)\n"
+	       "  --stats <N>            : Print stats every N seconds (0=disable)\n"
+	       "  --tx-csum <0|1>        : Disable/enable TX checksum offload\n"
+	       "  --total-num-mbufs <N>  : Total number of mbufs in pool (default: %lu)\n",
+	       prgname, NUM_MBUFS_DEFAULT);
+}
+
+/**
+ * @brief Parse a numeric option safely.
+ *
+ * @param q_arg Input string
+ * @param max_valid_value Maximum allowed value
+ * @return Parsed integer or -1 on error
+ */
+static int
+vhost_rdma_parse_num_opt(const char *q_arg, uint32_t max_valid_value)
+{
+	char *end = NULL;
+	unsigned long num;
+
+	if (q_arg == NULL || q_arg[0] == '\0')
+		return -1;
+
+	errno = 0;
+	num = strtoul(q_arg, &end, 10);
+
+	if (end == NULL || *end != '\0')
+		return -1;
+	if (errno != 0 || num > max_valid_value)
+		return -1;
+
+	return (int)num;
+}
+
+/**
+ * @brief Parse and store vhost socket path.
+ *
+ * Supports multiple sockets via repeated --socket-file.
+ *
+ * @param q_arg Socket file path
+ * @return 0 on success, -1 on failure
+ */
+static int
+vhost_rdma_parse_socket_path(const char *q_arg)
+{
+ char *old_ptr;
+
+ if (strnlen(q_arg, SOCKET_PATH_MAX) >= SOCKET_PATH_MAX) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Socket path too long: %s\n", q_arg);
+ return -1;
+ }
+
+ old_ptr = socket_path;
+ socket_path = realloc(socket_path, SOCKET_PATH_MAX * (nb_sockets + 1));
+ if (socket_path == NULL) {
+ free(old_ptr);
+ return -1;
+ }
+
+ strncpy(socket_path + nb_sockets * SOCKET_PATH_MAX, q_arg, SOCKET_PATH_MAX - 1);
+ socket_path[(nb_sockets + 1) * SOCKET_PATH_MAX - 1] = '\0';
+
+ RDMA_LOG_INFO("Registered socket[%d]: %s\n",
+ nb_sockets, socket_path + nb_sockets * SOCKET_PATH_MAX);
+
+ nb_sockets++;
+ return 0;
+}
+
+/**
+ * @brief Parse command-line arguments.
+ *
+ * Supported options:
+ * --socket-file, --stats, --tx-csum, --total-num-mbufs
+ *
+ * @param argc Argument count
+ * @param argv Argument vector
+ * @return 0 on success, -1 on failure
+ */
+static int
+vhost_rdma_parse_args(int argc, char **argv)
+{
+ int opt, ret;
+ int option_idx;
+ const char *prgname = argv[0];
+
+ static struct option lgopts[] = {
+ { "stats", required_argument, NULL, OPT_STATS_NUM },
+ { "socket-file", required_argument, NULL, OPT_SOCKET_FILE_NUM },
+ { "tx-csum", required_argument, NULL, OPT_TX_CSUM_NUM },
+		{ "total-num-mbufs", required_argument, NULL, OPT_NUM_MBUFS_NUM },
+ { NULL, 0, NULL, 0 }
+ };
+
+ while ((opt = getopt_long(argc, argv, "",
+ lgopts, &option_idx)) != EOF) {
+ switch (opt) {
+ case OPT_STATS_NUM:
+ ret = vhost_rdma_parse_num_opt(optarg, INT32_MAX);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid value for --stats\n");
+ vhost_rdma_usage(prgname);
+ return -1;
+ }
+ enable_stats = ret;
+ break;
+
+ case OPT_NUM_MBUFS_NUM:
+ ret = vhost_rdma_parse_num_opt(optarg, INT32_MAX);
+			if (ret <= 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid value for --total-num-mbufs\n");
+ vhost_rdma_usage(prgname);
+ return -1;
+ }
+ total_num_mbufs = ret;
+ break;
+
+ case OPT_SOCKET_FILE_NUM:
+ if (vhost_rdma_parse_socket_path(optarg) < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid socket path: %s\n", optarg);
+ vhost_rdma_usage(prgname);
+ return -1;
+ }
+ break;
+
+ case OPT_TX_CSUM_NUM:
+ ret = vhost_rdma_parse_num_opt(optarg, 1);
+ if (ret < 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "Invalid value for --tx-csum (must be 0 or 1)\n");
+ vhost_rdma_usage(prgname);
+ return -1;
+ }
+ enable_tx_csum = ret;
+ break;
+
+ default:
+ vhost_rdma_usage(prgname);
+ return -1;
+ }
+ }
+
+ if (nb_sockets == 0) {
+ RTE_LOG(ERR, VHOST_CONFIG, "At least one --socket-file must be specified.\n");
+ vhost_rdma_usage(prgname);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int
+vhost_rdma_main_loop(__rte_unused void *arg)
+{
+	while (!force_quit)
+		rte_pause();
+
+	return 0;
+}
+
+/**
+ * @brief Application entry point.
+ *
+ * Initializes EAL, parses args, sets up ports, mempools, rings,
+ * registers vhost drivers, launches threads.
+ */
+int main(int argc, char **argv)
+{
+ unsigned lcore_id, core_id = 0;
+ int ret;
+ uint16_t port_id;
+ bool pair_found = false;
+ struct rte_eth_dev_info dev_info;
+
+ force_quit = false;
+ enable_stats = 0;
+ enable_tx_csum = 0;
+
+ /* Register signal handler for clean shutdown */
+ signal(SIGINT, vhost_rdma_signal_handler);
+ signal(SIGTERM, vhost_rdma_signal_handler);
+
+ /* Initialize DPDK Environment Abstraction Layer */
+ ret = rte_eal_init(argc, argv);
+ if (ret < 0)
+ rte_panic("Unable to initialize DPDK EAL\n");
+
+ argc -= ret;
+ argv += ret;
+
+ rte_log_set_global_level(RTE_LOG_NOTICE);
+
+ /* Parse application-specific arguments */
+ if (vhost_rdma_parse_args(argc, argv) != 0) {
+ rte_exit(EXIT_FAILURE, "Argument parsing failed\n");
+ }
+
+ /* Initialize per-lcore data structures */
+ for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
+ TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
+ if (rte_lcore_is_enabled(lcore_id)) {
+ lcore_ids[core_id++] = lcore_id;
+ }
+ }
+
+ if (rte_lcore_count() < 2) {
+ rte_exit(EXIT_FAILURE, "At least two cores required (one main + one worker)\n");
+ }
+
+ /*
+ * Create shared memory pool for mbufs
+ * Used by both RX and TX paths
+ */
+ vhost_rdma_mbuf_pool = rte_pktmbuf_pool_create(
+ "mbuf_pool_shared",
+ total_num_mbufs,
+ MBUF_CACHE_SIZE,
+ sizeof(struct vhost_rdma_pkt_info),
+ MBUF_DATA_SIZE,
+ rte_socket_id());
+
+ if (vhost_rdma_mbuf_pool == NULL) {
+ rte_exit(EXIT_FAILURE, "Cannot create mbuf pool: %s\n", rte_strerror(rte_errno));
+ }
+
+ /*
+ * Create shared rings for packet exchange
+ * SP_ENQ: Single-producer enqueue (from NIC)
+ * MC_HTS_DEQ: Multi-consumer with HTS dequeue (to workers)
+ */
+ vhost_rdma_rx_ring = rte_ring_create(
+ "ring_rx_shared",
+ MAX_RING_COUNT,
+ rte_socket_id(),
+ RING_F_SP_ENQ | RING_F_MC_HTS_DEQ
+ );
+ if (vhost_rdma_rx_ring == NULL)
+ rte_exit(EXIT_FAILURE, "Failed to create RX ring: %s\n", rte_strerror(rte_errno));
+
+ vhost_rdma_tx_ring = rte_ring_create(
+ "ring_tx_shared",
+ MAX_RING_COUNT,
+ rte_socket_id(),
+ RING_F_MP_HTS_ENQ | RING_F_SC_DEQ
+ );
+ if (vhost_rdma_tx_ring == NULL)
+ rte_exit(EXIT_FAILURE, "Failed to create TX ring: %s\n", rte_strerror(rte_errno));
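+	/*
+	 * In the HTS (head/tail sync) ring modes used above, concurrent
+	 * lcores are serialized on a combined head/tail update, which stays
+	 * safe even when a producer or consumer thread is preempted.
+	 */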
+
+ /*
+ * Find and initialize backend Ethernet device (e.g., net_tap or net_vhost)
+ */
+ RTE_ETH_FOREACH_DEV(port_id) {
+ ret = rte_eth_dev_info_get(port_id, &dev_info);
+ if (ret != 0) {
+ RDMA_LOG_ERR("Failed to get info for port %u\n", port_id);
+ continue;
+ }
+
+ if (!pair_found &&
+ (strcmp(dev_info.driver_name, "net_tap") == 0 ||
+ strcmp(dev_info.driver_name, "net_vhost") == 0)) {
+
+ pair_port_id = port_id;
+ pair_found = true;
+
+ ret = vhost_rdma_init_port(port_id, !!enable_tx_csum);
+ if (ret != 0) {
+ rte_exit(EXIT_FAILURE, "Failed to initialize port %u: %s\n",
+ port_id, rte_strerror(-ret));
+ }
+
+ RDMA_LOG_INFO("Using device %s (port %u) as backend interface\n",
+ dev_info.device->name, port_id);
+ }
+ }
+
+ if (!pair_found) {
+ rte_exit(EXIT_FAILURE, "No suitable backend Ethernet device found\n");
+ }
+
+ /*
+ * Setup per-vhost-device resources and register vhost drivers
+ */
+ char name_buf[SOCKET_PATH_MAX];
+ for (int i = 0; i < nb_sockets; i++) {
+ const char *sock_path = socket_path + i * SOCKET_PATH_MAX;
+ struct vhost_rdma_device *dev = &g_vhost_rdma_dev[i];
+
+ dev->vid = i;
+
+ if (i == 0) {
+ /* Use shared resources for first device */
+ dev->rx_ring = vhost_rdma_rx_ring;
+ dev->tx_ring = vhost_rdma_tx_ring;
+ dev->mbuf_pool = vhost_rdma_mbuf_pool;
+ } else {
+ /* Create dedicated resources for additional devices */
+ snprintf(name_buf, sizeof(name_buf), "dev%u_rx_ring", i);
+ dev->rx_ring = rte_ring_create(name_buf, MAX_RING_COUNT,
+ rte_socket_id(), RING_F_SP_ENQ | RING_F_MC_HTS_DEQ);
+ if (!dev->rx_ring)
+ rte_exit(EXIT_FAILURE, "Failed to create RX ring %d\n", i);
+
+ snprintf(name_buf, sizeof(name_buf), "dev%u_tx_ring", i);
+ dev->tx_ring = rte_ring_create(name_buf, MAX_RING_COUNT,
+ rte_socket_id(), RING_F_MP_HTS_ENQ | RING_F_SC_DEQ);
+ if (!dev->tx_ring)
+ rte_exit(EXIT_FAILURE, "Failed to create TX ring %d\n", i);
+
+ snprintf(name_buf, sizeof(name_buf), "dev%u_mbuf_pool", i);
+ dev->mbuf_pool = rte_pktmbuf_pool_create(name_buf,
+ total_num_mbufs,
+ MBUF_CACHE_SIZE,
+ sizeof(struct vhost_rdma_pkt_info),
+ MBUF_DATA_SIZE,
+ rte_socket_id());
+ if (!dev->mbuf_pool)
+ rte_exit(EXIT_FAILURE, "Failed to create mbuf pool %d\n", i);
+ }
+
+ snprintf(name_buf, sizeof(name_buf), "dev%u_task_ring", i);
+ dev->task_ring = rte_ring_create(name_buf, MAX_RING_COUNT,
+ rte_socket_id(),
+ RING_F_MP_HTS_ENQ | RING_F_MC_HTS_DEQ);
+ if (!dev->task_ring)
+ rte_exit(EXIT_FAILURE, "Failed to create task ring %d\n", i);
+
+ /* Construct and register vhost device */
+ ret = vhost_rdma_construct(dev, sock_path, i);
+ if (ret < 0) {
+ RDMA_LOG_ERR("Failed to construct vhost device %d\n", i);
+ continue;
+ }
+
+ ret = rte_vhost_driver_start(sock_path);
+ if (ret < 0) {
+ RDMA_LOG_ERR("Failed to start vhost driver for %s\n", sock_path);
+ } else {
+ RDMA_LOG_INFO("Successfully started vhost driver: %s\n", sock_path);
+ }
+ }
+
+	/* Run the main loop until a shutdown signal arrives */
+	vhost_rdma_main_loop(NULL);
+
+	/* Wait for all worker threads to complete */
+	RTE_LCORE_FOREACH_WORKER(lcore_id) {
+		rte_eal_wait_lcore(lcore_id);
+	}
+
+ /* Cleanup */
+ rte_eal_cleanup();
+ free(socket_path);
+
+ RDMA_LOG_INFO("Application terminated gracefully.\n");
+ return 0;
+}
diff --git a/examples/vhost_user_rdma/meson.build b/examples/vhost_user_rdma/meson.build
new file mode 100644
index 0000000000..d6ccaf32a4
--- /dev/null
+++ b/examples/vhost_user_rdma/meson.build
@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 Intel Corporation
+
+# meson file, for building this example as part of a main DPDK build.
+#
+# To build this example as a standalone application with an already-installed
+# DPDK instance, use 'make'
+
+if not is_linux
+ build = false
+ subdir_done()
+endif
+
+deps += ['vhost', 'timer']
+
+allow_experimental_apis = true
+
+cflags_options = [
+ '-std=c11',
+ '-Wno-strict-prototypes',
+ '-Wno-pointer-arith',
+ '-Wno-maybe-uninitialized',
+ '-Wno-discarded-qualifiers',
+ '-Wno-old-style-definition',
+ '-Wno-sign-compare',
+ '-Wno-stringop-overflow',
+ '-O3',
+ '-g',
+ '-DALLOW_EXPERIMENTAL_API',
+ '-DDEBUG_RDMA',
+ '-DDEBUG_RDMA_DP',
+]
+
+foreach option:cflags_options
+ if cc.has_argument(option)
+ cflags += option
+ endif
+endforeach
+
+sources = files(
+ 'main.c',
+ 'vhost_rdma.c',
+ 'vhost_rdma_ib.c',
+)
+
diff --git a/examples/vhost_user_rdma/vhost_rdma.c b/examples/vhost_user_rdma/vhost_rdma.c
new file mode 100644
index 0000000000..2cf47a6baa
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma.c
@@ -0,0 +1,697 @@
+/*
+ * Vhost-user RDMA device : init and packets forwarding
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <unistd.h>
+#include <stdlib.h>
+
+#include <rte_malloc.h>
+#include <rte_bitmap.h>
+#include <rte_common.h>
+#include <rte_ring.h>
+#include <rte_vhost.h>
+#include <rte_malloc.h>
+
+#include "vhost_rdma.h"
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma_pkt.h"
+#include "vhost_rdma_log.h"
+
+/* Upper bound for vid-indexed lookups; must not exceed the global device arrays */
+#define VHOST_MAX_DEVICES MAX_VHOST_RDMA_DEV_NUM
+
+struct vhost_rdma_device g_vhost_rdma_dev[MAX_VHOST_RDMA_DEV_NUM];
+struct vhost_rdma_net_dev g_vhost_rdma_net_dev[MAX_VHOST_RDMA_DEV_NUM];
+
+/**
+ * @brief Install required vhost-user protocol features for RDMA device.
+ *
+ * Enables CONFIG and MQ features which are essential for multi-queue
+ * and configuration space access in vhost-user frontend.
+ *
+ * @param path Socket or VFIO device path used by vhost driver
+ */
+static void
+vhost_rdma_install_rte_compat_hooks(const char *path)
+{
+ uint64_t protocol_features = 0;
+
+ if (!path) {
+ RDMA_LOG_ERR("Invalid path parameter");
+ return;
+ }
+
+ /* Retrieve current protocol features */
+ if (rte_vhost_driver_get_protocol_features(path, &protocol_features) < 0) {
+ RDMA_LOG_DEBUG("Failed to get protocol features for %s, assuming 0", path);
+ protocol_features = 0;
+ }
+
+ /* Enable mandatory features */
+	protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_CONFIG); /* GET/SET_CONFIG */
+	protocol_features |= (1ULL << VHOST_USER_PROTOCOL_F_MQ);     /* multi-queue support */
+
+ if (rte_vhost_driver_set_protocol_features(path, protocol_features) < 0) {
+ RDMA_LOG_ERR("Failed to set protocol features on %s", path);
+ } else {
+ RDMA_LOG_DEBUG("Enabled CONFIG and MQ features for %s", path);
+ }
+}
+
+/**
+ * @brief Construct a net device with given queues.
+ *
+ * Initializes the per-device queue mapping and state.
+ *
+ * @param queues Array of vhost-user queues
+ * @param idx Device index
+ */
+void
+vhost_rdma_net_construct(struct vhost_user_queue *queues, int idx)
+{
+ if (idx < 0 || idx >= VHOST_MAX_DEVICES) {
+ RDMA_LOG_ERR("Invalid device index: %d", idx);
+ return;
+ }
+
+ if (!queues) {
+ RDMA_LOG_ERR("NULL queues pointer for device %d", idx);
+ return;
+ }
+
+ g_vhost_rdma_net_dev[idx].queues = queues;
+ g_vhost_rdma_net_dev[idx].started = false;
+
+ RDMA_LOG_DEBUG("Net device %d constructed with queues=%p", idx, queues);
+}
+
+/**
+ * @brief Initialize an object pool with bitmap-based allocation tracking.
+ *
+ * Allocates contiguous memory for `num` objects and a bitmap to track usage.
+ * Optionally reserves index 0 (when !start_zero), useful for representing invalid handles.
+ *
+ * @param pool [out] Pool structure to initialize
+ * @param name Name used in memory allocation (can be NULL)
+ * @param num Number of objects to allocate
+ * @param size Size of each object
+ * @param start_zero If true, index 0 is usable; else reserved
+ * @param cleanup Optional callback called on free (can be NULL)
+ *
+ * @return 0 on success, -1 on failure
+ */
+int
+vhost_rdma_pool_init(struct vhost_rdma_pool *pool,
+ const char *name,
+ uint32_t num,
+ uint32_t size,
+ bool start_zero,
+ void (*cleanup)(void *))
+{
+ void *mem = NULL;
+ uint32_t actual_num;
+ struct rte_bitmap *bmp = NULL;
+ const char *pool_name = name ? name : "vhost_rdma_pool";
+
+ if (!pool || num == 0 || size == 0) {
+ RDMA_LOG_ERR("Invalid parameters: pool=%p, num=%u, size=%u", pool, num, size);
+ return -1;
+ }
+
+ /* Adjust total number: reserve index 0 if needed */
+ actual_num = start_zero ? num : num + 1;
+
+ /* Allocate object storage */
+ pool->objs = rte_zmalloc(pool_name, actual_num * size, RTE_CACHE_LINE_SIZE);
+ if (!pool->objs) {
+ RDMA_LOG_ERR("Failed to allocate %u * %u bytes for objects", actual_num, size);
+ goto err_objs;
+ }
+
+ /* Allocate bitmap metadata */
+ uint32_t bmp_size = rte_bitmap_get_memory_footprint(actual_num);
+ mem = rte_zmalloc(pool_name, bmp_size, RTE_CACHE_LINE_SIZE);
+ if (!mem) {
+ RDMA_LOG_ERR("Failed to allocate %u bytes for bitmap", bmp_size);
+ goto err_bmp_mem;
+ }
+
+ /* Initialize bitmap */
+ bmp = rte_bitmap_init(actual_num, mem, bmp_size);
+ if (!bmp) {
+ RDMA_LOG_ERR("Failed to init bitmap with %u bits", actual_num);
+ goto err_bmp_init;
+ }
+
+ /* Mark all slots as FREE (bitmap: SET = free) */
+ for (uint32_t i = 0; i < actual_num; i++) {
+ rte_bitmap_set(bmp, i);
+ }
+
+ /* Reserve index 0 if not starting from zero */
+ if (!start_zero) {
+ rte_bitmap_clear(bmp, 0); /* Now allocated/reserved */
+ }
+
+ /* Finalize pool setup */
+ pool->bitmap = bmp;
+ pool->bitmap_mem = mem;
+ pool->num = actual_num;
+ pool->size = size;
+ pool->cleanup = cleanup;
+
+ RDMA_LOG_DEBUG("Pool '%s' initialized: %u entries, obj_size=%u, start_zero=%d",
+ pool_name, actual_num, size, start_zero);
+
+ return 0;
+
+err_bmp_init:
+ rte_free(mem);
+err_bmp_mem:
+ rte_free(pool->objs);
+err_objs:
+ return -1;
+}
+
+/**
+ * @brief Get pointer to object at given index if it is currently allocated.
+ *
+ * Does NOT check thread safety.
+ *
+ * @param pool Pool instance
+ * @param idx Object index
+ * @return Pointer to object if allocated, NULL otherwise or if out-of-bounds
+ */
+void *
+vhost_rdma_pool_get(struct vhost_rdma_pool *pool, uint32_t idx)
+{
+ if (!pool || idx >= pool->num) {
+ RDMA_LOG_DEBUG("Invalid pool or index: pool=%p, idx=%u, num=%u",
+ pool, idx, pool ? pool->num : 0);
+ return NULL;
+ }
+
+ /* Bitmap: SET = free, CLEAR = allocated */
+ if (rte_bitmap_get(pool->bitmap, idx)) {
+ RDMA_LOG_DEBUG("Object at index %u is free, cannot get", idx);
+ return NULL;
+ }
+
+ return RTE_PTR_ADD(pool->objs, idx * pool->size);
+}
+
+/**
+ * @brief Allocate a new object from the pool.
+ *
+ * Finds the first available slot, clears its bit (marks as used), optionally zeroes memory,
+ * and returns a pointer. Also outputs the assigned index via `idx` parameter.
+ *
+ * @param pool Pool to allocate from
+ * @param idx [out] Assigned index (optional, pass NULL if not needed)
+ * @return Pointer to allocated object, or NULL if no space
+ */
+void *
+vhost_rdma_pool_alloc(struct vhost_rdma_pool *pool, uint32_t *idx)
+{
+ uint32_t pos = 0;
+ uint64_t slab = 0;
+ void *obj;
+
+ if (!pool) {
+ RDMA_LOG_ERR("NULL pool");
+ return NULL;
+ }
+
+ __rte_bitmap_scan_init(pool->bitmap);
+ int found = rte_bitmap_scan(pool->bitmap, &pos, &slab);
+ if (!found) {
+ RDMA_LOG_DEBUG("No free objects in pool");
+ return NULL;
+ }
+
+ uint32_t allocated_idx = pos + __builtin_ctzll(slab);
+ obj = RTE_PTR_ADD(pool->objs, allocated_idx * pool->size);
+
+ /* Zero-initialize new object */
+ memset(obj, 0, pool->size);
+
+ /* Mark as allocated */
+ rte_bitmap_clear(pool->bitmap, allocated_idx);
+
+ if (idx) {
+ *idx = allocated_idx;
+ }
+
+ RDMA_LOG_DEBUG("Allocated object at index %u", allocated_idx);
+ return obj;
+}
+
+/**
+ * @brief Free an object back into the pool.
+ *
+ * Calls optional cleanup callback before releasing.
+ * Not thread-safe — must be externally synchronized.
+ *
+ * @param pool Pool containing the object
+ * @param idx Index of object to free
+ */
+void
+vhost_rdma_pool_free(struct vhost_rdma_pool *pool, uint32_t idx)
+{
+ if (!pool || idx >= pool->num) {
+ RDMA_LOG_ERR("Invalid pool or index: pool=%p, idx=%u", pool, idx);
+ return;
+ }
+
+ void *obj = vhost_rdma_pool_get(pool, idx);
+ if (!obj) {
+ RDMA_LOG_DEBUG("Index %u already free, skipping", idx);
+ return; /* Idempotent: already free */
+ }
+
+ /* Call user-defined cleanup hook */
+ if (pool->cleanup) {
+ pool->cleanup(obj);
+ }
+
+ /* Return to free list */
+ rte_bitmap_set(pool->bitmap, idx);
+
+ RDMA_LOG_DEBUG("Freed object at index %u", idx);
+}
+
+/**
+ * @brief Destroy the entire pool and release all memory.
+ *
+ * WARNING: Caller must ensure no live references exist.
+ * Does NOT call cleanup() on remaining live objects.
+ *
+ * @param pool Pool to destroy
+ */
+void
+vhost_rdma_pool_destroy(struct vhost_rdma_pool *pool)
+{
+ if (!pool) {
+ return;
+ }
+
+ if (pool->bitmap_mem) {
+ rte_bitmap_free(pool->bitmap); /* Frees internal state too */
+ rte_free(pool->bitmap_mem);
+ pool->bitmap = NULL;
+ pool->bitmap_mem = NULL;
+ }
+
+ if (pool->objs) {
+ rte_free(pool->objs);
+ pool->objs = NULL;
+ }
+
+ pool->num = 0;
+ pool->size = 0;
+ pool->cleanup = NULL;
+
+ RDMA_LOG_DEBUG("Pool destroyed");
+}
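+
+/*
+ * Typical pool lifecycle (an illustrative sketch; the pool name and object
+ * type are placeholders):
+ *
+ *	struct vhost_rdma_pool pool;
+ *	uint32_t idx;
+ *
+ *	vhost_rdma_pool_init(&pool, "example_pool", 64,
+ *			     sizeof(struct vhost_rdma_mr), false, NULL);
+ *	struct vhost_rdma_mr *obj = vhost_rdma_pool_alloc(&pool, &idx);
+ *	... use obj; index 0 stays reserved because start_zero is false ...
+ *	vhost_rdma_pool_free(&pool, idx);
+ *	vhost_rdma_pool_destroy(&pool);
+ */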
+
+/**
+ * @brief Set up the vhost-user network backend for a given device.
+ *
+ * Initializes guest memory mapping, negotiates features (e.g., merged RX buffers),
+ * sets header length accordingly, disables unnecessary notifications during setup,
+ * and marks the device as started.
+ *
+ * @param vid Vhost device ID (from rte_vhost driver)
+ */
+void
+vs_vhost_rdma_net_setup(int vid)
+{
+ struct vhost_rdma_net_dev *dev;
+ uint64_t negotiated_features = 0;
+ int ret;
+
+ /* Validate input */
+ if (vid < 0 || vid >= VHOST_MAX_DEVICES) {
+ RDMA_LOG_ERR("Invalid vhost device ID: %d", vid);
+ return;
+ }
+
+	dev = &g_vhost_rdma_net_dev[vid];
+
+ /* Initialize device context */
+ dev->vid = vid;
+ dev->started = false;
+
+ /* Step 1: Get negotiated VirtIO features */
+ if (rte_vhost_get_negotiated_features(vid, &negotiated_features) < 0) {
+ RDMA_LOG_ERR("Failed to get negotiated features for vid=%d", vid);
+ return;
+ }
+ dev->features = negotiated_features;
+
+ /* Step 2: Determine virtio-net header size based on features */
+ if (negotiated_features & ((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
+ (1ULL << VIRTIO_F_VERSION_1))) {
+ dev->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ RDMA_LOG_DEBUG("Using merged RX buffer header (size=%zu) for vid=%d",
+ dev->hdr_len, vid);
+ } else {
+ dev->hdr_len = sizeof(struct virtio_net_hdr);
+ RDMA_LOG_DEBUG("Using standard net header (size=%zu) for vid=%d",
+ dev->hdr_len, vid);
+ }
+
+ /* Step 3: Get guest memory table (VA->GPA/HPA translation) */
+	if (dev->mem) {
+		free(dev->mem); /* rte_vhost_get_mem_table() allocates with malloc() */
+		dev->mem = NULL;
+	}
+
+ ret = rte_vhost_get_mem_table(vid, &dev->mem);
+ if (ret < 0 || dev->mem == NULL) {
+ RDMA_LOG_ERR("Failed to retrieve guest memory layout for vid=%d", vid);
+ return;
+ }
+
+ RDMA_LOG_INFO("Guest memory table acquired: %u regions mapped", dev->mem->nregions);
+
+ /* Step 4: Disable guest notification during initial setup */
+	ret = rte_vhost_enable_guest_notification(vid, VHOST_NET_RXQ, 0);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to disable RX queue guest notification for vid=%d", vid);
+	}
+
+	ret = rte_vhost_enable_guest_notification(vid, VHOST_NET_TXQ, 0);
+	if (ret < 0) {
+		RDMA_LOG_ERR("Failed to disable TX queue guest notification for vid=%d", vid);
+	}
+
+ /* Final step: Mark device as ready */
+ dev->started = true;
+
+ RDMA_LOG_INFO("vhost-user net device vid=%d setup completed successfully", vid);
+}
+
+/**
+ * @brief Callback: A new vhost-user device has been negotiated and is ready for setup.
+ *
+ * This function initializes the backend RDMA device context, sets up networking parameters,
+ * allocates required resources, and marks the device as started.
+ *
+ * @param vid Vhost device identifier assigned by rte_vhost
+ * @return 0 on success, negative on failure (though return value is often ignored)
+ */
+static int
+vhost_rdma_new_device(int vid)
+{
+ struct vhost_rdma_device *dev;
+
+ /* Validate device ID */
+ if (vid < 0 || vid >= VHOST_MAX_DEVICES) {
+ RDMA_LOG_ERR("Invalid vhost device ID: %d", vid);
+ return -1;
+ }
+
+ dev = &g_vhost_rdma_dev[vid];
+
+ /* Avoid re-initializing an already started device */
+ if (dev->started) {
+ RDMA_LOG_DEBUG("Device vid=%d already started, skipping initialization", vid);
+ return 0;
+ }
+
+ /* Setup network layer: features, header size, memory table */
+ vs_vhost_rdma_net_setup(vid);
+
+ /* Finalize device state */
+ dev->vid = vid;
+ dev->started = true;
+ dev->stopped = false;
+
+ RDMA_LOG_INFO("New vhost-RDMA device created: vid=%d", vid);
+ return 0;
+}
+
+/**
+ * @brief Clean up guest memory mapping for a vhost device.
+ *
+ * Frees memory allocated by rte_vhost_get_mem_table().
+ * Safe to call multiple times (idempotent).
+ *
+ * @param vid Device ID
+ */
+static void
+vs_vhost_rdma_net_remove(int vid)
+{
+ struct vhost_rdma_net_dev *net_dev;
+
+ if (vid < 0 || vid >= VHOST_MAX_DEVICES) {
+ RDMA_LOG_ERR("Invalid device ID in net_remove: %d", vid);
+ return;
+ }
+
+ net_dev = &g_vhost_rdma_net_dev[vid];
+
+	if (net_dev->mem) {
+		RDMA_LOG_DEBUG("Freeing guest memory table for vid=%d", vid);
+		free(net_dev->mem); /* rte_vhost_get_mem_table() allocates with malloc() */
+		net_dev->mem = NULL;
+	} else {
+		RDMA_LOG_DEBUG("No memory table to free for vid=%d", vid);
+	}
+}
+
+/**
+ * @brief Destroy and release all resources associated with a vhost-RDMA device.
+ *
+ * Called when frontend disconnects or device is removed.
+ * Ensures safe teardown of IB context, queues, memory mappings, and notification states.
+ *
+ * @param vid Vhost device ID
+ */
+static void
+vhost_rdma_destroy_device(int vid)
+{
+ struct vhost_rdma_device *dev;
+ struct vhost_user_queue *vq;
+ unsigned int lcore_id;
+
+ if (vid < 0 || vid >= VHOST_MAX_DEVICES) {
+ RDMA_LOG_ERR("Attempted to destroy invalid device ID: %d", vid);
+ return;
+ }
+
+ dev = &g_vhost_rdma_dev[vid];
+
+ if (!dev->started) {
+ RDMA_LOG_DEBUG("Device vid=%d not started, nothing to destroy", vid);
+ return;
+ }
+
+ /* Mark device as stopping */
+ dev->started = false;
+ dev->stopped = true;
+
+ RDMA_LOG_INFO("Destroying vhost-RDMA device: vid=%d", vid);
+
+	/*
+	 * Wait until no datapath thread still holds a reference to the
+	 * device before tearing it down.
+	 */
+	while (dev->inuse > 0) {
+		lcore_id = rte_lcore_id();
+		if (lcore_id != LCORE_ID_ANY) {
+			rte_pause(); /* Yield CPU time on a polling lcore */
+		} else {
+			rte_delay_us_block(100); /* Background thread sleep */
+		}
+	}
+
+ /* Step 1: Remove from network subsystem */
+ vs_vhost_rdma_net_remove(vid);
+
+ /* Step 2: Destroy InfiniBand/RDMA components (QP, CQ, MR cleanup) */
+ vhost_rdma_destroy_ib(dev);
+
+ /* Step 3: Persist vring indices before shutdown */
+ for (int i = 0; i < NUM_VHOST_QUEUES; i++) {
+ vq = &dev->vqs[i];
+
+ if (vq->enabled) {
+ int ret = rte_vhost_set_vring_base(dev->vid, i,
+ vq->last_avail_idx,
+ vq->last_used_idx);
+ if (ret < 0) {
+ RDMA_LOG_ERR("Failed to save vring base for queue %d", i);
+ }
+
+ vq->enabled = false;
+ RDMA_LOG_DEBUG("Disabled vring %d", i);
+ }
+ }
+
+ /* Step 4: Free per-device memory table (if any) */
+	if (dev->mem) {
+		RDMA_LOG_DEBUG("Freeing device memory table for vid=%d", vid);
+		free(dev->mem); /* allocated with malloc() by rte_vhost_get_mem_table() */
+		dev->mem = NULL;
+	}
+
+ RDMA_LOG_INFO("vhost-RDMA device destroyed successfully: vid=%d", vid);
+}
+
+static enum rte_vhost_msg_result
+extern_vhost_pre_msg_handler(int vid, void *_msg)
+{
+ struct vhost_rdma_device *dev;
+ struct vhost_user_rdma_msg *msg = _msg;
+
+ dev = &g_vhost_rdma_dev[vid];
+
+ switch ((int)msg->request) {
+ case VHOST_USER_GET_VRING_BASE:
+ case VHOST_USER_SET_VRING_BASE:
+ case VHOST_USER_SET_VRING_ADDR:
+ case VHOST_USER_SET_VRING_NUM:
+ case VHOST_USER_SET_VRING_KICK:
+ case VHOST_USER_SET_VRING_CALL:
+ case VHOST_USER_SET_MEM_TABLE:
+ break;
+ case VHOST_USER_GET_CONFIG: {
+ rte_memcpy(msg->payload.cfg.region, &dev->rdma_config, sizeof(dev->rdma_config));
+ return RTE_VHOST_MSG_RESULT_REPLY;
+ }
+ case VHOST_USER_SET_CONFIG:
+ default:
+ break;
+ }
+
+ return RTE_VHOST_MSG_RESULT_NOT_HANDLED;
+}
+
+struct rte_vhost_user_extern_ops g_extern_vhost_ops = {
+ .pre_msg_handle = extern_vhost_pre_msg_handler,
+};
+
+static int vhost_rdma_new_connection(int vid)
+{
+ int ret = 0;
+
+ ret = rte_vhost_extern_callback_register(vid, &g_extern_vhost_ops, NULL);
+ if (ret != 0)
+ RDMA_LOG_ERR(
+ "rte_vhost_extern_callback_register failed for vid = %d\n",
+ vid);
+
+ g_vhost_rdma_dev[vid].vid = vid;
+ return ret;
+}
+
+static int vhost_rdma_vring_state_changed(int vid, uint16_t queue_id, int enable)
+{
+ struct vhost_rdma_device *dev = &g_vhost_rdma_dev[vid];
+ struct vhost_user_queue *vq;
+
+ assert(dev->vid == vid);
+
+ if (enable) {
+ vq = &dev->vqs[queue_id];
+
+ if (vq->enabled)
+ return 0;
+
+ vq->id = queue_id;
+
+		if (rte_vhost_get_vhost_vring(dev->vid, queue_id, &vq->vring) != 0) {
+			RDMA_LOG_ERR("Failed to get vring %u for vid=%d", queue_id, vid);
+			return -1;
+		}
+
+		if (rte_vhost_get_vring_base(dev->vid, queue_id,
+					     &vq->last_avail_idx,
+					     &vq->last_used_idx) != 0) {
+			RDMA_LOG_ERR("Failed to get vring base %u for vid=%d", queue_id, vid);
+			return -1;
+		}
+
+ vq->enabled = true;
+		/*
+		 * The ctrl handler MUST be started when the virtqueue is
+		 * enabled, not in new_device(): the driver queries some
+		 * information through the ctrl vq in ib_register_device()
+		 * before the device is enabled.
+		 */
+		if (queue_id == VHOST_NET_ROCE_CTRL_QUEUE && !dev->ctrl_intr_registered) {
+			if (rte_vhost_get_mem_table(vid, &dev->mem) != 0 ||
+			    dev->mem == NULL) {
+				RDMA_LOG_ERR("Failed to get mem table for vid=%d", vid);
+				return -1;
+			}
+
+ dev->ctrl_intr_handle.fd = dev->vqs[VHOST_NET_ROCE_CTRL_QUEUE].vring.kickfd;
+ dev->ctrl_intr_handle.type = RTE_INTR_HANDLE_EXT;
+ rte_intr_callback_register(&dev->ctrl_intr_handle,
+ vhost_rdma_handle_ctrl_vq, dev);
+ dev->ctrl_intr_registered = 1;
+ }
+ }
+ return 0;
+}
+
+static const struct rte_vhost_device_ops vhost_rdma_device_ops = {
+ .new_device = vhost_rdma_new_device,
+ .destroy_device = vhost_rdma_destroy_device,
+ .new_connection = vhost_rdma_new_connection,
+ .vring_state_changed = vhost_rdma_vring_state_changed,
+};
+
+int vhost_rdma_construct(struct vhost_rdma_device *dev, const char *path, int idx)
+{
+ int ret;
+
+ unlink(path);
+
+ ret = rte_vhost_driver_register(path, 0);
+ if (ret != 0) {
+		RDMA_LOG_ERR("Failed to register vhost driver for socket %s\n", path);
+ return ret;
+ }
+
+ ret = rte_vhost_driver_set_features(path, VHOST_RDMA_FEATURE);
+ if (ret != 0) {
+ RDMA_LOG_ERR("Set vhost driver features failed\n");
+ rte_vhost_driver_unregister(path);
+ return ret;
+ }
+
+ dev->stopped = false;
+ dev->inuse = 0;
+
+ /* set vhost user protocol features */
+ vhost_rdma_install_rte_compat_hooks(path);
+
+ dev->rdma_vqs = &dev->vqs[VHOST_NET_ROCE_CTRL_QUEUE];
+
+ vhost_rdma_net_construct(dev->vqs, idx);
+
+ vhost_rdma_init_ib(dev);
+ rte_spinlock_init(&dev->port_lock);
+
+ rte_vhost_driver_callback_register(path,
+ &vhost_rdma_device_ops);
+
+ for (int i = 0; i < VHOST_RDMA_NUM_OF_COUNTERS; i++) {
+ rte_atomic64_init(&dev->stats_counters[i]);
+ }
+
+	if (dev->tx_ring) {
+		ret = rte_eal_mp_remote_launch(vhost_rdma_task_scheduler, dev, SKIP_MAIN);
+		if (ret != 0)
+			RDMA_LOG_ERR("Failed to launch task scheduler workers: %d\n", ret);
+	}
+
+ return 0;
+}
diff --git a/examples/vhost_user_rdma/vhost_rdma.h b/examples/vhost_user_rdma/vhost_rdma.h
new file mode 100644
index 0000000000..c1531d1a7a
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma.h
@@ -0,0 +1,444 @@
+/*
+ * Vhost-user RDMA device : init and packets forwarding
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef VHOST_RDMA_H_
+#define VHOST_RDMA_H_
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <rte_byteorder.h>
+#include <rte_common.h>
+#include <rte_vhost.h>
+#include <rte_interrupts.h>
+#include <rte_atomic.h>
+#include <rte_spinlock.h>
+#include <rte_mempool.h>
+#include <rte_ring.h>
+#include <rte_bitmap.h>
+
+#include "vhost_rdma_ib.h"
+#include "eal_interrupts.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief Number of vhost queues.
+ *
+ * One CTRL VQ + 64 CQ + 64 TX + 64 RX event queues
+ */
+#define NUM_VHOST_QUEUES 193
+
+/**
+ * @brief Maximum GID table length
+ */
+#define VHOST_MAX_GID_TBL_LEN 512
+
+/**
+ * @brief Port PKey table length (single entry for default)
+ */
+#define VHOST_PORT_PKEY_TBL_LEN 1
+
+/**
+ * @brief Number of RDMA ports supported (currently only one)
+ */
+#define NUM_OF_VHOST_RDMA_PORT 1
+
+
+#define MAX_VHOST_RDMA_DEV_NUM 16
+
+#define VIRTIO_NET_F_ROCE 48
+
+#define VHOST_NET_ROCE_CTRL_QUEUE 0
+
+#define VHOST_RDMA_GID_TYPE_ILLIGAL (-1u)
+
+#define DEFAULT_IB_MTU VHOST_RDMA_IB_MTU_1024
+
+#define VHOST_NET_RXQ 0
+#define VHOST_NET_TXQ 1
+
+/* VIRTIO_F_EVENT_IDX is NOT supported now */
+#define VHOST_RDMA_FEATURE ((1ULL << VIRTIO_F_VERSION_1) |\
+ (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | \
+ (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
+ (1ULL << VHOST_USER_PROTOCOL_F_STATUS) | \
+ (1ULL << VIRTIO_NET_F_CTRL_VQ) | \
+ (1ULL << VIRTIO_NET_F_ROCE))
+
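+/**
+ * @brief Round n up to the next power of two (values below 2 are returned
+ * unchanged).
+ */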
+static __rte_always_inline uint32_t
+roundup_pow_of_two(uint32_t n)
+{
+	return n < 2 ? n : (1u << (32 - __builtin_clz(n - 1)));
+}
+
+/**
+ * @brief Counter types for statistics in vhost RDMA device
+ */
+enum vhost_rdma_counters {
+ VHOST_RDMA_CNT_SENT_PKTS,
+ VHOST_RDMA_CNT_RCVD_PKTS,
+ VHOST_RDMA_CNT_DUP_REQ,
+ VHOST_RDMA_CNT_OUT_OF_SEQ_REQ,
+ VHOST_RDMA_CNT_RCV_RNR,
+ VHOST_RDMA_CNT_SND_RNR,
+ VHOST_RDMA_CNT_RCV_SEQ_ERR,
+ VHOST_RDMA_CNT_COMPLETER_SCHED,
+ VHOST_RDMA_CNT_RETRY_EXCEEDED,
+ VHOST_RDMA_CNT_RNR_RETRY_EXCEEDED,
+ VHOST_RDMA_CNT_COMP_RETRY,
+ VHOST_RDMA_CNT_SEND_ERR,
+ VHOST_RDMA_CNT_LINK_DOWNED,
+ VHOST_RDMA_CNT_RDMA_SEND,
+ VHOST_RDMA_CNT_RDMA_RECV,
+ VHOST_RDMA_NUM_OF_COUNTERS
+};
+
+struct vhost_rdma_net_dev {
+ int vid;
+ uint64_t features;
+ size_t hdr_len;
+ bool started;
+ struct rte_vhost_memory *mem;
+ struct vhost_user_queue *queues;
+} __rte_cache_aligned;
+
+struct vhost_user_queue {
+ struct rte_vhost_vring vring;
+ uint16_t last_avail_idx;
+ uint16_t last_used_idx;
+ uint16_t id;
+ bool enabled;
+};
+
+/**
+ * @brief Configuration structure exposed to guest via virtio config space
+ *
+ * All fields are in little-endian byte order.
+ */
+struct vhost_rdma_config {
+ uint32_t phys_port_cnt; /**< Physical port count */
+ uint64_t sys_image_guid; /**< System image GUID */
+ uint32_t vendor_id; /**< Vendor ID */
+ uint32_t vendor_part_id; /**< Vendor part number */
+ uint32_t hw_ver; /**< Hardware version */
+ uint64_t max_mr_size; /**< Max memory region size */
+ uint64_t page_size_cap; /**< Page size capabilities */
+ uint32_t max_qp; /**< Max number of QPs */
+ uint32_t max_qp_wr; /**< Max work requests per QP */
+ uint64_t device_cap_flag; /**< Device capability flags */
+ uint32_t max_send_sge; /**< Max SGEs in send WR */
+ uint32_t max_recv_sge; /**< Max SGEs in recv WR */
+ uint32_t max_sge_rd; /**< Max SGEs for RD operations */
+ uint32_t max_cq; /**< Max completion queues */
+ uint32_t max_cqe; /**< Max entries per CQ */
+ uint32_t max_mr; /**< Max memory regions */
+ uint32_t max_pd; /**< Max protection domains */
+ uint32_t max_qp_rd_atom; /**< Max RDMA read-atoms per QP */
+ uint32_t max_res_rd_atom; /**< Max responder resources */
+ uint32_t max_qp_init_rd_atom; /**< Max initiator RD atoms */
+ uint32_t atomic_cap; /**< Atomic operation support */
+ uint32_t max_mw; /**< Max memory windows */
+ uint32_t max_mcast_grp; /**< Max multicast groups */
+ uint32_t max_mcast_qp_attach; /**< Max QPs per multicast group */
+ uint32_t max_total_mcast_qp_attach;/**< Total multicast attachments */
+ uint32_t max_ah; /**< Max address handles */
+ uint32_t max_fast_reg_page_list_len; /**< Fast registration page list len */
+ uint32_t max_pi_fast_reg_page_list_len; /**< PI fast reg list len */
+ uint16_t max_pkeys; /**< Max partition keys */
+ uint8_t local_ca_ack_delay; /**< Local CA ACK delay */
+ uint8_t reserved[5]; /* Pad to 8-byte alignment before variable area */
+ uint8_t reserved1[64]; /**< Reserved for future use */
+};
+
+/**
+ * @brief Device attributes (host-native format, not exposed directly)
+ */
+struct vhost_rdma_dev_attr {
+ uint64_t max_mr_size;
+ uint64_t page_size_cap;
+ uint32_t hw_ver;
+ uint32_t max_qp_wr;
+ uint64_t device_cap_flags;
+ uint32_t max_qps;
+ uint32_t max_cqs;
+ uint32_t max_send_sge;
+ uint32_t max_recv_sge;
+ uint32_t max_sge_rd;
+ uint32_t max_cqe;
+ uint32_t max_mr;
+ uint32_t max_mw;
+ uint32_t max_pd;
+ uint32_t max_qp_rd_atom;
+ uint32_t max_qp_init_rd_atom;
+ uint32_t max_ah;
+ uint32_t max_fast_reg_page_list_len;
+ uint8_t local_ca_ack_delay;
+};
+
+/**
+ * @brief Port-level attributes
+ */
+struct vhost_rdma_port_attr {
+ uint32_t bad_pkey_cntr; /**< Bad PKey counter */
+ uint32_t qkey_viol_cntr; /**< QKey violation counter */
+};
+
+/**
+ * @brief GID entry with type indicator
+ */
+struct vhost_rdma_gid {
+#define VHOST_RDMA_GID_TYPE_INVALID ((uint32_t)(-1))
+ uint32_t type; /**< GID type: RoCEv1, RoCEv2, etc. */
+ uint8_t gid[16]; /**< 128-bit GID value */
+};
+
+/**
+ * @brief Generic object pool for managing RDMA resources
+ */
+struct vhost_rdma_pool {
+ void *objs; /**< Array of allocated objects */
+ uint32_t num; /**< Number of objects in pool */
+ uint32_t size; /**< Size of each object */
+
+ struct rte_bitmap *bitmap; /**< Bitmap tracking free slots */
+ void *bitmap_mem; /**< Memory backing the bitmap */
+
+ void (*cleanup)(void *arg); /**< Optional cleanup function */
+};
+
+/**
+ * @brief Main RDMA vhost device structure
+ */
+struct vhost_rdma_device {
+ int vid; /**< Vhost-Rdma device ID */
+ int started; /**< Device start state */
+ volatile bool stopped; /**< Stop flag for threads */
+ volatile int inuse; /**< Reference count */
+
+ /* Memory and resource management */
+ struct rte_vhost_memory *mem; /**< Guest physical memory map */
+ struct rte_mempool *mbuf_pool; /**< mbuf pool for packet I/O */
+ struct rte_ring *tx_ring; /**< TX ring for outbound packets */
+ struct rte_ring *rx_ring; /**< RX ring for inbound packets */
+
+ /* Queues */
+ struct vhost_user_queue vqs[NUM_VHOST_QUEUES]; /**< All vhost queues */
+ struct vhost_user_queue *rdma_vqs; /**< Shortcut to RDMA queues */
+ struct vhost_user_queue *cq_vqs; /**< Shortcut to CQ notification queues */
+ struct vhost_user_queue *qp_vqs; /**< Shortcut to QP data queues */
+ struct rte_ring *task_ring; /**< Task scheduling ring */
+
+ /* Interrupt handling for control plane */
+ struct rte_intr_handle ctrl_intr_handle; /**< Control interrupt handle */
+ int ctrl_intr_registered; /**< Whether interrupt is registered */
+
+ /* Virtio-net configuration (exposed to guest) */
+ struct virtio_net_config config; /**< Generic virtio-net config */
+ struct vhost_rdma_config rdma_config; /**< RDMA-specific config */
+ uint32_t max_inline_data; /**< Max inline data size */
+
+ /* Device attributes (cached from config) */
+ struct vhost_rdma_dev_attr attr; /**< Cached device attributes */
+
+ /* Single port support */
+ struct vhost_rdma_port_attr port_attr; /**< Port-level counters */
+ rte_spinlock_t port_lock; /**< Lock for port access */
+ unsigned int mtu_cap; /**< MTU capability */
+ struct vhost_rdma_gid gid_tbl[VHOST_MAX_GID_TBL_LEN]; /**< GID table */
+	struct vhost_rdma_qp *qp_gsi; /**< GSI (General Services Interface) QP */
+
+ /* Resource pools */
+ struct vhost_rdma_pool pd_pool; /**< Protection domain pool */
+ struct vhost_rdma_pool mr_pool; /**< Memory region pool */
+ struct vhost_rdma_pool cq_pool; /**< Completion queue pool */
+ struct vhost_rdma_pool qp_pool; /**< Queue pair pool */
+ struct vhost_rdma_pool ah_pool; /**< Address handle pool */
+
+ /* Statistics counters */
+ rte_atomic64_t stats_counters[VHOST_RDMA_NUM_OF_COUNTERS];
+};
+
+#define vhost_rdma_drop_ref(obj, dev, type) \
+	do { \
+		if (rte_atomic32_dec_and_test(&(obj)->refcnt)) { \
+			/* vhost_rdma_pool_free() runs the pool cleanup hook */ \
+			vhost_rdma_pool_free(&(dev)->type##_pool, (obj)->type##n); \
+		} \
+	} while (0)
+
+#define vhost_rdma_add_ref(obj) rte_atomic32_inc(&(obj)->refcnt)
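+
+/*
+ * Example (illustrative): vhost_rdma_drop_ref(mr, dev, mr) decrements
+ * mr->refcnt and, once it reaches zero, returns the object to
+ * dev->mr_pool using its index field mr->mrn.
+ */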
+
+/**
+ * @brief Check if there is a new available descriptor in the virtqueue.
+ *
+ * This function compares the current avail->idx from the guest with the last
+ * processed index. If they differ, at least one new descriptor is ready.
+ *
+ * @param vq Pointer to the virtual queue.
+ * @return true if a new descriptor is available, false otherwise.
+ */
+static __rte_always_inline bool
+vhost_rdma_vq_is_avail(struct vhost_user_queue *vq)
+{
+ return vq->vring.avail->idx != vq->last_avail_idx;
+}
+
+/**
+ * @brief Get pointer to element at given index in a generic data ring.
+ *
+ * Used for accessing pre-allocated memory pools where each element has fixed size.
+ *
+ * @param queue Pointer to the queue containing data buffer.
+ * @param idx Index of the desired element.
+ * @return Pointer to the data at position idx.
+ */
+static __rte_always_inline void *
+vhost_rdma_queue_get_data(struct vhost_rdma_queue *queue, size_t idx)
+{
+ return queue->data + queue->elem_size * idx;
+}
+
+/**
+ * @brief Retrieve the next available descriptor index from the avail ring.
+ *
+ * Reads the descriptor index at the current position in the avail ring,
+ * increments last_avail_idx, and returns the descriptor index.
+ *
+ * @param vq Pointer to the virtual queue.
+ * @return Index of the first descriptor in the incoming request chain.
+ */
+static __rte_always_inline uint16_t
+vhost_rdma_vq_get_desc_idx(struct vhost_user_queue *vq)
+{
+ uint16_t desc_idx;
+ uint16_t last_avail_idx;
+
+	/* Mask with the ring size (a power of two) to handle wraparound */
+ last_avail_idx = vq->last_avail_idx & (vq->vring.size - 1);
+ desc_idx = vq->vring.avail->ring[last_avail_idx];
+
+ /* Advance the local index tracker */
+ vq->last_avail_idx++;
+
+ return desc_idx;
+}
+
+/**
+ * @brief Get the next descriptor in the chain, if any.
+ *
+ * Checks the VRING_DESC_F_NEXT flag. If set, returns pointer to the next
+ * descriptor using the 'next' field as an index into the descriptor table.
+ *
+ * @param table Base address of the descriptor table.
+ * @param desc Current descriptor.
+ * @return Pointer to next descriptor, or NULL if end of chain.
+ */
+static __rte_always_inline struct vring_desc *
+vhost_rdma_vring_get_next_desc(struct vring_desc *table, struct vring_desc *desc)
+{
+ if (desc->flags & VRING_DESC_F_NEXT)
+ return &table[desc->next];
+
+ return NULL;
+}
+
+/**
+ * @brief Add a used descriptor entry to the used ring.
+ *
+ * Records that a buffer has been consumed by the host/device, including its
+ * original descriptor index and the number of bytes written.
+ *
+ * Uses memory barriers to ensure ordering before updating used->idx.
+ *
+ * @param vq Virtual queue.
+ * @param idx Descriptor index being returned.
+ * @param len Number of bytes written (for writeable descriptors).
+ */
+static __rte_always_inline void
+vhost_rdma_queue_push(struct vhost_user_queue *vq, uint16_t idx, uint32_t len)
+{
+ struct vring_used *used = vq->vring.used;
+ uint16_t slot = used->idx & (vq->vring.size - 1);
+
+ used->ring[slot].id = idx;
+ used->ring[slot].len = len;
+
+ /* Full memory barrier before incrementing idx to ensure visibility */
+ rte_smp_mb();
+ used->idx++;
+ rte_smp_mb();
+}
+
+/**
+ * @brief Notify the frontend (guest) about used descriptor updates.
+ *
+ * Calls into the DPDK vhost library to signal the guest via eventfd or doorbell.
+ *
+ * @param vid Virtual host device ID.
+ * @param vq Pointer to the virtual queue that needs notification.
+ */
+static __rte_always_inline void
+vhost_rdma_queue_notify(int vid, struct vhost_user_queue *vq)
+{
+ rte_vhost_vring_call(vid, vq->id);
+}
+
+/**
+ * @brief Translate Guest Physical Address (GPA) to Virtual VA in host.
+ *
+ * Wrapper around DPDK's rte_vhost_va_from_guest_pa(). This function performs
+ * address translation using the guest memory map provided through vhost-user.
+ *
+ * @param mem Pointer to vhost memory region mapping.
+ * @param gpa Guest physical address to translate.
+ * @param len [in/out] On input: requested length; on output: actual mapped length.
+ * @return Host virtual address corresponding to GPA, or 0 on failure.
+ */
+static __rte_always_inline uint64_t
+gpa_to_vva(struct rte_vhost_memory *mem, uint64_t gpa, uint64_t *len)
+{
+ assert(mem != NULL);
+ return rte_vhost_va_from_guest_pa(mem, gpa, len);
+}
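+
+/*
+ * Putting the helpers together -- a minimal, illustrative consumer loop
+ * (not the actual datapath; process() and bytes_written are placeholders):
+ *
+ *	while (vhost_rdma_vq_is_avail(vq)) {
+ *		uint16_t head = vhost_rdma_vq_get_desc_idx(vq);
+ *		struct vring_desc *desc = &vq->vring.desc[head];
+ *
+ *		do {
+ *			uint64_t len = desc->len;
+ *			void *va = (void *)(uintptr_t)gpa_to_vva(mem, desc->addr, &len);
+ *			process(va, len);
+ *		} while ((desc = vhost_rdma_vring_get_next_desc(vq->vring.desc, desc)));
+ *
+ *		vhost_rdma_queue_push(vq, head, bytes_written);
+ *		vhost_rdma_queue_notify(vid, vq);
+ *	}
+ */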
+
+int vhost_rdma_construct(struct vhost_rdma_device *dev, const char *path, int idx);
+void vhost_rdma_net_construct(struct vhost_user_queue *queues, int idx);
+void vs_vhost_rdma_net_setup(int vid);
+
+
+void vhost_rdma_destroy(const char* path);
+int vhost_rdma_pool_init(struct vhost_rdma_pool* pool,
+ const char* name,
+ uint32_t num,
+ uint32_t size,
+ bool start_zero,
+ void (*cleanup)(void*));
+void* vhost_rdma_pool_get(struct vhost_rdma_pool* pool, uint32_t idx);
+void vhost_rdma_pool_free(struct vhost_rdma_pool* pool, uint32_t idx);
+void* vhost_rdma_pool_alloc(struct vhost_rdma_pool* pool, uint32_t *idx);
+void vhost_rdma_pool_destroy(struct vhost_rdma_pool* pool);
+
+extern struct vhost_rdma_device g_vhost_rdma_dev[MAX_VHOST_RDMA_DEV_NUM];
+extern struct vhost_rdma_net_dev g_vhost_rdma_net_dev[MAX_VHOST_RDMA_DEV_NUM];
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* VHOST_RDMA_H_ */
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.c b/examples/vhost_user_rdma/vhost_rdma_ib.c
new file mode 100644
index 0000000000..5535a8696b
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.c
@@ -0,0 +1,647 @@
+/*
+ * Vhost-user RDMA device : init and packets forwarding
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <unistd.h>
+#include <sys/uio.h>
+#include <stdlib.h>
+
+#include <rte_ethdev.h>
+#include <rte_spinlock.h>
+#include <rte_malloc.h>
+
+#include "vhost_rdma.h"
+#include "vhost_rdma_ib.h"
+#include "vhost_rdma_log.h"
+#include "vhost_rdma_pkt.h"
+
+#define CHK_IOVEC(tp, iov) \
+	do { \
+		if ((iov)->iov_len < sizeof(*(tp))) { \
+			RDMA_LOG_ERR("%s: " #iov " iovec is too small: %zu < %zu", \
+				     __func__, (iov)->iov_len, sizeof(*(tp))); \
+			return -1; \
+		} \
+		tp = (iov)->iov_base; \
+	} while (0)
+
+#define DEFINE_VIRTIO_RDMA_CMD(cmd, handler) [cmd] = {handler, #cmd}
+
+#define CTRL_NO_CMD __rte_unused struct iovec *__in
+#define CTRL_NO_RSP __rte_unused struct iovec *__out
+
+/**
+ * @brief Free resources held by a response entry in the RDMA responder path.
+ *
+ * Cleans up the mbuf (for ATOMIC) or the MR reference (for RDMA READ),
+ * then resets the resource type.
+ *
+ * @param qp  Queue Pair that owns the resource (used when dropping MR refs)
+ * @param res Response resource to free (in/out)
+ */
+void
+free_rd_atomic_resource(struct vhost_rdma_qp *qp,
+			struct vhost_rdma_resp_res *res)
+{
+ if (!res) {
+ RDMA_LOG_ERR("Cannot free NULL response resource");
+ return;
+ }
+
+ switch (res->type) {
+ case VHOST_ATOMIC_MASK: {
+ struct rte_mbuf *mbuf = res->atomic.mbuf;
+ if (mbuf) {
+ RDMA_LOG_DEBUG("Freeing mbuf=%p from ATOMIC response", mbuf);
+ rte_pktmbuf_free(mbuf);
+ res->atomic.mbuf = NULL;
+ }
+ break;
+ }
+
+ case VHOST_READ_MASK: {
+ struct vhost_rdma_mr *mr = res->read.mr;
+ if (mr) {
+ RDMA_LOG_DEBUG("Dropping MR reference %p from RDMA READ response", mr);
+ vhost_rdma_drop_ref(mr, qp->dev, mr);
+ res->read.mr = NULL;
+ }
+ break;
+ }
+
+ case 0:
+ /* Already freed — silent no-op */
+ break;
+
+ default:
+ RDMA_LOG_ERR("Unknown response resource type %u (possible memory corruption)", res->type);
+ break;
+ }
+
+ /* Reset type to mark as free */
+ res->type = 0;
+}
+
+/**
+ * @brief Free all RD/Atomic response resources allocated for a Queue Pair.
+ *
+ * Iterates through the pre-allocated array of response tracking entries
+ * (used for RDMA READ and ATOMIC operations), frees associated mbufs or MRs,
+ * then releases the entire array memory.
+ *
+ * Safe to call multiple times (idempotent).
+ *
+ * @param qp Pointer to the Queue Pair whose response resources should be freed
+ */
+void
+free_rd_atomic_resources(struct vhost_rdma_qp *qp)
+{
+ if (!qp) {
+ RDMA_LOG_ERR("Cannot free response resources: qp is NULL");
+ return;
+ }
+
+ if (!qp->resp.resources) {
+ RDMA_LOG_DEBUG("No response resources to free for QP %u", qp->qpn);
+ return;
+ }
+
+ const uint32_t max_ops = qp->attr.max_dest_rd_atomic;
+
+ RDMA_LOG_DEBUG("Freeing %u RD/Atomic response resources for QP %u",
+ max_ops, qp->qpn);
+
+ for (uint32_t i = 0; i < max_ops; i++) {
+ struct vhost_rdma_resp_res *res = &qp->resp.resources[i];
+
+ /* Frees internal resources (mbuf or mr) and resets type */
+ free_rd_atomic_resource(qp, res);
+ }
+
+ /* Now free the entire array */
+ rte_free(qp->resp.resources);
+ qp->resp.resources = NULL;
+
+ RDMA_LOG_DEBUG("Successfully freed response resource array for QP %u", qp->qpn);
+}
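+
+/*
+ * For reference, a minimal allocation counterpart (hedged sketch; the real
+ * allocator arrives with a later patch in this series). It sizes the array
+ * from max_dest_rd_atomic, matching what free_rd_atomic_resources() frees:
+ *
+ *	static int alloc_rd_atomic_resources(struct vhost_rdma_qp *qp, uint32_t n)
+ *	{
+ *		qp->attr.max_dest_rd_atomic = n;
+ *		qp->resp.resources = rte_zmalloc(NULL,
+ *				sizeof(struct vhost_rdma_resp_res) * n, 0);
+ *		return qp->resp.resources ? 0 : -ENOMEM;
+ *	}
+ */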
+
+/**
+ * @brief Clean up a vhost RDMA queue.
+ */
+void
+vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queue)
+{
+ if (!queue)
+ return;
+
+ if (queue->cb && qp)
+ rte_intr_callback_unregister(&queue->intr_handle, queue->cb, qp);
+
+ rte_free(queue->data);
+ queue->data = NULL;
+}
+
+/**
+ * @brief Cleanup callback for MR: reset type.
+ */
+void
+vhost_rdma_mr_cleanup(void *arg)
+{
+ struct vhost_rdma_mr *mr = arg;
+
+ if (mr)
+ mr->type = VHOST_MR_TYPE_NONE;
+}
+
+/**
+ * @brief Cleanup callback for QP: drop references and free resources.
+ */
+void
+vhost_rdma_qp_cleanup(void *arg)
+{
+ struct vhost_rdma_qp *qp = arg;
+
+ if (!qp)
+ return;
+
+ if (qp->scq) {
+ vhost_rdma_drop_ref(qp->scq, qp->dev, cq);
+ qp->scq = NULL;
+ }
+
+ if (qp->rcq) {
+ vhost_rdma_drop_ref(qp->rcq, qp->dev, cq);
+ qp->rcq = NULL;
+ }
+
+ if (qp->pd) {
+ vhost_rdma_drop_ref(qp->pd, qp->dev, pd);
+ qp->pd = NULL;
+ }
+
+ if (qp->resp.mr) {
+ vhost_rdma_drop_ref(qp->resp.mr, qp->dev, mr);
+ qp->resp.mr = NULL;
+ }
+
+ free_rd_atomic_resources(qp);
+}
+
+void
+vhost_rdma_init_ib(struct vhost_rdma_device *dev)
+{
+ uint32_t qpn;
+
+ if (!dev) {
+ return;
+ }
+
+ /* Initialize device attributes (virtio-rdma IB capability) */
+ dev->attr.max_qps = 64;
+ dev->attr.max_cqs = 64;
+ dev->attr.max_mr_size = UINT64_MAX;
+ dev->attr.page_size_cap = 0xFFFFF000U;
+ dev->attr.max_qp_wr = 1024;
+ dev->attr.device_cap_flags = VIRTIO_IB_DEVICE_RC_RNR_NAK_GEN;
+ dev->attr.max_send_sge = 32;
+ dev->attr.max_recv_sge = 32;
+ dev->attr.max_sge_rd = 32;
+ dev->attr.max_cqe = 1024;
+ dev->attr.max_mr = 0x00001000;
+ dev->attr.max_mw = 0;
+ dev->attr.max_pd = 0x7FFC;
+ dev->attr.max_qp_rd_atom = 128;
+ dev->attr.max_qp_init_rd_atom = 128;
+ dev->attr.max_ah = 100;
+ dev->attr.max_fast_reg_page_list_len = 512;
+ dev->attr.local_ca_ack_delay = 15;
+
+ /* Point to the RDMA configuration structure for cleaner assignment */
+ struct vhost_rdma_config *cfg = &dev->rdma_config;
+
+ /* Copy basic limits from device attributes */
+ cfg->max_qp = dev->attr.max_qps;
+ cfg->max_cq = dev->attr.max_cqs;
+ cfg->max_mr = dev->attr.max_mr;
+ cfg->max_pd = dev->attr.max_pd;
+ cfg->max_ah = dev->attr.max_ah;
+ cfg->max_cqe = dev->attr.max_cqe;
+ cfg->max_qp_wr = dev->attr.max_qp_wr;
+ cfg->max_send_sge = dev->attr.max_send_sge;
+ cfg->max_recv_sge = dev->attr.max_recv_sge;
+ cfg->max_sge_rd = dev->attr.max_sge_rd;
+ cfg->max_qp_rd_atom = dev->attr.max_qp_rd_atom;
+ cfg->max_qp_init_rd_atom = dev->attr.max_qp_init_rd_atom;
+ cfg->max_mr_size = dev->attr.max_mr_size;
+ cfg->max_mw = dev->attr.max_mw;
+ cfg->max_fast_reg_page_list_len = dev->attr.max_fast_reg_page_list_len;
+ cfg->page_size_cap = dev->attr.page_size_cap;
+ cfg->device_cap_flag = dev->attr.device_cap_flags;
+ cfg->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
+ cfg->phys_port_cnt = 1;
+ cfg->max_pkeys = 1;
+ cfg->vendor_id = 0x1AF4;
+ cfg->vendor_part_id = 0x0042;
+ cfg->sys_image_guid = 1;
+
+ /* Derived capabilities */
+ cfg->max_res_rd_atom = cfg->max_qp_rd_atom * cfg->max_qp;
+ cfg->max_total_mcast_qp_attach = 8192UL * 56UL;
+ cfg->max_pi_fast_reg_page_list_len = cfg->max_fast_reg_page_list_len / 2;
+
+ /* Inline data and MTU settings */
+ dev->max_inline_data = dev->attr.max_send_sge * sizeof(struct vhost_user_rdma_sge);
+ dev->mtu_cap = ib_mtu_enum_to_int(DEFAULT_IB_MTU);
+
+ /* Reset port counters */
+ dev->port_attr.bad_pkey_cntr = 0;
+ dev->port_attr.qkey_viol_cntr = 0;
+
+ /* Initialize GID table (illegal by default) */
+ for (int i = 0; i < VHOST_MAX_GID_TBL_LEN; i++) {
+ dev->gid_tbl[i].type = VHOST_RDMA_GID_TYPE_ILLIGAL;
+ }
+
+ /* Set up virtual queue mappings:
+ * rdma_vqs[0] is the control queue,
+ * cq_vqs starts at index 1,
+ * qp_vqs follows after all CQ queues.
+ */
+ dev->cq_vqs = &dev->rdma_vqs[1];
+ dev->qp_vqs = &dev->rdma_vqs[1 + dev->attr.max_cqs];
+
+ /* Initialize resource pools */
+ vhost_rdma_pool_init(&dev->pd_pool, "pd_pool", dev->attr.max_pd,
+ sizeof(struct vhost_rdma_pd), false, NULL);
+
+ vhost_rdma_pool_init(&dev->mr_pool, "mr_pool", dev->attr.max_mr,
+ sizeof(struct vhost_rdma_mr), false, vhost_rdma_mr_cleanup);
+
+ vhost_rdma_pool_init(&dev->cq_pool, "cq_pool", dev->attr.max_cqs,
+ sizeof(struct vhost_rdma_cq), true, NULL);
+
+ vhost_rdma_pool_init(&dev->qp_pool, "qp_pool", dev->attr.max_qps,
+ sizeof(struct vhost_rdma_qp), false, vhost_rdma_qp_cleanup);
+
+ vhost_rdma_pool_init(&dev->ah_pool, "ah_pool", dev->attr.max_ah,
+ sizeof(struct vhost_rdma_av), false, NULL);
+
+ /* Allocate the special GSI QP (QPN 1), used for general management services */
+ dev->qp_gsi = vhost_rdma_pool_alloc(&dev->qp_pool, &qpn);
+ if (!dev->qp_gsi) {
+ RDMA_LOG_ERR("failed to allocate GSI QP");
+ return;
+ }
+ vhost_rdma_add_ref(dev->qp_gsi); /* Hold a reference */
+ assert(qpn == 1); /* GSI must be assigned QPN 1 */
+}
+
+/**
+ * @brief Destroy and clean up all RDMA resources associated with the device.
+ *
+ * This function safely releases all allocated QPs, CQs, MRs, PDs, and AVs,
+ * then destroys their respective memory pools.
+ *
+ * Note: It assumes no external references exist to these objects.
+ */
+void
+vhost_rdma_destroy_ib(struct vhost_rdma_device *dev)
+{
+ struct vhost_rdma_mr *mr;
+ struct vhost_rdma_pd *pd;
+ struct vhost_rdma_cq *cq;
+ struct vhost_rdma_qp *qp;
+ struct vhost_rdma_av *av;
+ uint32_t i;
+
+ if (!dev) {
+ return;
+ }
+
+ /* Clean up Memory Regions (MR): cleanup callback may have already reset state */
+ for (i = 0; i < dev->attr.max_mr; i++) {
+ mr = vhost_rdma_pool_get(&dev->mr_pool, i);
+ if (mr) {
+ vhost_rdma_pool_free(&dev->mr_pool, i); /* Triggers cleanup if registered */
+ }
+ }
+
+ /* Clean up Protection Domains (PD) */
+ for (i = 0; i < dev->attr.max_pd; i++) {
+ pd = vhost_rdma_pool_get(&dev->pd_pool, i);
+ if (pd) {
+ vhost_rdma_pool_free(&dev->pd_pool, i);
+ }
+ }
+
+ /* Clean up Completion Queues (CQ) */
+ for (i = 0; i < dev->attr.max_cqs; i++) {
+ cq = vhost_rdma_pool_get(&dev->cq_pool, i);
+ if (cq) {
+ vhost_rdma_pool_free(&dev->cq_pool, i);
+ }
+ }
+
+ /* Clean up Queue Pairs (QP): must drain SQ/RQ before freeing */
+ for (i = 0; i < dev->attr.max_qps; i++) {
+ qp = vhost_rdma_pool_get(&dev->qp_pool, i);
+ if (qp) {
+ /* Cleanup send and receive queues (e.g., unregister intr handlers, free ring buffers) */
+ vhost_rdma_queue_cleanup(qp, &qp->sq.queue);
+ vhost_rdma_queue_cleanup(qp, &qp->rq.queue);
+
+ /* Now free the QP from the pool (triggers vhost_rdma_qp_cleanup if set) */
+ vhost_rdma_pool_free(&dev->qp_pool, i);
+ }
+ }
+
+ /* Clean up Address Handles (AH / AV) */
+ for (i = 0; i < dev->attr.max_ah; i++) {
+ av = vhost_rdma_pool_get(&dev->ah_pool, i);
+ if (av) {
+ vhost_rdma_pool_free(&dev->ah_pool, i);
+ }
+ }
+
+ /*
+ * Destroy resource pools.
+ * This frees internal pool metadata and backing arrays.
+ * Pools should be empty at this point.
+ */
+ vhost_rdma_pool_destroy(&dev->mr_pool);
+ vhost_rdma_pool_destroy(&dev->pd_pool);
+ vhost_rdma_pool_destroy(&dev->cq_pool);
+ vhost_rdma_pool_destroy(&dev->qp_pool);
+ vhost_rdma_pool_destroy(&dev->ah_pool);
+}
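+
+/*
+ * Typical lifecycle (sketch): the device construction path calls
+ * vhost_rdma_init_ib() once per device after its virtqueues are set up, and
+ * vhost_rdma_destroy_ib() on teardown:
+ *
+ *	vhost_rdma_init_ib(dev);
+ *	... serve control and data virtqueues ...
+ *	vhost_rdma_destroy_ib(dev);
+ */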
+
+/**
+ * @brief Convert a guest physical address payload into iovec entries.
+ *
+ * This function translates a contiguous memory region (starting at 'payload'
+ * with length 'remaining') into one or more iovecs by looking up the virtual
+ * address via gpa_to_vva(). The resulting iovecs are stored in 'iovs', and
+ * 'iov_index' is updated accordingly.
+ *
+ * @param mem Pointer to vhost memory structure for GPA->VVA translation.
+ * @param iovs Array of iovec structures to fill.
+ * @param iov_index Current index in the iovs array (updated on success).
+ * @param payload Guest physical address (GPA) of the data.
+ * @param remaining Total number of bytes left to translate.
+ * @param num_iovs Maximum number of iovecs allowed.
+ * @return 0 on success, -1 on error (e.g., translation failure or overflow).
+ */
+static int
+desc_payload_to_iovs(struct rte_vhost_memory *mem,
+ struct iovec *iovs,
+ uint32_t *iov_index,
+ uintptr_t payload,
+ uint64_t remaining,
+ uint16_t num_iovs)
+{
+ void *vva;
+ uint64_t len;
+
+ do {
+ if (*iov_index >= num_iovs) {
+ RDMA_LOG_ERR("MAX_IOVS reached");
+ return -1;
+ }
+
+ len = remaining;
+ vva = (void *)(uintptr_t)gpa_to_vva(mem, payload, &len);
+ if (!vva || !len) {
+ RDMA_LOG_ERR("failed to translate desc address.");
+ return -1;
+ }
+
+ iovs[*iov_index].iov_base = vva;
+ iovs[*iov_index].iov_len = len;
+
+ payload += len;
+ remaining -= len;
+ (*iov_index)++;
+ } while (remaining);
+
+ return 0;
+}
+
+/**
+ * @brief Set up iovecs from vring descriptors for a given request.
+ *
+ * Parses the descriptor chain starting at 'req_idx'. Handles both direct and
+ * indirect descriptors. Fills the provided 'iovs' array with valid memory
+ * regions derived from GPA-to-VVA translation. Also counts input/output descriptors.
+ *
+ * @param mem Vhost memory configuration for address translation.
+ * @param vq Virtual queue containing the descriptor ring.
+ * @param req_idx Index of the first descriptor in the chain.
+ * @param iovs Pre-allocated iovec array to populate.
+ * @param num_iovs Size of the iovs array (maximum entries).
+ * @param num_in Output: number of writable (input) descriptors.
+ * @param num_out Output: number of readable (output) descriptors.
+ * @return Number of filled iovecs on success, -1 on error.
+ */
+int
+setup_iovs_from_descs(struct rte_vhost_memory *mem,
+ struct vhost_user_queue *vq,
+ uint16_t req_idx,
+ struct iovec *iovs,
+ uint16_t num_iovs,
+ uint16_t *num_in,
+ uint16_t *num_out)
+{
+ struct vring_desc *desc = &vq->vring.desc[req_idx];
+ struct vring_desc *desc_table;
+ uint32_t iovs_idx = 0;
+ uint64_t len;
+ uint16_t in = 0, out = 0;
+
+ /* Handle indirect descriptors */
+ if (desc->flags & VRING_DESC_F_INDIRECT) {
+ len = desc->len;
+ desc_table = (struct vring_desc *)(uintptr_t)gpa_to_vva(mem, desc->addr, &len);
+ if (!desc_table || !len) {
+ RDMA_LOG_ERR("failed to translate desc address.");
+ return -1;
+ }
+ assert(len == desc->len);
+ desc = desc_table;
+ } else {
+ desc_table = vq->vring.desc;
+ }
+
+ /* Walk through descriptor chain */
+ do {
+ if (iovs_idx >= num_iovs) {
+ RDMA_LOG_ERR("MAX_IOVS reached\n");
+ return -1;
+ }
+
+ if (desc->flags & VRING_DESC_F_WRITE) {
+ in++; /* Descriptor allows write from device perspective (input) */
+ } else {
+ out++; /* Descriptor allows read (output) */
+ }
+
+ /* Translate payload (address + length) into iovec(s) */
+ if (desc_payload_to_iovs(mem, iovs,
+ &iovs_idx,
+ desc->addr,
+ desc->len,
+ num_iovs) != 0) {
+ RDMA_LOG_ERR("Failed to convert desc payload to iovs");
+ return -1;
+ }
+
+ /* Move to next descriptor in chain */
+ desc = vhost_rdma_vring_get_next_desc(desc_table, desc);
+ } while (desc != NULL);
+
+ *num_in = in;
+ *num_out = out;
+ return iovs_idx;
+}
+
+static int
+vhost_rdma_query_device(struct vhost_rdma_device *dev, CTRL_NO_CMD,
+ struct iovec *out)
+{
+ struct vhost_rdma_ack_query_device *rsp;
+
+ CHK_IOVEC(rsp, out);
+
+ rsp->max_mr_size = dev->attr.max_mr_size;
+ rsp->page_size_cap = dev->attr.page_size_cap;
+ rsp->max_qp_wr = dev->attr.max_qp_wr;
+ rsp->device_cap_flags = dev->attr.device_cap_flags;
+ rsp->max_send_sge = dev->attr.max_send_sge;
+ rsp->max_recv_sge = dev->attr.max_recv_sge;
+ rsp->max_sge_rd = dev->attr.max_sge_rd;
+ rsp->max_cqe = dev->attr.max_cqe;
+ rsp->max_mr = dev->attr.max_mr;
+ rsp->max_pd = dev->attr.max_pd;
+ rsp->max_qp_rd_atom = dev->attr.max_qp_rd_atom;
+ rsp->max_qp_init_rd_atom = dev->attr.max_qp_init_rd_atom;
+ rsp->max_ah = dev->attr.max_ah;
+ rsp->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
+
+ return 0;
+}
+
+/* Command handler dispatch table, indexed by control command ID */
+static struct {
+ int (*handler)(struct vhost_rdma_device *dev, struct iovec *in, struct iovec *out);
+ const char *name; /* Name of the command (for logging) */
+} cmd_tbl[] = {
+ DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE, vhost_rdma_query_device),
+};
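+
+/*
+ * Adding a command is a one-line change (sketch; the handler name here is
+ * hypothetical and would arrive with a later patch):
+ *
+ *	DEFINE_VIRTIO_RDMA_CMD(VHOST_RDMA_CTRL_ROCE_QUERY_PORT,
+ *			       vhost_rdma_query_port),
+ *
+ * The macro stores both the function pointer and the stringified command
+ * name, so dispatch and logging stay in sync.
+ */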
+
+/**
+ * @brief Main handler for control virtqueue events.
+ *
+ * Processes incoming requests from the control virtual queue. Waits for kick
+ * notification via eventfd, then processes available descriptor chains.
+ * Each chain contains a header followed by optional input/output data.
+ * Executes corresponding handler based on command ID.
+ *
+ * @param arg Pointer to vhost_rdma_device instance.
+ */
+void
+vhost_rdma_handle_ctrl_vq(void *arg)
+{
+ struct vhost_rdma_device *dev = arg;
+ struct vhost_rdma_ctrl_hdr *hdr;
+ struct vhost_user_queue *ctrl_vq = &dev->rdma_vqs[0];
+ struct iovec data_iovs[4]; /* Fixed-size iovec buffer */
+ struct iovec *in_iovs, *out_iovs;
+ uint16_t desc_idx, num_in, num_out;
+ uint8_t *status;
+ int kick_fd, nbytes, i, in_len;
+
+ kick_fd = ctrl_vq->vring.kickfd;
+
+ /* Wait until we get a valid kick (notification) */
+ do {
+ uint64_t kick_data;
+ nbytes = eventfd_read(kick_fd, &kick_data);
+ if (nbytes < 0) {
+ if (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN) {
+ continue; /* Retry on transient errors */
+ }
+ RDMA_LOG_ERR("Failed to read kickfd of ctrl virtq: %s", strerror(errno));
+ return; /* Hard read error: do not process the queue */
+ }
+ break;
+ } while (1);
+
+ /* Process all available requests in the control queue */
+ while (vhost_rdma_vq_is_avail(ctrl_vq)) {
+ desc_idx = vhost_rdma_vq_get_desc_idx(ctrl_vq);
+ /* Build iovecs from descriptor chain */
+ if (setup_iovs_from_descs(dev->mem, ctrl_vq,
+ desc_idx, data_iovs, 4,
+ &num_in, &num_out) < 0) {
+ RDMA_LOG_ERR("read from desc failed");
+ break;
+ }
+ /* Split iovecs into output (device reads) and input (device writes) */
+ out_iovs = data_iovs;
+ in_iovs = &data_iovs[num_out];
+ in_len = 0;
+
+ /* Calculate total input data length */
+ for (i = 0; i < num_in; i++) {
+ in_len += in_iovs[i].iov_len;
+ }
+
+ /* Need at least one readable (header) and one writable (status) iovec */
+ if (num_out < 1 || num_in < 1) {
+ RDMA_LOG_ERR("malformed ctrl request: in=%u out=%u", num_in, num_out);
+ vhost_rdma_queue_push(ctrl_vq, desc_idx, 0);
+ vhost_rdma_queue_notify(dev->vid, ctrl_vq);
+ continue;
+ }
+
+ /* First output iovec carries the control header */
+ hdr = (struct vhost_rdma_ctrl_hdr *)out_iovs[0].iov_base;
+ status = (uint8_t *)in_iovs[0].iov_base;
+
+ /* Validate header size */
+ if (out_iovs[0].iov_len != sizeof(*hdr)) {
+ RDMA_LOG_ERR("invalid header");
+ *status = VIRTIO_NET_ERR;
+ goto pushq;
+ }
+
+ /* Check that the command ID is in range and has a registered handler */
+ if (hdr->cmd >= RTE_DIM(cmd_tbl) || cmd_tbl[hdr->cmd].handler == NULL) {
+ RDMA_LOG_ERR("unknown cmd %d", hdr->cmd);
+ *status = VIRTIO_NET_ERR;
+ goto pushq;
+ }
+
+ /* Dispatch command handler; set status based on result */
+ *status = (cmd_tbl[hdr->cmd].handler(dev,
+ num_out > 1 ? &out_iovs[1] : NULL,
+ num_in > 1 ? &in_iovs[1] : NULL) == 0)
+ ? VIRTIO_NET_OK
+ : VIRTIO_NET_ERR;
+
+pushq:
+ /* Log command execution result; guard the table lookup for bad cmd IDs */
+ RDMA_LOG_INFO("cmd=%d %s status: %d",
+ hdr->cmd,
+ (hdr->cmd < RTE_DIM(cmd_tbl) && cmd_tbl[hdr->cmd].name) ?
+ cmd_tbl[hdr->cmd].name : "unknown",
+ *status);
+
+ /* Return used descriptor to the avail ring and notify frontend */
+ vhost_rdma_queue_push(ctrl_vq, desc_idx, in_len);
+ vhost_rdma_queue_notify(dev->vid, ctrl_vq);
+ }
+}
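+
+/*
+ * Wiring sketch (assumption: the device setup path registers this handler on
+ * the control queue's kickfd so it runs on each guest notification; field
+ * names follow the direct intr_handle access style used elsewhere here):
+ *
+ *	ctrl_vq->intr_handle.fd = ctrl_vq->vring.kickfd;
+ *	ctrl_vq->intr_handle.type = RTE_INTR_HANDLE_EXT;
+ *	rte_intr_callback_register(&ctrl_vq->intr_handle,
+ *				   vhost_rdma_handle_ctrl_vq, dev);
+ */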
+
+int
+vhost_rdma_task_scheduler(__rte_unused void *arg)
+{
+ /* Stub: the scheduler loop is expected to be filled in by a later patch */
+ return 0;
+}
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_ib.h b/examples/vhost_user_rdma/vhost_rdma_ib.h
new file mode 100644
index 0000000000..4ac896d82e
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_ib.h
@@ -0,0 +1,710 @@
+/**
+ * @file vhost_rdma_ib.h
+ * @brief Vhost-user RDMA device: IB emulation layer and control path definitions.
+ *
+ * This header defines the internal data structures, constants, and function interfaces
+ * used by the vhost-user RDMA backend to emulate InfiniBand/RoCE semantics over virtio.
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __VHOST_RDMA_IB_H__
+#define __VHOST_RDMA_IB_H__
+
+#include <netinet/in.h>
+#include <linux/virtio_net.h>
+
+#include <rte_spinlock.h>
+#include <rte_atomic.h>
+#include <rte_timer.h>
+#include <rte_mbuf.h>
+#include <rte_ring.h>
+#include <rte_vhost.h>
+#include <linux/vhost_types.h>
+
+#include "eal_interrupts.h"
+
+/* Forward declarations */
+struct vhost_rdma_device;
+struct vhost_queue;
+
+/**
+ * @defgroup constants Constants & Limits
+ * @{
+ */
+
+/** Invalid opcode marker */
+#define OPCODE_NONE (-1)
+
+/** Device capability flags */
+#define VIRTIO_IB_DEVICE_RC_RNR_NAK_GEN (1 << 0)
+
+/** Maximum number of memory regions in vhost-user memory table */
+#define VHOST_USER_MEMORY_MAX_NREGIONS 8
+
+/** Maximum size for config space read/write operations */
+#define VHOST_USER_MAX_CONFIG_SIZE 256
+
+/** ROCE control command types (virtio-rdma extension) */
+#define VHOST_RDMA_CTRL_ROCE 6
+#define VHOST_RDMA_CTRL_ROCE_QUERY_DEVICE 0
+#define VHOST_RDMA_CTRL_ROCE_QUERY_PORT 1
+#define VHOST_RDMA_CTRL_ROCE_CREATE_CQ 2
+#define VHOST_RDMA_CTRL_ROCE_DESTROY_CQ 3
+#define VHOST_RDMA_CTRL_ROCE_CREATE_PD 4
+#define VHOST_RDMA_CTRL_ROCE_DESTROY_PD 5
+#define VHOST_RDMA_CTRL_ROCE_GET_DMA_MR 6
+#define VHOST_RDMA_CTRL_ROCE_ALLOC_MR 7
+#define VHOST_RDMA_CTRL_ROCE_MAP_MR_SG 8
+#define VHOST_RDMA_CTRL_ROCE_REG_USER_MR 9
+#define VHOST_RDMA_CTRL_ROCE_DEREG_MR 10
+#define VHOST_RDMA_CTRL_ROCE_CREATE_QP 11
+#define VHOST_RDMA_CTRL_ROCE_MODIFY_QP 12
+#define VHOST_RDMA_CTRL_ROCE_QUERY_QP 13
+#define VHOST_RDMA_CTRL_ROCE_DESTROY_QP 14
+#define VHOST_RDMA_CTRL_ROCE_QUERY_PKEY 15
+#define VHOST_RDMA_CTRL_ROCE_ADD_GID 16
+#define VHOST_RDMA_CTRL_ROCE_DEL_GID 17
+#define VHOST_RDMA_CTRL_ROCE_REQ_NOTIFY_CQ 18
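+
+/*
+ * Wire layout of a control request, as consumed by vhost_rdma_handle_ctrl_vq()
+ * in vhost_rdma_ib.c: a descriptor chain carrying
+ *
+ *	out[0]: struct vhost_rdma_ctrl_hdr  (1-byte cmd, device-readable)
+ *	out[1]: optional command payload    (device-readable)
+ *	in[0]:  uint8_t status              (device-writable)
+ *	in[1]:  optional response payload   (device-writable)
+ */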
+
+struct vhost_rdma_ack_query_device {
+ /* Capabilities mask (see the VIRTIO_IB_DEVICE_* flags above) */
+ uint64_t device_cap_flags;
+ /* Largest contiguous block that can be registered */
+ uint64_t max_mr_size;
+ /* Supported memory shift sizes */
+ uint64_t page_size_cap;
+ /* Hardware version */
+ uint32_t hw_ver;
+ /* Maximum number of outstanding Work Requests (WR) on Send Queue (SQ) and Receive Queue (RQ) */
+ uint32_t max_qp_wr;
+ /* Maximum number of scatter/gather (s/g) elements per WR for SQ for non RDMA Read operations */
+ uint32_t max_send_sge;
+ /* Maximum number of s/g elements per WR for RQ for non RDMA Read operations */
+ uint32_t max_recv_sge;
+ /* Maximum number of s/g per WR for RDMA Read operations */
+ uint32_t max_sge_rd;
+ /* Maximum size of Completion Queue (CQ) */
+ uint32_t max_cqe;
+ /* Maximum number of Memory Regions (MR) */
+ uint32_t max_mr;
+ /* Maximum number of Protection Domains (PD) */
+ uint32_t max_pd;
+ /* Maximum number of RDMA Read operations that can be outstanding per Queue Pair (QP) */
+ uint32_t max_qp_rd_atom;
+ /* Maximum depth per QP for initiation of RDMA Read operations */
+ uint32_t max_qp_init_rd_atom;
+ /* Maximum number of Address Handles (AH) */
+ uint32_t max_ah;
+ /* Local CA ack delay */
+ uint8_t local_ca_ack_delay;
+ /* Padding */
+ uint8_t padding[3];
+ /* Reserved for future */
+ uint32_t reserved[14];
+};
+
+
+/**
+ * @defgroup qp_states Queue Pair States
+ * @{
+ */
+enum vhost_rdma_ib_qp_state {
+ VHOST_RDMA_IB_QPS_RESET,
+ VHOST_RDMA_IB_QPS_INIT,
+ VHOST_RDMA_IB_QPS_RTR,
+ VHOST_RDMA_IB_QPS_RTS,
+ VHOST_RDMA_IB_QPS_SQD,
+ VHOST_RDMA_IB_QPS_SQE,
+ VHOST_RDMA_IB_QPS_ERR
+};
+/** @} */
+
+/**
+ * @defgroup mtu_sizes IB MTU Sizes
+ * @{
+ */
+enum vhost_rdma_ib_mtu {
+ VHOST_RDMA_IB_MTU_256 = 1,
+ VHOST_RDMA_IB_MTU_512 = 2,
+ VHOST_RDMA_IB_MTU_1024 = 3,
+ VHOST_RDMA_IB_MTU_2048 = 4,
+ VHOST_RDMA_IB_MTU_4096 = 5
+};
+/** @} */
+
+/**
+ * @defgroup wc_status Work Completion Status Codes
+ * @{
+ */
+enum vhost_rdma_ib_wc_status {
+ VHOST_RDMA_IB_WC_SUCCESS,
+ VHOST_RDMA_IB_WC_LOC_LEN_ERR,
+ VHOST_RDMA_IB_WC_LOC_QP_OP_ERR,
+ VHOST_RDMA_IB_WC_LOC_PROT_ERR,
+ VHOST_RDMA_IB_WC_WR_FLUSH_ERR,
+ VHOST_RDMA_IB_WC_BAD_RESP_ERR,
+ VHOST_RDMA_IB_WC_LOC_ACCESS_ERR,
+ VHOST_RDMA_IB_WC_REM_INV_REQ_ERR,
+ VHOST_RDMA_IB_WC_REM_ACCESS_ERR,
+ VHOST_RDMA_IB_WC_REM_OP_ERR,
+ VHOST_RDMA_IB_WC_RETRY_EXC_ERR,
+ VHOST_RDMA_IB_WC_RNR_RETRY_EXC_ERR,
+ VHOST_RDMA_IB_WC_REM_ABORT_ERR,
+ VHOST_RDMA_IB_WC_FATAL_ERR,
+ VHOST_RDMA_IB_WC_RESP_TIMEOUT_ERR,
+ VHOST_RDMA_IB_WC_GENERAL_ERR
+};
+/** @} */
+
+/**
+ * @defgroup res_state Responder Resource States
+ * @{
+ */
+enum vhost_rdma_res_state {
+ VHOST_RDMA_RES_STATE_NEXT,
+ VHOST_RDMA_RES_STATE_NEW,
+ VHOST_RDMA_RES_STATE_REPLAY,
+};
+/** @} */
+
+/**
+ * @defgroup vhost_user_requests Vhost-user Message Types
+ * @{
+ */
+enum vhost_user_rdma_request {
+ VHOST_USER_NONE = 0,
+ VHOST_USER_GET_FEATURES = 1,
+ VHOST_USER_SET_FEATURES = 2,
+ VHOST_USER_SET_OWNER = 3,
+ VHOST_USER_RESET_OWNER = 4,
+ VHOST_USER_SET_MEM_TABLE = 5,
+ VHOST_USER_SET_LOG_BASE = 6,
+ VHOST_USER_SET_LOG_FD = 7,
+ VHOST_USER_SET_VRING_NUM = 8,
+ VHOST_USER_SET_VRING_ADDR = 9,
+ VHOST_USER_SET_VRING_BASE = 10,
+ VHOST_USER_GET_VRING_BASE = 11,
+ VHOST_USER_SET_VRING_KICK = 12,
+ VHOST_USER_SET_VRING_CALL = 13,
+ VHOST_USER_SET_VRING_ERR = 14,
+ VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+ VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+ VHOST_USER_GET_QUEUE_NUM = 17,
+ VHOST_USER_SET_VRING_ENABLE = 18,
+ VHOST_USER_GET_CONFIG = 24,
+ VHOST_USER_SET_CONFIG = 25,
+ VHOST_USER_MAX
+};
+/** @} */
+
+/**
+ * @brief QP capabilities structure
+ */
+struct vhost_rdma_qp_cap {
+ uint32_t max_send_wr; /**< Max work requests in send queue */
+ uint32_t max_send_sge; /**< Max scatter-gather elements per send WR */
+ uint32_t max_recv_wr; /**< Max work requests in receive queue */
+ uint32_t max_recv_sge; /**< Max SGEs per receive WR */
+ uint32_t max_inline_data; /**< Max inline data size supported */
+};
+
+/**
+ * @brief Global route attributes (used in AH/GRH)
+ */
+struct vhost_rdma_global_route {
+ uint8_t dgid[16]; /**< Destination GID or MGID */
+ uint32_t flow_label; /**< IPv6-style flow label */
+ uint8_t sgid_index; /**< Source GID table index */
+ uint8_t hop_limit; /**< TTL/Hop Limit */
+ uint8_t traffic_class; /**< Traffic class field */
+};
+
+/**
+ * @brief Address Handle (AH) attributes
+ */
+struct vhost_rdma_ah_attr {
+ struct vhost_rdma_global_route grh; /**< GRH fields */
+ uint8_t sl; /**< Service Level */
+ uint8_t static_rate; /**< Static rate (encoding) */
+ uint8_t port_num; /**< Physical port number */
+ uint8_t ah_flags; /**< Flags (e.g., GRH present) */
+ uint8_t dmac[6]; /**< Destination MAC address (for RoCE) */
+} __rte_packed;
+
+/**
+ * @brief Queue Pair attributes
+ */
+struct vhost_rdma_qp_attr {
+ enum vhost_rdma_ib_qp_state qp_state; /**< Target QP state */
+ enum vhost_rdma_ib_qp_state cur_qp_state; /**< Current QP state */
+ enum vhost_rdma_ib_mtu path_mtu; /**< Path MTU */
+ uint32_t qkey; /**< QKey for UD/RC */
+ uint32_t rq_psn; /**< Receive PSN */
+ uint32_t sq_psn; /**< Send PSN */
+ uint32_t dest_qp_num; /**< Remote QPN */
+ uint32_t qp_access_flags; /**< Access permissions */
+ uint8_t sq_draining; /**< Is SQ draining? */
+ uint8_t max_rd_atomic; /**< Max outstanding RDMA reads/atomics */
+ uint8_t max_dest_rd_atomic; /**< Max at responder side */
+ uint8_t min_rnr_timer; /**< Minimum RNR NAK timer value */
+ uint8_t timeout; /**< Timeout exponent for ACKs */
+ uint8_t retry_cnt; /**< Retry counter limit */
+ uint8_t rnr_retry; /**< RNR retry count */
+ uint32_t rate_limit; /**< Rate limit (Mb/s) */
+ struct vhost_rdma_qp_cap cap; /**< QP capacity limits */
+ struct vhost_rdma_ah_attr ah_attr; /**< AH attributes for RC/UC */
+};
+
+/**
+ * @brief Protection Domain (PD)
+ */
+struct vhost_rdma_pd {
+ struct vhost_rdma_device *dev; /**< Backing device */
+ uint32_t pdn; /**< PD identifier */
+ rte_atomic32_t refcnt; /**< Reference count */
+};
+
+/**
+ * @brief Generic queue abstraction (used for SQ/RQ)
+ */
+struct vhost_rdma_queue {
+ struct vhost_user_queue *vq; /**< Associated vhost vring */
+ void *data; /**< Ring buffer base pointer */
+ size_t elem_size; /**< Size of each element */
+ size_t num_elems; /**< Number of elements */
+ uint16_t consumer_index; /**< Consumer index (local) */
+ uint16_t producer_index; /**< Producer index (from guest) */
+
+ struct rte_intr_handle intr_handle; /**< Interrupt handler */
+ rte_intr_callback_fn cb; /**< Optional callback on kick */
+};
+
+/**
+ * @brief Padded memory region layout (fixed-size vhost_memory)
+ */
+struct vhost_memory_padded {
+ uint32_t nregions; /**< Number of valid regions */
+ uint32_t padding; /**< Alignment padding */
+ struct vhost_memory_region regions[VHOST_USER_MEMORY_MAX_NREGIONS];
+};
+
+/**
+ * @brief Configuration access payload
+ */
+struct vhost_user_rdma_config {
+ uint32_t offset; /**< Offset in config space */
+ uint32_t size; /**< Data size */
+ uint32_t flags; /**< Reserved/flags */
+ uint8_t region[VHOST_USER_MAX_CONFIG_SIZE]; /**< Config data */
+};
+
+/**
+ * @brief Vhost-user RDMA message structure
+ */
+struct vhost_user_rdma_msg {
+ enum vhost_user_rdma_request request;
+
+#define VHOST_USER_VERSION_MASK 0x3
+#define VHOST_USER_REPLY_MASK (0x1 << 2)
+ uint32_t flags; /**< Version and reply flag */
+ uint32_t size; /**< Payload size */
+
+ union {
+#define VHOST_USER_VRING_IDX_MASK 0xff
+#define VHOST_USER_VRING_NOFD_MASK (0x1 << 8)
+ uint64_t u64;
+ struct vhost_vring_state state;
+ struct vhost_vring_addr addr;
+ struct vhost_memory_padded memory;
+ struct vhost_user_rdma_config cfg;
+ } payload;
+} __rte_packed;
+
+/**
+ * @brief Completion Queue (CQ)
+ */
+struct vhost_rdma_cq {
+ struct vhost_queue *vq; /**< Notification V-ring */
+ rte_spinlock_t cq_lock; /**< Protect CQ operations */
+ uint8_t notify; /**< Notify pending flag */
+ bool is_dying; /**< Being destroyed */
+
+ uint32_t cqn; /**< CQ identifier */
+ rte_atomic32_t refcnt; /**< Reference count */
+};
+
+/**
+ * @brief Send Queue (SQ) container
+ */
+struct vhost_rdma_sq {
+ rte_spinlock_t lock; /**< Guard SQ access */
+ struct vhost_rdma_queue queue; /**< Underlying ring */
+};
+
+/**
+ * @brief Receive Queue (RQ) container
+ */
+struct vhost_rdma_rq {
+ rte_spinlock_t lock; /**< Guard RQ access */
+ struct vhost_rdma_queue queue; /**< Underlying ring */
+};
+
+/**
+ * @brief Address Vector (AV) - cached routing info
+ */
+struct vhost_rdma_av {
+ uint8_t network_type; /**< e.g., IPv4/IPv6/Ethernet */
+ uint8_t dmac[6]; /**< Destination MAC */
+ struct vhost_rdma_global_route grh; /**< GRH fields */
+
+ union {
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr; /**< GID resolution cache (optional) */
+};
+
+/**
+ * @brief Lightweight task abstraction with scheduling support
+ */
+struct vhost_rdma_task {
+ char name[8]; /**< Task name (debug) */
+ int state; /**< Execution state */
+ bool destroyed; /**< Marked for cleanup */
+ rte_atomic16_t sched; /**< Scheduled flag */
+ rte_spinlock_t state_lock; /**< Lock for state transitions */
+ struct rte_ring *task_ring; /**< Work submission ring */
+
+ int (*func)(void *arg); /**< Task function */
+ void *arg; /**< Argument to func */
+ int ret; /**< Return code */
+};
+
+/**
+ * @brief Requester-side operation tracking
+ */
+struct vhost_rdma_req_info {
+ enum vhost_rdma_ib_qp_state state;
+ int wqe_index; /**< Current WQE index */
+ uint32_t psn; /**< Packet Sequence Number */
+ int opcode; /**< Operation type */
+ rte_atomic32_t rd_atomic; /**< Outstanding RDMA read/atomic count */
+ int wait_fence; /**< Fence required */
+ int need_rd_atomic; /**< Need atomic resource */
+ int wait_psn; /**< Waiting for PSN gap */
+ int need_retry; /**< Should retry */
+ int noack_pkts; /**< Packets sent without ACK */
+ struct vhost_rdma_task task; /**< Retransmission task */
+};
+
+/**
+ * @brief Completer-side retry and retransmit context
+ */
+struct vhost_rdma_comp_info {
+ uint32_t psn; /**< Last packet PSN */
+ int opcode;
+ int timeout; /**< Timeout occurred */
+ int timeout_retry;
+ int started_retry;
+ uint32_t retry_cnt;
+ uint32_t rnr_retry;
+ struct vhost_rdma_task task; /**< RNR/retry handling task */
+};
+
+/**
+ * @brief Scatter-Gather Element (SGE)
+ */
+struct vhost_rdma_sge {
+ __le64 addr; /**< Guest virtual address */
+ __le32 length; /**< Length in bytes */
+ __le32 lkey; /**< Local key */
+};
+
+/**
+ * @brief DMA transfer context
+ */
+struct vhost_rdma_dma_info {
+ uint32_t length; /**< Total transfer length */
+ uint32_t resid; /**< Remaining bytes */
+ uint32_t cur_sge; /**< Current SGE index */
+ uint32_t num_sge; /**< Total SGE count */
+ uint32_t sge_offset; /**< Offset within current SGE */
+ uint32_t reserved;
+ union {
+ uint8_t *inline_data; /**< Inline data pointer */
+ struct vhost_rdma_sge *sge; /**< SGE array */
+ void *raw; /**< Generic pointer */
+ };
+};
+
+/**
+ * @brief Receive Work Queue Entry (WQE)
+ */
+struct vhost_rdma_recv_wqe {
+ __aligned_u64 wr_id; /**< User-defined WR ID */
+ __u32 num_sge;
+ __u32 padding;
+ struct vhost_rdma_dma_info dma; /**< DMA context */
+};
+
+/**
+ * @brief Memory Region (MR) types
+ */
+enum vhost_rdma_mr_type {
+ VHOST_MR_TYPE_NONE,
+ VHOST_MR_TYPE_DMA,
+ VHOST_MR_TYPE_MR,
+};
+
+/**
+ * @brief MR lifecycle states
+ */
+enum vhost_rdma_mr_state {
+ VHOST_MR_STATE_ZOMBIE,
+ VHOST_MR_STATE_INVALID,
+ VHOST_MR_STATE_FREE,
+ VHOST_MR_STATE_VALID,
+};
+
+/**
+ * @brief Memory Region (MR) object
+ */
+struct vhost_rdma_mr {
+ struct vhost_rdma_pd *pd; /**< Owning PD */
+ enum vhost_rdma_mr_type type; /**< Type of MR */
+ enum vhost_rdma_mr_state state; /**< State machine */
+ uint64_t va; /**< Virtual address (host VA) */
+ uint64_t iova; /**< IOVA / virtual address in guest */
+ size_t length; /**< Length of mapping */
+ uint32_t offset; /**< Offset in page array */
+ int access; /**< Access flags (e.g., LOCAL_WRITE) */
+
+ uint32_t lkey; /**< Local key */
+ uint32_t rkey; /**< Remote key */
+
+ uint32_t npages; /**< Number of mapped pages */
+ uint32_t max_pages; /**< Allocated page array size */
+ uint64_t *pages; /**< Array of page addresses */
+
+ uint32_t mrn; /**< MR identifier */
+ rte_atomic32_t refcnt; /**< Reference counter */
+};
+
+/**
+ * @brief Responder resource (used for replay and ACK handling)
+ */
+struct vhost_rdma_resp_res {
+ int type; /**< Resource type */
+ int replay; /**< Is this a replay? */
+ uint32_t first_psn;
+ uint32_t last_psn;
+ uint32_t cur_psn;
+ enum vhost_rdma_res_state state;
+
+ union {
+ struct {
+ struct rte_mbuf *mbuf; /**< Packet buffer */
+ } atomic;
+ struct {
+ struct vhost_rdma_mr *mr;
+ uint64_t va_org; /**< Original VA */
+ uint32_t rkey;
+ uint32_t length;
+ uint64_t va; /**< Current VA */
+ uint32_t resid; /**< Residual length */
+ } read;
+ };
+};
+
+/**
+ * @brief Response processing context (responder side)
+ */
+struct vhost_rdma_resp_info {
+ enum vhost_rdma_ib_qp_state state;
+ uint32_t msn; /**< Message sequence number */
+ uint32_t psn; /**< Current PSN */
+ uint32_t ack_psn; /**< Acknowledged PSN */
+ int opcode;
+ int drop_msg; /**< Drop current message */
+ int goto_error; /**< Transition to error state */
+ int sent_psn_nak; /**< Has sent NAK */
+ enum vhost_rdma_ib_wc_status status;
+ uint8_t aeth_syndrome; /**< AETH error code */
+
+ /* Receive path only */
+ struct vhost_rdma_recv_wqe *wqe;
+
+ /* RDMA read / atomic operations */
+ uint64_t va;
+ uint64_t offset;
+ struct vhost_rdma_mr *mr;
+ uint32_t resid;
+ uint32_t rkey;
+ uint32_t length;
+ uint64_t atomic_orig;
+
+ /* Circular buffer of responder resources */
+ struct vhost_rdma_resp_res *resources;
+ unsigned int res_head;
+ unsigned int res_tail;
+ struct vhost_rdma_resp_res *res;
+
+ struct vhost_rdma_task task; /**< Timeout/retry task */
+};
+
+/**
+ * @brief Queue Pair (QP)
+ */
+struct vhost_rdma_qp {
+ struct vhost_rdma_device *dev; /**< Parent device */
+ struct vhost_rdma_qp_attr attr; /**< Current attributes */
+ uint32_t qpn; /**< Queue Pair Number */
+ uint8_t type; /**< QP type (RC/UC/UD) */
+ unsigned int valid; /**< Is QP active? */
+ unsigned int mtu; /**< Effective MTU in bytes */
+
+ struct vhost_rdma_pd *pd; /**< Owning PD */
+ struct vhost_rdma_cq *scq; /**< Send CQ */
+ struct vhost_rdma_cq *rcq; /**< Receive CQ */
+
+ uint8_t sq_sig_all; /**< Every send WQE signals completion */
+
+ struct vhost_rdma_sq sq; /**< Send Queue */
+ struct vhost_rdma_rq rq; /**< Receive Queue */
+ void *srq; /**< Shared Receive Queue (reserved) */
+
+ uint32_t dst_cookie; /**< Cookie from destination */
+ uint16_t src_port; /**< Source UDP port (RoCE) */
+
+ struct vhost_rdma_av av; /**< Cached path information */
+
+ struct rte_ring *req_pkts; /**< Request packets ring (from guest) */
+ struct rte_mbuf *req_pkts_head; /**< Head for peeking packets */
+ struct rte_ring *resp_pkts; /**< Response packets ring (to guest) */
+
+ struct vhost_rdma_req_info req; /**< Requester context */
+ struct vhost_rdma_comp_info comp; /**< Completer context */
+ struct vhost_rdma_resp_info resp; /**< Responder context */
+
+ rte_atomic32_t ssn; /**< Send Sequence Number */
+ rte_atomic32_t mbuf_out; /**< Number of mbufs in flight */
+ int need_req_mbuf; /**< Need more mbufs for requests */
+
+ /* Retransmission timer (RC only) */
+ struct rte_timer retrans_timer;
+ uint64_t qp_timeout_ticks;
+
+ /* RNR NAK handling timer */
+ struct rte_timer rnr_nak_timer;
+
+ rte_spinlock_t state_lock; /**< Protect state changes */
+ rte_atomic32_t refcnt; /**< Reference count */
+};
+
+/**
+ * @brief User-space SGE (control path)
+ */
+struct vhost_user_rdma_sge {
+ uint64_t addr; /**< Host/user virtual address */
+ uint32_t length;
+ uint32_t lkey;
+};
+
+struct vhost_rdma_ctrl_hdr {
+ uint8_t cmd;
+};
+
+/**
+ * @brief Convert IB MTU enum to byte size
+ * @param mtu The MTU enum value
+ * @return Byte size on success, -1 if invalid
+ */
+static inline int
+ib_mtu_enum_to_int(enum vhost_rdma_ib_mtu mtu)
+{
+ switch (mtu) {
+ case VHOST_RDMA_IB_MTU_256: return 256;
+ case VHOST_RDMA_IB_MTU_512: return 512;
+ case VHOST_RDMA_IB_MTU_1024: return 1024;
+ case VHOST_RDMA_IB_MTU_2048: return 2048;
+ case VHOST_RDMA_IB_MTU_4096: return 4096;
+ default: return -1;
+ }
+}
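+
+/*
+ * Example: vhost_rdma_init_ib() derives the byte MTU cap from the enum; e.g.
+ * assuming DEFAULT_IB_MTU == VHOST_RDMA_IB_MTU_1024:
+ *
+ *	dev->mtu_cap = ib_mtu_enum_to_int(DEFAULT_IB_MTU);  // -> 1024
+ */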
+
+/* Function declarations */
+
+/**
+ * @brief Initialize RDMA device's IB attributes and resource pools
+ * @param dev RDMA device instance
+ */
+void vhost_rdma_init_ib(struct vhost_rdma_device *dev);
+
+/**
+ * @brief Destroy all IB resources and release memory pools
+ * @param dev RDMA device instance
+ */
+void vhost_rdma_destroy_ib(struct vhost_rdma_device *dev);
+
+/**
+ * @brief Handle control virtqueue messages (device configuration)
+ * @param arg Pointer to device or thread context
+ */
+void vhost_rdma_handle_ctrl_vq(void *arg);
+
+/**
+ * @brief Main scheduler loop for RDMA tasks (retries, timeouts)
+ * @param arg Device context
+ * @return 0 on exit
+ */
+int vhost_rdma_task_scheduler(void *arg);
+
+/**
+ * @brief Cleanup callback for MR pool objects
+ * @param arg Pointer to struct vhost_rdma_mr
+ */
+void vhost_rdma_mr_cleanup(void *arg);
+
+/**
+ * @brief Cleanup callback for QP pool objects
+ * @param arg Pointer to struct vhost_rdma_qp
+ */
+void vhost_rdma_qp_cleanup(void *arg);
+
+/**
+ * @brief Clean up a vhost_rdma_queue (drain rings, unregister interrupts)
+ * @param qp Owning QP
+ * @param queue Queue to clean
+ */
+void vhost_rdma_queue_cleanup(struct vhost_rdma_qp *qp, struct vhost_rdma_queue *queue);
+
+/**
+ * @brief Release one RDMA read/atomic responder resource
+ * @param qp QP owning the resource
+ * @param res Resource to free
+ */
+void free_rd_atomic_resource(struct vhost_rdma_qp *qp, struct vhost_rdma_resp_res *res);
+
+/**
+ * @brief Release all RDMA read/atomic responder resources
+ * @param qp QP whose resources to free
+ */
+void free_rd_atomic_resources(struct vhost_rdma_qp *qp);
+
+int setup_iovs_from_descs(struct rte_vhost_memory *mem,
+ struct vhost_user_queue *vq,
+ uint16_t req_idx,
+ struct iovec *iovs,
+ uint16_t num_iovs,
+ uint16_t *num_in,
+ uint16_t *num_out);
+
+#endif /* __VHOST_RDMA_IB_H__ */
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_log.h b/examples/vhost_user_rdma/vhost_rdma_log.h
new file mode 100644
index 0000000000..dfb4d1adae
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_log.h
@@ -0,0 +1,52 @@
+/*
+ * Vhost-user RDMA device: logging macros
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#ifndef __VHOST_RDMA_LOG_H__
+#define __VHOST_RDMA_LOG_H__
+
+#include <rte_log.h>
+
+#define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER2
+#define RTE_LOGTYPE_ETHER RTE_LOGTYPE_USER3
+#define RTE_LOGTYPE_RDMA RTE_LOGTYPE_USER1
+
+#define LOG_DEBUG_DP(f, ...) RTE_LOG_DP(DEBUG, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_INFO_DP(f, ...) RTE_LOG_DP(INFO, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_WARN_DP(f, ...) RTE_LOG_DP(WARNING, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_ERR_DP(f, ...) RTE_LOG_DP(ERR, ETHER, f "\n", ##__VA_ARGS__)
+
+#define LOG_DEBUG(f, ...) RTE_LOG(DEBUG, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_INFO(f, ...) RTE_LOG(INFO, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_WARN(f, ...) RTE_LOG(WARNING, ETHER, f "\n", ##__VA_ARGS__)
+#define LOG_ERR(f, ...) RTE_LOG(ERR, ETHER, f "\n", ##__VA_ARGS__)
+
+#define RDMA_LOG_DEBUG(f, ...) RTE_LOG(DEBUG, RDMA, "[ %s ]: " f "\n", __func__, ##__VA_ARGS__)
+#define RDMA_LOG_INFO(f, ...) RTE_LOG(INFO, RDMA, "[ %s ]: " f "\n", __func__, ##__VA_ARGS__)
+#define RDMA_LOG_ERR(f, ...) RTE_LOG(ERR, RDMA, "[ %s ]: " f "\n", __func__, ##__VA_ARGS__)
+
+#ifdef DEBUG_RDMA_DP
+#define RDMA_LOG_DEBUG_DP(f, ...) RTE_LOG(DEBUG, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#define RDMA_LOG_INFO_DP(f, ...) RTE_LOG(INFO, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#define RDMA_LOG_ERR_DP(f, ...) RTE_LOG(ERR, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#else
+#define RDMA_LOG_DEBUG_DP(f, ...) RTE_LOG_DP(DEBUG, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#define RDMA_LOG_INFO_DP(f, ...) RTE_LOG_DP(INFO, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#define RDMA_LOG_ERR_DP(f, ...) RTE_LOG_DP(ERR, RDMA, "[%u] " f "\n", \
+ rte_lcore_id(), ##__VA_ARGS__)
+#endif
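+
+/*
+ * Usage sketch (call site names are illustrative): the RDMA_LOG_* macros
+ * prepend the calling function, while the *_DP variants prepend the lcore ID
+ * and, via RTE_LOG_DP, compile out of the data path below RTE_LOG_DP_LEVEL:
+ *
+ *	RDMA_LOG_ERR("failed to allocate QP %u", qpn);
+ *	// -> "[ create_qp ]: failed to allocate QP 1"
+ */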
+
+#endif
\ No newline at end of file
diff --git a/examples/vhost_user_rdma/vhost_rdma_pkt.h b/examples/vhost_user_rdma/vhost_rdma_pkt.h
new file mode 100644
index 0000000000..2bbc030e0a
--- /dev/null
+++ b/examples/vhost_user_rdma/vhost_rdma_pkt.h
@@ -0,0 +1,296 @@
+/**
+ * @file vhost_rdma_pkt.h
+ * @brief Vhost-user RDMA packet format and opcode definitions.
+ *
+ * This header defines the internal packet representation, InfiniBand/RoCE header layout,
+ * opcode mapping, and control flags used during packet parsing and transmission
+ * in the vhost-user RDMA backend.
+ *
+ * Copyright (C) 2025 KylinSoft Inc. and/or its affiliates. All rights reserved.
+ *
+ * Author: Xiong Weimin <xiongweimin@...inos.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __VHOST_RDMA_PKT_H__
+#define __VHOST_RDMA_PKT_H__
+
+#include <stdint.h>
+#include <stddef.h>
+
+#include <rte_byteorder.h>
+#include <rte_mbuf.h> /* For struct rte_mbuf if needed later */
+
+/* Forward declarations */
+struct vhost_rdma_device;
+struct vhost_rdma_qp;
+struct vhost_rdma_send_wqe;
+
+#ifndef BIT
+#define BIT(x) (1U << (x)) /**< Generate bitmask from bit index */
+#endif
+
+/**
+ * @defgroup constants Constants & Limits
+ * @{
+ */
+
+/** Maximum number of QP types supported for WR mask dispatching */
+#define WR_MAX_QPT 8
+
+/** Invalid opcode marker */
+#define OPCODE_NONE (-1)
+
+/** Size of the opcode table: the full 8-bit IB opcode space */
+#define VHOST_NUM_OPCODE 256
+
+/** @} */
+
+/**
+ * @defgroup wr_masks Work Request Type Masks
+ * @{
+ */
+enum vhost_rdma_wr_mask {
+ WR_INLINE_MASK = BIT(0), /**< WR contains inline data */
+ WR_ATOMIC_MASK = BIT(1), /**< WR is an atomic operation */
+ WR_SEND_MASK = BIT(2), /**< WR is a send-type operation */
+ WR_READ_MASK = BIT(3), /**< WR initiates RDMA read */
+ WR_WRITE_MASK = BIT(4), /**< WR performs RDMA write */
+ WR_LOCAL_OP_MASK = BIT(5), /**< WR triggers local memory op */
+
+ WR_READ_OR_WRITE_MASK = WR_READ_MASK | WR_WRITE_MASK,
+ WR_READ_WRITE_OR_SEND_MASK = WR_READ_OR_WRITE_MASK | WR_SEND_MASK,
+ WR_WRITE_OR_SEND_MASK = WR_WRITE_MASK | WR_SEND_MASK,
+ WR_ATOMIC_OR_READ_MASK = WR_ATOMIC_MASK | WR_READ_MASK,
+};
+/** @} */
+
+/**
+ * @brief Metadata about each Work Request (WR) opcode
+ *
+ * Used to determine which operations are valid per QP type.
+ */
+struct vhost_rdma_wr_opcode_info {
+ const char *name; /**< Human-readable name */
+ enum vhost_rdma_wr_mask mask[WR_MAX_QPT]; /**< Validity per QP type */
+};
+
+/* Extern declaration of global opcode metadata table */
+extern struct vhost_rdma_wr_opcode_info vhost_rdma_wr_opcode_info[];
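+
+/*
+ * Illustrative entry (sketch; the table itself is populated by a later patch
+ * in this series, and the index and QP-type values here are placeholders):
+ *
+ *	[5] = {
+ *		.name = "IB_WR_RDMA_WRITE",
+ *		.mask = { [2] = WR_WRITE_MASK },  // valid only on QP type 2
+ *	},
+ */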
+
+/**
+ * @defgroup hdr_types Header Types (for offset tracking)
+ * @{
+ */
+enum vhost_rdma_hdr_type {
+ VHOST_RDMA_LRH, /**< Link Layer Header (InfiniBand only) */
+ VHOST_RDMA_GRH, /**< Global Route Header (IPv6-style GIDs) */
+ VHOST_RDMA_BTH, /**< Base Transport Header */
+ VHOST_RDMA_RETH, /**< RDMA Extended Transport Header */
+ VHOST_RDMA_AETH, /**< Acknowledge/Error Header */
+ VHOST_RDMA_ATMETH, /**< Atomic Operation Request Header */
+ VHOST_RDMA_ATMACK, /**< Atomic Operation Response Header */
+ VHOST_RDMA_IETH, /**< Immediate Data + Error Code Header */
+ VHOST_RDMA_RDETH, /**< Reliable Datagram Extended Transport Header */
+ VHOST_RDMA_DETH, /**< Datagram Endpoint Identifier Header */
+ VHOST_RDMA_IMMDT, /**< Immediate Data Header */
+ VHOST_RDMA_PAYLOAD, /**< Payload section */
+ NUM_HDR_TYPES /**< Number of known header types */
+};
+/** @} */
+
+/**
+ * @defgroup hdr_masks Header Presence and Semantic Flags
+ * @{
+ */
+enum vhost_rdma_hdr_mask {
+ VHOST_LRH_MASK = BIT(VHOST_RDMA_LRH),
+ VHOST_GRH_MASK = BIT(VHOST_RDMA_GRH),
+ VHOST_BTH_MASK = BIT(VHOST_RDMA_BTH),
+ VHOST_IMMDT_MASK = BIT(VHOST_RDMA_IMMDT),
+ VHOST_RETH_MASK = BIT(VHOST_RDMA_RETH),
+ VHOST_AETH_MASK = BIT(VHOST_RDMA_AETH),
+ VHOST_ATMETH_MASK = BIT(VHOST_RDMA_ATMETH),
+ VHOST_ATMACK_MASK = BIT(VHOST_RDMA_ATMACK),
+ VHOST_IETH_MASK = BIT(VHOST_RDMA_IETH),
+ VHOST_RDETH_MASK = BIT(VHOST_RDMA_RDETH),
+ VHOST_DETH_MASK = BIT(VHOST_RDMA_DETH),
+ VHOST_PAYLOAD_MASK = BIT(VHOST_RDMA_PAYLOAD),
+
+ /* Semantic packet type flags */
+ VHOST_REQ_MASK = BIT(NUM_HDR_TYPES + 0), /**< Request packet */
+ VHOST_ACK_MASK = BIT(NUM_HDR_TYPES + 1), /**< ACK/NACK packet */
+ VHOST_SEND_MASK = BIT(NUM_HDR_TYPES + 2), /**< Send operation */
+ VHOST_WRITE_MASK = BIT(NUM_HDR_TYPES + 3), /**< RDMA Write */
+ VHOST_READ_MASK = BIT(NUM_HDR_TYPES + 4), /**< RDMA Read */
+ VHOST_ATOMIC_MASK = BIT(NUM_HDR_TYPES + 5), /**< Atomic operation */
+
+ VHOST_RWR_MASK = BIT(NUM_HDR_TYPES + 6), /**< Consumes a receive WQE */
+ VHOST_COMP_MASK = BIT(NUM_HDR_TYPES + 7), /**< Completion required */
+
+ /* Packet fragmentation flags */
+ VHOST_START_MASK = BIT(NUM_HDR_TYPES + 8), /**< First fragment */
+ VHOST_MIDDLE_MASK = BIT(NUM_HDR_TYPES + 9), /**< Middle fragment */
+ VHOST_END_MASK = BIT(NUM_HDR_TYPES + 10), /**< Last fragment */
+
+ VHOST_LOOPBACK_MASK = BIT(NUM_HDR_TYPES + 12), /**< Loopback within host */
+
+ /* Composite masks */
+ VHOST_READ_OR_ATOMIC = (VHOST_READ_MASK | VHOST_ATOMIC_MASK),
+ VHOST_WRITE_OR_SEND = (VHOST_WRITE_MASK | VHOST_SEND_MASK),
+};
+/** @} */
+
+/**
+ * @brief Per-opcode metadata for parsing and validation
+ */
+struct vhost_rdma_opcode_info {
+ const char *name; /**< Opcode name (e.g., "RC SEND_FIRST") */
+ int length; /**< Fixed payload length (if any) */
+ int offset[NUM_HDR_TYPES]; /**< Offset of each header within packet */
+ enum vhost_rdma_hdr_mask mask; /**< Header presence and semantic flags */
+};
+
+/* Global opcode info table (indexed by IB opcode byte) */
+extern struct vhost_rdma_opcode_info vhost_rdma_opcode[VHOST_NUM_OPCODE];
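+
+/*
+ * Lookup sketch (variable names are illustrative): a receive path can
+ * classify a packet straight from the BTH opcode byte using this table:
+ *
+ *	const struct vhost_rdma_opcode_info *info = &vhost_rdma_opcode[pkt->opcode];
+ *	uint8_t *reth;
+ *
+ *	if (info->mask & VHOST_RETH_MASK)	// RETH present for this opcode?
+ *		reth = pkt->hdr + info->offset[VHOST_RDMA_RETH];
+ */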
+
+/**
+ * @brief Helper macro to define IB opcodes by transport and operation
+ *
+ * Expands to e.g.: `IB_OPCODE_RC_SEND_FIRST = IB_OPCODE_RC + IB_OPCODE_SEND_FIRST`
+ */
+#define IB_OPCODE(transport, op) \
+ IB_OPCODE_ ## transport ## _ ## op = \
+ (IB_OPCODE_ ## transport + IB_OPCODE_ ## op)
+
+/**
+ * @defgroup ib_opcodes InfiniBand OpCode Definitions
+ *
+ * Based on IBTA Vol 1 Table 38 and extended for RoCE semantics.
+ * @{
+ */
+
+enum {
+ /* Transport types (base values) */
+ IB_OPCODE_RC = 0x00, /**< Reliable Connection */
+ IB_OPCODE_UC = 0x20, /**< Unreliable Connection */
+ IB_OPCODE_RD = 0x40, /**< Reliable Datagram */
+ IB_OPCODE_UD = 0x60, /**< Unreliable Datagram */
+ IB_OPCODE_CNP = 0x80, /**< Congestion Notification Packet */
+ IB_OPCODE_MSP = 0xe0, /**< Manufacturer Specific Protocol */
+
+ /* Operation subtypes */
+ IB_OPCODE_SEND_FIRST = 0x00,
+ IB_OPCODE_SEND_MIDDLE = 0x01,
+ IB_OPCODE_SEND_LAST = 0x02,
+ IB_OPCODE_SEND_LAST_WITH_IMMEDIATE = 0x03,
+ IB_OPCODE_SEND_ONLY = 0x04,
+ IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE = 0x05,
+ IB_OPCODE_RDMA_WRITE_FIRST = 0x06,
+ IB_OPCODE_RDMA_WRITE_MIDDLE = 0x07,
+ IB_OPCODE_RDMA_WRITE_LAST = 0x08,
+ IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE = 0x09,
+ IB_OPCODE_RDMA_WRITE_ONLY = 0x0a,
+ IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE = 0x0b,
+ IB_OPCODE_RDMA_READ_REQUEST = 0x0c,
+ IB_OPCODE_RDMA_READ_RESPONSE_FIRST = 0x0d,
+ IB_OPCODE_RDMA_READ_RESPONSE_MIDDLE = 0x0e,
+ IB_OPCODE_RDMA_READ_RESPONSE_LAST = 0x0f,
+ IB_OPCODE_RDMA_READ_RESPONSE_ONLY = 0x10,
+ IB_OPCODE_ACKNOWLEDGE = 0x11,
+ IB_OPCODE_ATOMIC_ACKNOWLEDGE = 0x12,
+ IB_OPCODE_COMPARE_SWAP = 0x13,
+ IB_OPCODE_FETCH_ADD = 0x14,
+ /* 0x15 is reserved */
+ IB_OPCODE_SEND_LAST_WITH_INVALIDATE = 0x16,
+ IB_OPCODE_SEND_ONLY_WITH_INVALIDATE = 0x17,
+
+ /* Real opcodes generated via IB_OPCODE() macro */
+ IB_OPCODE(RC, SEND_FIRST),
+ IB_OPCODE(RC, SEND_MIDDLE),
+ IB_OPCODE(RC, SEND_LAST),
+ IB_OPCODE(RC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, SEND_ONLY),
+ IB_OPCODE(RC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_FIRST),
+ IB_OPCODE(RC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RC, RDMA_WRITE_LAST),
+ IB_OPCODE(RC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY),
+ IB_OPCODE(RC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RC, RDMA_READ_REQUEST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RC, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RC, ACKNOWLEDGE),
+ IB_OPCODE(RC, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RC, COMPARE_SWAP),
+ IB_OPCODE(RC, FETCH_ADD),
+ IB_OPCODE(RC, SEND_LAST_WITH_INVALIDATE),
+ IB_OPCODE(RC, SEND_ONLY_WITH_INVALIDATE),
+
+ /* UC opcodes */
+ IB_OPCODE(UC, SEND_FIRST),
+ IB_OPCODE(UC, SEND_MIDDLE),
+ IB_OPCODE(UC, SEND_LAST),
+ IB_OPCODE(UC, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, SEND_ONLY),
+ IB_OPCODE(UC, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_FIRST),
+ IB_OPCODE(UC, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(UC, RDMA_WRITE_LAST),
+ IB_OPCODE(UC, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY),
+ IB_OPCODE(UC, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+
+ /* RD opcodes */
+ IB_OPCODE(RD, SEND_FIRST),
+ IB_OPCODE(RD, SEND_MIDDLE),
+ IB_OPCODE(RD, SEND_LAST),
+ IB_OPCODE(RD, SEND_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, SEND_ONLY),
+ IB_OPCODE(RD, SEND_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_FIRST),
+ IB_OPCODE(RD, RDMA_WRITE_MIDDLE),
+ IB_OPCODE(RD, RDMA_WRITE_LAST),
+ IB_OPCODE(RD, RDMA_WRITE_LAST_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY),
+ IB_OPCODE(RD, RDMA_WRITE_ONLY_WITH_IMMEDIATE),
+ IB_OPCODE(RD, RDMA_READ_REQUEST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_FIRST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_MIDDLE),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_LAST),
+ IB_OPCODE(RD, RDMA_READ_RESPONSE_ONLY),
+ IB_OPCODE(RD, ACKNOWLEDGE),
+ IB_OPCODE(RD, ATOMIC_ACKNOWLEDGE),
+ IB_OPCODE(RD, COMPARE_SWAP),
+ IB_OPCODE(RD, FETCH_ADD),
+
+ /* UD opcodes */
+ IB_OPCODE(UD, SEND_ONLY),
+ IB_OPCODE(UD, SEND_ONLY_WITH_IMMEDIATE)
+};
+/** @} */
+
+/**
+ * @brief Runtime packet context used during processing
+ */
+struct vhost_rdma_pkt_info {
+ struct vhost_rdma_device *dev; /**< Owning device */
+ struct vhost_rdma_qp *qp; /**< Associated QP */
+ struct vhost_rdma_send_wqe *wqe; /**< Corresponding send WQE (if applicable) */
+ uint8_t *hdr; /**< Pointer to BTH (Base Transport Header) */
+ uint32_t mask; /**< Semantic flags (from vhost_rdma_hdr_mask) */
+ uint32_t psn; /**< Packet Sequence Number from BTH */
+ uint16_t pkey_index; /**< Partition key index */
+ uint16_t paylen; /**< Payload length (BTH to ICRC) */
+ uint8_t port_num; /**< Port this packet was received on */
+ uint8_t opcode; /**< BTH opcode field */
+};
+
+#endif /* __VHOST_RDMA_PKT_H__ */
\ No newline at end of file
--
2.43.0