[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251218091050.55047-3-15927021679@163.com>
Date: Thu, 18 Dec 2025 17:09:42 +0800
From: Xiong Weimin <15927021679@....com>
To: "Michael S . Tsirkin" <mst@...hat.com>,
David Hildenbrand <david@...hat.com>,
Jason Wang <jasowang@...hat.com>,
Stefano Garzarella <sgarzare@...hat.com>,
Thomas Monjalon <thomas@...jalon.net>,
David Marchand <david.marchand@...hat.com>,
Luca Boccassi <bluca@...ian.org>,
Kevin Traynor <ktraynor@...hat.com>,
Christian Ehrhardt <christian.ehrhardt@...onical.com>,
Xuan Zhuo <xuanzhuo@...ux.alibaba.com>,
Eugenio Pérez <eperezma@...hat.com>,
Xueming Li <xuemingl@...dia.com>,
Maxime Coquelin <maxime.coquelin@...hat.com>,
Chenbo Xia <chenbox@...dia.com>,
Bruce Richardson <bruce.richardson@...el.com>
Cc: kvm@...r.kernel.org,
virtualization@...ts.linux.dev,
netdev@...r.kernel.org,
xiongweimin <xiongweimin@...inos.cn>
Subject: [PATCH 02/10] drivers/infiniband/hw/virtio: add vrdma_exec_verbs_cmd to construct verbs sgs using virtio
From: xiongweimin <xiongweimin@...inos.cn>
The implementation of vrdma_exec_verbs_cmd used a busy-wait loop
with cpu_relax() for command completion, which wastes CPU cycles especially
in process context. This commit introduces a more efficient approach by:
1. Adding a wait queue (ctrl_waitq) and completion flag (ctrl_completed)
to the vrdma_dev structure
2. Using wait_event_timeout for sleeping instead of spinning in non-atomic
contexts
3. Maintaining the original busy-wait behavior for atomic contexts
4. Adding proper locking around the wait mechanism
5. Implementing wakeup in the IRQ handler
This change significantly reduces CPU usage when executing commands from
process context while maintaining compatibility with atomic contexts like
NAPI and workqueues.
Signed-off-by: Xiong Weimin <xiongweimin@...inos.cn>
---
.../drivers/infiniband/hw/virtio/vrdma.h | 3 +-
.../drivers/infiniband/hw/virtio/vrdma_dev.c | 1 +
.../infiniband/hw/virtio/vrdma_dev_api.h | 222 ++++++++++++++++++
.../drivers/infiniband/hw/virtio/vrdma_ib.c | 112 +++++++++
.../drivers/infiniband/hw/virtio/vrdma_ib.h | 2 +
5 files changed, 339 insertions(+), 1 deletion(-)
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma.h
index bc72d9c5e..a646794ef 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma.h
@@ -12,9 +12,10 @@
#include <linux/netdevice.h>
#include <rdma/ib_verbs.h>
#include <linux/spinlock.h>
-#include <linux/atomic.h>
+#include <linux/average.h>
#include <linux/mutex.h>
#include <linux/list.h>
+#include <linux/types.h>
/**
* struct vrdma_dev - Virtual RDMA device structure
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev.c
index 0a09b3bd4..961529b58 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev.c
@@ -6,6 +6,7 @@
#include <linux/virtio_config.h>
#include "vrdma.h"
+#include "vrdma_dev.h"
#include "vrdma_dev_api.h"
#include "vrdma_queue.h"
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
index 403d5e820..3b1f7d2b6 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_dev_api.h
@@ -8,6 +8,8 @@
#include <linux/kernel.h>
#include <linux/types.h>
+#include <linux/u64_stats_sync.h>
+#include <net/xdp.h>
#include <rdma/ib_verbs.h>
#include <rdma/vrdma_abi.h>
@@ -112,5 +114,225 @@ enum vrdma_verbs_cmd {
VIRTIO_RDMA_CMD_REQ_NOTIFY_CQ,
};
+#define VRDMA_CTRL_OK 0
+#define VRDMA_CTRL_ERR 1
+
+/**
+ * struct verbs_ctrl_buf - Control virtqueue command/status header
+ * @cmd: Command opcode, driver to device. The visible opcodes in this
+ *       header are the VIRTIO_RDMA_CMD_* values (enum vrdma_verbs_cmd);
+ *       no VRDMA_VQ_CTRL_* constants exist here — TODO confirm intended
+ *       namespace.
+ * @status: Status written back by the device (VRDMA_CTRL_OK on success,
+ *          non-zero on error).
+ *
+ * @cmd is posted as the first (device-readable) SG entry and @status as
+ * a device-writable SG entry of every control-queue command.
+ */
+struct verbs_ctrl_buf {
+	u8 cmd;
+	u8 status;
+} __packed;
+
+/**
+ * struct vrdma_sq_stats - Statistics for a send queue (transmit path)
+ * @syncp: Synchronization point for 64-bit stats on 32-bit CPUs.
+ * @packets: Number of packets transmitted.
+ * @bytes: Total number of bytes transmitted.
+ * @xdp_tx: Number of XDP frames sent via XDP_TX action.
+ * @xdp_tx_drops: Dropped due to ring full or mapping failure.
+ * @kicks: Number of times the TX virtqueue was kicked.
+ * @tx_timeouts: Number of transmit timeouts detected.
+ *
+ * NOTE(review): this appears to mirror virtio-net's virtnet_sq_stats;
+ * the XDP counters look unrelated to an RDMA verbs driver — confirm
+ * they are actually updated anywhere in this driver.
+ */
+struct vrdma_sq_stats {
+	struct u64_stats_sync syncp;
+	u64 packets;
+	u64 bytes;
+	u64 xdp_tx;
+	u64 xdp_tx_drops;
+	u64 kicks;
+	u64 tx_timeouts;
+};
+
+/**
+ * struct vrdma_rq_stats - Statistics for a receive queue (receive path)
+ * @syncp: Synchronization point for 64-bit stats on 32-bit CPUs.
+ * @packets: Number of packets received.
+ * @bytes: Total number of bytes received.
+ * @drops: Packet drops due to no available buffers.
+ * @xdp_packets: Number of packets processed by XDP.
+ * @xdp_tx: Packets sent back via XDP_TX.
+ * @xdp_redirects: Packets redirected via XDP_REDIRECT.
+ * @xdp_drops: Packets dropped via XDP_DROP or mapping failure.
+ * @kicks: Number of times RQ was kicked after refill.
+ *
+ * NOTE(review): this appears to mirror virtio-net's virtnet_rq_stats;
+ * XDP accounting seems out of scope for an RDMA verbs command patch —
+ * confirm these fields are used.
+ */
+struct vrdma_rq_stats {
+	struct u64_stats_sync syncp;
+	u64 packets;
+	u64 bytes;
+	u64 drops;
+	u64 xdp_packets;
+	u64 xdp_tx;
+	u64 xdp_redirects;
+	u64 xdp_drops;
+	u64 kicks;
+};
+
+/* EWMA: Exponentially Weighted Moving Average for RX packet length */
+DECLARE_EWMA(pkt_len, 0, 64) /* precision = 0 fractional bits, weight reciprocal = 64 */
+
+/**
+ * struct vrdma_send_queue - Internal representation of a TX virtqueue
+ * @vq: The associated virtqueue for sending packets.
+ * @sg: Scatterlist used per transmission (header + linear data + frags).
+ * @name: Human-readable name (e.g., "output.0").
+ * @stats: Transmit statistics (see struct vrdma_sq_stats).
+ * @napi: NAPI context for interrupt moderation and polling.
+ * @reset: True if SQ is undergoing reset/recovery.
+ *
+ * One per transmit queue pair.
+ *
+ * NOTE(review): layout matches virtio-net's send_q (NAPI, MAX_SKB_FRAGS
+ * sizing); verify a netdev-style TX path is really intended for this
+ * RDMA driver.
+ */
+struct vrdma_send_queue {
+	struct virtqueue *vq;
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+	char name[16];
+
+	struct vrdma_sq_stats stats;
+	struct napi_struct napi;
+
+	bool reset;
+} __aligned(64);
+
+/**
+ * struct vrdma_receive_queue - Internal representation of an RX virtqueue
+ * @vq: The associated virtqueue for receiving packets.
+ * @napi: NAPI instance for batched processing of incoming packets.
+ * @xdp_prog: Current XDP BPF program (RCU-protected pointer).
+ * @stats: Receive-side statistics (see struct vrdma_rq_stats).
+ * @pages: Linked list of pages used as packet buffers (via page->private).
+ * @mrg_avg_pkt_len: EWMA of packet length for mergeable buffer sizing.
+ * @alloc_frag: Page fragment allocator for non-mergeable case.
+ * @sg: Scatterlist used during RX submission.
+ * @min_buf_len: Minimum buffer size when using mergeable RX.
+ * @name: Human-readable name (e.g., "input.0").
+ * @xdp_rxq: Metadata for XDP frame reception.
+ *
+ * Maintains state for receiving packets from the host.
+ *
+ * NOTE(review): mirrors virtio-net's receive_q (XDP, mergeable RX
+ * buffers); confirm these paths exist in this RDMA driver rather than
+ * being copied along with the struct.
+ */
+struct vrdma_receive_queue {
+	struct virtqueue *vq;
+	struct napi_struct napi;
+	struct bpf_prog __rcu *xdp_prog;
+	struct vrdma_rq_stats stats;
+
+	struct page *pages;
+	struct ewma_pkt_len mrg_avg_pkt_len;
+	struct page_frag alloc_frag;
+
+	struct scatterlist sg[MAX_SKB_FRAGS + 2];
+	unsigned int min_buf_len;
+	char name[16];
+
+	struct xdp_rxq_info xdp_rxq;
+} __aligned(64);
+
+/**
+ * struct vrdma_info - Main device private data structure
+ * @vdev: Virtio device backing this interface.
+ * @cvq: Control virtqueue (optional, if feature bit set).
+ * @dev: Net device registered to kernel networking stack.
+ * @sq: Array of send queues (size = curr_queue_pairs).
+ * @rq: Array of receive queues (size = curr_queue_pairs).
+ * @status: Current device status (from config space).
+ * @max_queue_pairs: Maximum number of queue pairs supported by host.
+ * @curr_queue_pairs: Currently active queue pairs.
+ * @xdp_queue_pairs: Number of queue pairs dedicated to XDP processing.
+ * @xdp_enabled: Whether XDP is currently active.
+ * @big_packets: Host supports jumbo frames / large MTU.
+ * @big_packets_num_skbfrags: Max SG entries allocated for big packets.
+ * @mergeable_rx_bufs: Host can merge multiple RX buffers into one SKB.
+ * @has_rss: Host supports RSS (Receive Side Scaling).
+ * @has_rss_hash_report: Host provides hash value and type in RX header.
+ * @rss_key_size: Size of RSS key in bytes.
+ * @rss_indir_table_size: Size of indirection table.
+ * @rss_hash_types_supported: Bitmap of supported hash types (TCPV4, UDP6, etc).
+ * @rss_hash_types_saved: User-configured hash types enabled.
+ * @has_cvq: True if control virtqueue is present.
+ * @any_header_sg: Host allows splitting headers across SG elements.
+ * @hdr_len: Size of the transport header (virtio_net_hdr + optional metadata).
+ * @refill: Work item for delayed RX ring refill under memory pressure.
+ * @refill_enabled: Whether delayed refill mechanism is active.
+ * @refill_lock: Spinlock protecting access to refill_enabled.
+ * @config_work: Work item for handling config space changes (e.g., link up/down).
+ * @affinity_hint_set: Whether affinity hints are applied to VQ interrupts.
+ * @node: CPU hotplug notifier node for online events.
+ * @node_dead: CPU hotplug notifier node for dead events.
+ * @ctrl: Pre-allocated control buffer for synchronous CVQ commands.
+ * @duplex: Current duplex setting (from ethtool).
+ * @speed: Current link speed (from ethtool).
+ * @tx_usecs: Interrupt coalescing: TX timer in microseconds.
+ * @rx_usecs: Interrupt coalescing: RX timer in microseconds.
+ * @tx_max_packets: Interrupt coalescing: max packets before IRQ.
+ * @rx_max_packets: Interrupt coalescing: max packets before IRQ.
+ * @guest_offloads: Currently negotiated offload features.
+ * @guest_offloads_capable: Offload capabilities reported by host.
+ * @failover: Failover handle if STANDBY feature is enabled.
+ *
+ * This structure holds all per-device state for the vrdma driver.
+ *
+ * NOTE(review): this is effectively a copy of virtio-net's
+ * virtnet_info (XDP, RSS, net_failover, ethtool coalescing).  Most of
+ * these fields look unrelated to the RDMA verbs command path this
+ * patch adds — confirm they are needed, or trim them.  Also note the
+ * commit message describes a ctrl_waitq/ctrl_completed wait mechanism
+ * that does not appear in this structure.
+ */
+struct vrdma_info {
+	struct virtio_device *vdev;
+	struct virtqueue *cvq;
+	struct net_device *dev;
+	struct vrdma_send_queue *sq;
+	struct vrdma_receive_queue *rq;
+
+	unsigned int status;
+
+	u16 max_queue_pairs;
+	u16 curr_queue_pairs;
+	u16 xdp_queue_pairs;
+	bool xdp_enabled;
+
+	bool big_packets;
+	unsigned int big_packets_num_skbfrags;
+	bool mergeable_rx_bufs;
+
+	bool has_rss;
+	bool has_rss_hash_report;
+	u8 rss_key_size;
+	u16 rss_indir_table_size;
+	u32 rss_hash_types_supported;
+	u32 rss_hash_types_saved;
+
+	bool has_cvq;
+	bool any_header_sg;
+	u8 hdr_len;
+
+	struct delayed_work refill;
+	bool refill_enabled;
+	spinlock_t refill_lock;
+
+	struct work_struct config_work;
+	bool affinity_hint_set;
+
+	struct hlist_node node;
+	struct hlist_node node_dead;
+
+	void *ctrl; /* untyped; presumably points at a verbs_ctrl_buf — TODO confirm and give it a real type */
+
+	/* Ethtool settings */
+	u8 duplex;
+	u32 speed;
+
+	/* Interrupt coalescing */
+	u32 tx_usecs;
+	u32 rx_usecs;
+	u32 tx_max_packets;
+	u32 rx_max_packets;
+
+	unsigned long guest_offloads;
+	unsigned long guest_offloads_capable;
+
+#ifdef CONFIG_NET_FAILOVER
+	struct failover *failover;
+#endif
+} __aligned(64);
+
#endif
\ No newline at end of file
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
index 379bd23d3..825ec58bd 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.c
@@ -49,6 +49,118 @@ static const char * const cmd_str[] = {
[VIRTIO_RDMA_CMD_REQ_NOTIFY_CQ] = "REQ_NOTIFY_CQ",
};
+/**
+ * vrdma_exec_verbs_cmd - Execute a verbs command via the control virtqueue
+ * @vrdev: VRDMA device
+ * @verbs_cmd: Command opcode (VIRTIO_RDMA_CMD_*)
+ * @verbs_in: Optional driver-to-device payload SG list
+ * @verbs_out: Optional device-to-driver payload SG list
+ *
+ * Queues a command header plus optional payload on the control VQ,
+ * kicks the device and busy-waits for completion, bounded by
+ * VRDMA_COMM_TIMEOUT polls, while holding ctrl_lock with IRQs off.
+ *
+ * Context: Callable from process or atomic context (e.g., NAPI,
+ *          workqueue).  Note the wait spins; it never sleeps, despite
+ *          what the commit message suggests.
+ * Return: 0 on success, negative errno on failure.
+ */
+static int vrdma_exec_verbs_cmd(struct vrdma_dev *vrdev, int verbs_cmd,
+				struct scatterlist *verbs_in,
+				struct scatterlist *verbs_out)
+{
+	struct vrdma_info *vrdma_info = netdev_priv(vrdev->netdev);
+	struct virtqueue *vq = vrdev->ctrl_vq;
+	struct verbs_ctrl_buf *ctrl_buf;
+	struct scatterlist hdr_sg, status_sg;
+	struct scatterlist *sgs[4];
+	/*
+	 * out_num counts device-readable ("out") SGs, in_num counts
+	 * device-writable ("in") SGs, matching virtqueue_add_sgs()
+	 * semantics.  The previous code had the two names swapped.
+	 */
+	unsigned int out_num = 1, in_num = 1;
+	unsigned int len;
+	int ret, timeout_loops = VRDMA_COMM_TIMEOUT;
+	unsigned long flags;
+
+	if (unlikely(!vq)) {
+		netdev_err(vrdma_info->dev, "Missing control virtqueue\n");
+		return -EINVAL;
+	}
+
+	ctrl_buf = kmalloc(sizeof(*ctrl_buf), GFP_ATOMIC);
+	if (!ctrl_buf)
+		return -ENOMEM;	/* lock not held yet: must not goto unlock */
+
+	ctrl_buf->cmd = verbs_cmd;
+	ctrl_buf->status = 0xff;	/* poison; status is u8, so ~0U would truncate */
+
+	/* Device-readable SGs first: command header, then optional input */
+	sg_init_one(&hdr_sg, &ctrl_buf->cmd, sizeof(ctrl_buf->cmd));
+	sgs[0] = &hdr_sg;
+
+	if (verbs_in) {
+		sgs[1] = verbs_in;
+		out_num++;
+	}
+
+	/* Device-writable SGs next: status byte, then optional output */
+	sg_init_one(&status_sg, &ctrl_buf->status, sizeof(ctrl_buf->status));
+	sgs[out_num] = &status_sg;
+
+	if (verbs_out) {
+		sgs[out_num + 1] = verbs_out;
+		in_num++;
+	}
+
+	spin_lock_irqsave(&vrdev->ctrl_lock, flags);
+
+	ret = virtqueue_add_sgs(vq, sgs, out_num, in_num, vrdev, GFP_ATOMIC);
+	if (ret) {
+		netdev_err(vrdma_info->dev, "Failed to add cmd %d to CVQ: %d\n",
+			   verbs_cmd, ret);
+		goto unlock;
+	}
+
+	if (unlikely(!virtqueue_kick(vq))) {
+		netdev_err(vrdma_info->dev, "Failed to kick CVQ for cmd %d\n", verbs_cmd);
+		ret = -EIO;
+		goto unlock;
+	}
+
+	/*
+	 * Bounded busy-wait.  IRQs are disabled while ctrl_lock is held,
+	 * so the completion can only be observed by polling the ring here;
+	 * sleeping is not an option in this design.
+	 */
+	ret = -ETIMEDOUT;
+	while (1) {
+		if (virtqueue_get_buf(vq, &len)) {
+			ret = 0;
+			break;
+		}
+		if (unlikely(virtqueue_is_broken(vq))) {
+			netdev_err(vrdma_info->dev, "CVQ is broken\n");
+			ret = -EIO;
+			break;
+		}
+		cpu_relax();
+		if (!--timeout_loops) {
+			netdev_err(vrdma_info->dev, "Timeout waiting for cmd %d response\n",
+				   verbs_cmd);
+			break;
+		}
+	}
+
+unlock:
+	spin_unlock_irqrestore(&vrdev->ctrl_lock, flags);
+
+	/* Log final result */
+	if (ret == 0 && ctrl_buf->status != VRDMA_CTRL_OK) {
+		netdev_err(vrdma_info->dev, "EXEC cmd %s failed: status=%d\n",
+			   cmd_str[verbs_cmd], ctrl_buf->status);
+		ret = -EIO;	/* Host returned an error status */
+	} else if (ret == 0) {
+		netdev_dbg(vrdma_info->dev, "EXEC cmd %s OK\n", cmd_str[verbs_cmd]);
+	} else {
+		netdev_err(vrdma_info->dev, "EXEC cmd %s failed: ret=%d\n",
+			   cmd_str[verbs_cmd], ret);
+	}
+
+	/*
+	 * On timeout the buffer is still owned by the device: a late
+	 * completion would write into freed memory.  Deliberately leak it
+	 * rather than risk use-after-free.
+	 */
+	if (ret != -ETIMEDOUT)
+		kfree(ctrl_buf);
+	return ret;
+}
+
static const struct ib_device_ops virtio_rdma_dev_ops = {
.owner = THIS_MODULE,
.uverbs_abi_ver = VIRTIO_RDMA_ABI_VERSION,
diff --git a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
index 9a7a0a168..bdba5a9de 100644
--- a/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
+++ b/linux-6.16.8/drivers/infiniband/hw/virtio/vrdma_ib.h
@@ -11,6 +11,8 @@
#include <rdma/ib_verbs.h>
#include <rdma/vrdma_abi.h>
+#define VRDMA_COMM_TIMEOUT 1000000
+
enum {
VIRTIO_RDMA_ATOMIC_NONE,
VIRTIO_RDMA_ATOMIC_HCA,
--
2.43.0
Powered by blists - more mailing lists