[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250919213153.103606-20-daniel@iogearbox.net>
Date: Fri, 19 Sep 2025 23:31:52 +0200
From: Daniel Borkmann <daniel@...earbox.net>
To: netdev@...r.kernel.org
Cc: bpf@...r.kernel.org,
kuba@...nel.org,
davem@...emloft.net,
razor@...ckwall.org,
pabeni@...hat.com,
willemb@...gle.com,
sdf@...ichev.me,
john.fastabend@...il.com,
martin.lau@...nel.org,
jordan@...fe.io,
maciej.fijalkowski@...el.com,
magnus.karlsson@...el.com,
David Wei <dw@...idwei.uk>
Subject: [PATCH net-next 19/20] netkit: Add xsk support for af_xdp applications
Enable support for AF_XDP applications to operate on a netkit device.
The goal is to let AF_XDP applications natively use AF_XDP sockets
from within network namespaces. The use-case from the Cilium side is to support
Kubernetes KubeVirt VMs through QEMU's AF_XDP backend. KubeVirt is a
virtual machine management add-on for Kubernetes which aims to provide
a common ground for virtualization. KubeVirt spawns the VMs inside
Kubernetes Pods which reside in their own network namespace just like
regular Pods.
Raw QEMU AF_XDP backend example with eth0 being a physical device with
16 queues where netkit is bound to the last queue (for multi-queue, an RSS
context can be used if supported by the driver):
# ethtool -X eth0 start 0 equal 15
# ethtool -X eth0 start 15 equal 1 context new
# ethtool --config-ntuple eth0 flow-type ether \
src 00:00:00:00:00:00 \
src-mask ff:ff:ff:ff:ff:ff \
dst $mac dst-mask 00:00:00:00:00:00 \
proto 0 proto-mask 0xffff action 15
# ip netns add foo
# ip link add numrxqueues 2 nk type netkit single
# ynl-bind eth0 15 nk
# ip link set nk netns foo
# ip netns exec foo ip link set lo up
# ip netns exec foo ip link set nk up
# ip netns exec foo qemu-system-x86_64 \
-kernel $kernel \
-drive file=${image_name},index=0,media=disk,format=raw \
-append "root=/dev/sda rw console=ttyS0" \
-cpu host \
-m $memory \
-enable-kvm \
-device virtio-net-pci,netdev=net0,mac=$mac \
-netdev af-xdp,ifname=nk,id=net0,mode=native,queues=1,start-queue=1,inhibit=on,map-path=$dir/xsks_map \
-nographic
We have tested the above against a dual-port Nvidia ConnectX-6 (mlx5)
100G NIC with successful network connectivity out of QEMU. An earlier
iteration of this work was presented at LSF/MM/BPF [0].
Signed-off-by: Daniel Borkmann <daniel@...earbox.net>
Co-developed-by: David Wei <dw@...idwei.uk>
Signed-off-by: David Wei <dw@...idwei.uk>
Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0]
---
drivers/net/netkit.c | 121 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 121 insertions(+)
diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index 5129b27a7c3c..a1d8a78bab0b 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -11,6 +11,7 @@
#include <net/netdev_queues.h>
#include <net/netdev_rx_queue.h>
+#include <net/xdp_sock_drv.h>
#include <net/netkit.h>
#include <net/dst.h>
#include <net/tcx.h>
@@ -234,6 +235,122 @@ static void netkit_get_stats(struct net_device *dev,
stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
}
+/*
+ * ndo_bpf handler: hand an XDP_SETUP_XSK_POOL request over to the device
+ * that owns the rx queue peered with the requested netkit queue.
+ *
+ * The request is copied and its queue id is rewritten to the index of the
+ * peered queue before the lower device's ndo_bpf is invoked.
+ *
+ * Returns -EPERM for XDP_SETUP_PROG and -EINVAL for any other command or
+ * for a queue id at or beyond real_num_rx_queues. Returns -EOPNOTSUPP when
+ * nk->pair is NETKIT_DEVICE_PAIR, when the queue has no peer, or when the
+ * peer's device lacks ndo_bpf, ndo_xdp_xmit or ndo_xsk_wakeup. Otherwise
+ * the return value of the lower device's ndo_bpf is passed through.
+ */
+static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	const struct net_device_ops *lower_ops;
+	struct netkit *nk = netkit_priv(dev);
+	struct netdev_rx_queue *peer_rxq;
+	struct net_device *lower;
+	struct netdev_bpf fwd;
+
+	/* Only XSK pool setup is forwarded; everything else is refused. */
+	if (xdp->command == XDP_SETUP_PROG)
+		return -EPERM;
+	if (xdp->command != XDP_SETUP_XSK_POOL)
+		return -EINVAL;
+
+	if (nk->pair == NETKIT_DEVICE_PAIR)
+		return -EOPNOTSUPP;
+	if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
+		return -EINVAL;
+
+	peer_rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id)->peer;
+	if (!peer_rxq)
+		return -EOPNOTSUPP;
+
+	lower = peer_rxq->dev;
+	lower_ops = lower->netdev_ops;
+	if (!lower_ops->ndo_bpf ||
+	    !lower_ops->ndo_xdp_xmit ||
+	    !lower_ops->ndo_xsk_wakeup)
+		return -EOPNOTSUPP;
+
+	/* Forward a private copy so the caller's request stays untouched. */
+	memcpy(&fwd, xdp, sizeof(fwd));
+	fwd.xsk.queue_id = get_netdev_rx_queue_index(peer_rxq);
+
+	return lower_ops->ndo_bpf(lower, &fwd);
+}
+
+/*
+ * ndo_xsk_wakeup handler: relay the wakeup for @queue_id to the device that
+ * owns the peered rx queue, using that queue's own index.
+ *
+ * Returns -EINVAL if @queue_id is at or beyond real_num_rx_queues,
+ * -EOPNOTSUPP if the queue has no peer or the peer's device has no
+ * ndo_xsk_wakeup, otherwise whatever the lower device's handler returns.
+ */
+static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
+{
+	struct netdev_rx_queue *peer_rxq;
+	struct net_device *lower;
+	u32 lower_qid;
+
+	if (queue_id >= dev->real_num_rx_queues)
+		return -EINVAL;
+
+	peer_rxq = __netif_get_rx_queue(dev, queue_id)->peer;
+	if (!peer_rxq)
+		return -EOPNOTSUPP;
+
+	lower = peer_rxq->dev;
+	if (!lower->netdev_ops->ndo_xsk_wakeup)
+		return -EOPNOTSUPP;
+
+	lower_qid = get_netdev_rx_queue_index(peer_rxq);
+
+	return lower->netdev_ops->ndo_xsk_wakeup(lower, lower_qid, flags);
+}
+
+/*
+ * Report whether @dev can back an XSK socket on behalf of netkit.
+ *
+ * True only when CONFIG_XDP_SOCKETS is enabled, @dev implements ndo_bpf,
+ * ndo_xdp_xmit and ndo_xsk_wakeup, and every bit of NETDEV_XDP_ACT_XSK is
+ * set in dev->xdp_features.
+ */
+static bool netkit_xdp_supported(const struct net_device *dev)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+
+	if (!IS_ENABLED(CONFIG_XDP_SOCKETS))
+		return false;
+	if (!ops->ndo_bpf || !ops->ndo_xdp_xmit || !ops->ndo_xsk_wakeup)
+		return false;
+
+	return (dev->xdp_features & NETDEV_XDP_ACT_XSK) == NETDEV_XDP_ACT_XSK;
+}
+
+/*
+ * Publish the XSK state on @dev.
+ *
+ * When @xdp_ok is true, dev->xdp_zc_max_segs is set to @xdp_zc_max_segs and
+ * the feature flags are set to NETDEV_XDP_ACT_XSK. Otherwise
+ * dev->xdp_zc_max_segs is set to 1 and the feature flags are cleared.
+ */
+static void netkit_expose_xdp(struct net_device *dev, bool xdp_ok,
+			      u32 xdp_zc_max_segs)
+{
+	u32 max_segs = 1;
+	u32 features = 0;
+
+	if (xdp_ok) {
+		max_segs = xdp_zc_max_segs;
+		features = NETDEV_XDP_ACT_XSK;
+	}
+
+	dev->xdp_zc_max_segs = max_segs;
+	xdp_set_features_flag_locked(dev, features);
+}
+
+/*
+ * Recompute the XSK state that @dev exposes from the devices backing its
+ * peered rx queues, then publish it via netkit_expose_xdp().
+ *
+ * Queues 1 .. real_num_rx_queues - 1 are walked; queue 0 is not looked at.
+ * XSK is exposed only if every visited lower device passes
+ * netkit_xdp_supported(); the exposed xdp_zc_max_segs is the minimum over
+ * the visited lower devices. If no queue is visited, XSK is not exposed.
+ *
+ * @dev:      netkit device whose XSK state is recomputed
+ * @rxq:      queue handed in by the peer/unpeer callbacks
+ * @skip_rxq: when true, @rxq is left out of the calculation
+ */
+static void netkit_calculate_xdp(struct net_device *dev,
+				 struct netdev_rx_queue *rxq, bool skip_rxq)
+{
+	struct netdev_rx_queue *src_rxq, *dst_rxq;
+	struct net_device *src_dev;
+	u32 xdp_zc_max_segs = ~0;
+	bool xdp_ok = false;
+	bool seen = false;
+	bool supported;
+	unsigned int i;
+
+	for (i = 1; i < dev->real_num_rx_queues; i++) {
+		dst_rxq = __netif_get_rx_queue(dev, i);
+		if (dst_rxq == rxq && skip_rxq)
+			continue;
+
+		/*
+		 * Same guard as netkit_xsk() and netkit_xsk_wakeup(): a queue
+		 * without a peer has no lower device to inspect.
+		 * NOTE(review): such a queue is skipped here; confirm whether
+		 * it should instead force XSK off for the whole device.
+		 */
+		src_rxq = dst_rxq->peer;
+		if (!src_rxq)
+			continue;
+
+		src_dev = src_rxq->dev;
+		xdp_zc_max_segs = min(xdp_zc_max_segs, src_dev->xdp_zc_max_segs);
+
+		/*
+		 * Seed the result from the first queue actually visited, not
+		 * from i == 1: queue 1 may be the skipped one, and keying on
+		 * the index would then leave xdp_ok stuck at false for all
+		 * remaining queues.
+		 */
+		supported = netkit_xdp_supported(src_dev);
+		xdp_ok = seen ? (xdp_ok && supported) : supported;
+		seen = true;
+	}
+
+	netkit_expose_xdp(dev, xdp_ok, xdp_zc_max_segs);
+}
+
+/*
+ * ndo_peer_queues handler: recompute the exposed XSK state with @rxq taken
+ * into account.
+ */
+static void netkit_peer_queues(struct net_device *dev,
+			       struct netdev_rx_queue *rxq)
+{
+	const bool exclude_rxq = false;
+
+	netkit_calculate_xdp(dev, rxq, exclude_rxq);
+}
+
+/*
+ * ndo_unpeer_queues handler: recompute the exposed XSK state with @rxq left
+ * out of the calculation.
+ */
+static void netkit_unpeer_queues(struct net_device *dev,
+				 struct netdev_rx_queue *rxq)
+{
+	const bool exclude_rxq = true;
+
+	netkit_calculate_xdp(dev, rxq, exclude_rxq);
+}
+
static void netkit_uninit(struct net_device *dev);
static const struct net_device_ops netkit_netdev_ops = {
@@ -247,6 +364,10 @@ static const struct net_device_ops netkit_netdev_ops = {
.ndo_get_peer_dev = netkit_peer_dev,
.ndo_get_stats64 = netkit_get_stats,
.ndo_uninit = netkit_uninit,
+ .ndo_peer_queues = netkit_peer_queues,
+ .ndo_unpeer_queues = netkit_unpeer_queues,
+ .ndo_bpf = netkit_xsk,
+ .ndo_xsk_wakeup = netkit_xsk_wakeup,
.ndo_features_check = passthru_features_check,
};
--
2.43.0
Powered by blists - more mailing lists