lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250919213153.103606-20-daniel@iogearbox.net>
Date: Fri, 19 Sep 2025 23:31:52 +0200
From: Daniel Borkmann <daniel@...earbox.net>
To: netdev@...r.kernel.org
Cc: bpf@...r.kernel.org,
	kuba@...nel.org,
	davem@...emloft.net,
	razor@...ckwall.org,
	pabeni@...hat.com,
	willemb@...gle.com,
	sdf@...ichev.me,
	john.fastabend@...il.com,
	martin.lau@...nel.org,
	jordan@...fe.io,
	maciej.fijalkowski@...el.com,
	magnus.karlsson@...el.com,
	David Wei <dw@...idwei.uk>
Subject: [PATCH net-next 19/20] netkit: Add xsk support for af_xdp applications

Enable support for AF_XDP applications to operate on a netkit device.
The goal is that AF_XDP applications can natively consume AF_XDP
from network namespaces. The use-case from Cilium's side is to support
Kubernetes KubeVirt VMs through QEMU's AF_XDP backend. KubeVirt is a
virtual machine management add-on for Kubernetes which aims to provide
a common ground for virtualization. KubeVirt spawns the VMs inside
Kubernetes Pods which reside in their own network namespace just like
regular Pods.

Raw QEMU AF_XDP backend example with eth0 being a physical device with
16 queues where netkit is bound to the last queue (for multi-queue RSS
context can be used if supported by the driver):

  # ethtool -X eth0 start 0 equal 15
  # ethtool -X eth0 start 15 equal 1 context new
  # ethtool --config-ntuple eth0 flow-type ether \
            src 00:00:00:00:00:00 \
            src-mask ff:ff:ff:ff:ff:ff \
            dst $mac dst-mask 00:00:00:00:00:00 \
            proto 0 proto-mask 0xffff action 15
  # ip netns add foo
  # ip link add numrxqueues 2 nk type netkit single
  # ynl-bind eth0 15 nk
  # ip link set nk netns foo
  # ip netns exec foo ip link set lo up
  # ip netns exec foo ip link set nk up
  # ip netns exec foo qemu-system-x86_64 \
          -kernel $kernel \
          -drive file=${image_name},index=0,media=disk,format=raw \
          -append "root=/dev/sda rw console=ttyS0" \
          -cpu host \
          -m $memory \
          -enable-kvm \
          -device virtio-net-pci,netdev=net0,mac=$mac \
          -netdev af-xdp,ifname=nk,id=net0,mode=native,queues=1,start-queue=1,inhibit=on,map-path=$dir/xsks_map \
          -nographic

We have tested the above against a dual-port Nvidia ConnectX-6 (mlx5)
100G NIC with successful network connectivity out of QEMU. An earlier
iteration of this work was presented at LSF/MM/BPF [0].

Signed-off-by: Daniel Borkmann <daniel@...earbox.net>
Co-developed-by: David Wei <dw@...idwei.uk>
Signed-off-by: David Wei <dw@...idwei.uk>
Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0]
---
 drivers/net/netkit.c | 121 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
index 5129b27a7c3c..a1d8a78bab0b 100644
--- a/drivers/net/netkit.c
+++ b/drivers/net/netkit.c
@@ -11,6 +11,7 @@
 
 #include <net/netdev_queues.h>
 #include <net/netdev_rx_queue.h>
+#include <net/xdp_sock_drv.h>
 #include <net/netkit.h>
 #include <net/dst.h>
 #include <net/tcx.h>
@@ -234,6 +235,122 @@ static void netkit_get_stats(struct net_device *dev,
 	stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
 }
 
+/* netkit_xsk() - .ndo_bpf handler for netkit devices.
+ * @dev: netkit device the request was issued against
+ * @xdp: bpf/xsk request descriptor
+ *
+ * Only XDP_SETUP_XSK_POOL is honoured: the request is validated and then
+ * forwarded to the physical device backing the targeted rx queue, with
+ * the queue id translated to the lower device's queue index.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
+{
+	struct netkit *nk = netkit_priv(dev);
+	struct netdev_bpf xdp_lower;
+	struct netdev_rx_queue *rxq;
+	struct net_device *phys;
+
+	switch (xdp->command) {
+	case XDP_SETUP_XSK_POOL:
+		/* XSK binding is only supported for single-device netkit,
+		 * not for netkit device pairs.
+		 */
+		if (nk->pair == NETKIT_DEVICE_PAIR)
+			return -EOPNOTSUPP;
+		if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
+			return -EINVAL;
+
+		/* The targeted queue must already be bound to a physical
+		 * device's rx queue (rxq->peer).
+		 */
+		rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id);
+		if (!rxq->peer)
+			return -EOPNOTSUPP;
+
+		/* The lower device must implement the full set of ops
+		 * needed to drive an AF_XDP socket.
+		 */
+		phys = rxq->peer->dev;
+		if (!phys->netdev_ops->ndo_bpf ||
+		    !phys->netdev_ops->ndo_xdp_xmit ||
+		    !phys->netdev_ops->ndo_xsk_wakeup)
+			return -EOPNOTSUPP;
+
+		/* Forward the request unchanged except for the queue id,
+		 * which is rewritten to the lower device's queue index.
+		 */
+		memcpy(&xdp_lower, xdp, sizeof(xdp_lower));
+		xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->peer);
+		break;
+	case XDP_SETUP_PROG:
+		/* NOTE(review): XDP program attach via ndo_bpf is rejected;
+		 * presumably netkit BPF programs are managed through its own
+		 * program interface instead -- confirm.
+		 */
+		return -EPERM;
+	default:
+		return -EINVAL;
+	}
+
+	/* Only reachable from the XDP_SETUP_XSK_POOL break above, so both
+	 * phys and xdp_lower are initialized here.
+	 */
+	return phys->netdev_ops->ndo_bpf(phys, &xdp_lower);
+}
+
+/* netkit_xsk_wakeup() - .ndo_xsk_wakeup handler for netkit devices.
+ * @dev: netkit device
+ * @queue_id: netkit rx queue the wakeup targets
+ * @flags: wakeup flags, passed through unmodified
+ *
+ * Translates @queue_id to the bound physical device's queue index and
+ * forwards the wakeup to that device's ndo_xsk_wakeup.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
+{
+	struct netdev_rx_queue *rxq;
+	struct net_device *phys;
+
+	if (queue_id >= dev->real_num_rx_queues)
+		return -EINVAL;
+
+	/* The queue must be bound to a physical device's rx queue. */
+	rxq = __netif_get_rx_queue(dev, queue_id);
+	if (!rxq->peer)
+		return -EOPNOTSUPP;
+
+	phys = rxq->peer->dev;
+	if (!phys->netdev_ops->ndo_xsk_wakeup)
+		return -EOPNOTSUPP;
+
+	return phys->netdev_ops->ndo_xsk_wakeup(phys,
+			get_netdev_rx_queue_index(rxq->peer), flags);
+}
+
+/* Whether the physical lower device @dev can back AF_XDP for netkit:
+ * it must implement ndo_bpf, ndo_xdp_xmit and ndo_xsk_wakeup, and must
+ * advertise the complete NETDEV_XDP_ACT_XSK feature set. Always false
+ * when CONFIG_XDP_SOCKETS is disabled.
+ */
+static bool netkit_xdp_supported(const struct net_device *dev)
+{
+	bool xdp_ok = IS_ENABLED(CONFIG_XDP_SOCKETS);
+
+	if (!dev->netdev_ops->ndo_bpf ||
+	    !dev->netdev_ops->ndo_xdp_xmit ||
+	    !dev->netdev_ops->ndo_xsk_wakeup)
+		xdp_ok = false;
+	/* All bits of NETDEV_XDP_ACT_XSK must be present, not just some. */
+	if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK)
+		xdp_ok = false;
+	return xdp_ok;
+}
+
+/* Publish or clear the XSK feature set on the netkit device @dev.
+ * When @xdp_ok, advertise NETDEV_XDP_ACT_XSK along with the aggregated
+ * @xdp_zc_max_segs; otherwise reset to no XDP features and a zero-copy
+ * segment limit of 1.
+ */
+static void netkit_expose_xdp(struct net_device *dev, bool xdp_ok,
+			      u32 xdp_zc_max_segs)
+{
+	if (xdp_ok) {
+		dev->xdp_zc_max_segs = xdp_zc_max_segs;
+		xdp_set_features_flag_locked(dev, NETDEV_XDP_ACT_XSK);
+	} else {
+		dev->xdp_zc_max_segs = 1;
+		xdp_set_features_flag_locked(dev, 0);
+	}
+}
+
+/* Recompute the XSK capabilities exposed by the netkit device @dev from
+ * the physical devices currently bound to its rx queues.
+ *
+ * Iteration starts at queue index 1; queue 0 is presumably the default
+ * netkit queue without a physical backing -- TODO confirm. xdp_ok is
+ * the logical AND of netkit_xdp_supported() over all inspected queues
+ * (seeded at i == 1), and xdp_zc_max_segs is the minimum across their
+ * lower devices.
+ *
+ * @rxq together with @skip_rxq = true excludes one queue (the one
+ * currently being unpeered) from the aggregate.
+ *
+ * NOTE(review): dst_rxq->peer is dereferenced unconditionally for every
+ * non-skipped queue -- assumes callers guarantee all such queues are
+ * peered; confirm.
+ */
+static void netkit_calculate_xdp(struct net_device *dev,
+				 struct netdev_rx_queue *rxq, bool skip_rxq)
+{
+	struct netdev_rx_queue *src_rxq, *dst_rxq;
+	struct net_device *src_dev;
+	u32 xdp_zc_max_segs = ~0;
+	bool xdp_ok = false;
+	int i;
+
+	for (i = 1; i < dev->real_num_rx_queues; i++) {
+		dst_rxq = __netif_get_rx_queue(dev, i);
+		if (dst_rxq == rxq && skip_rxq)
+			continue;
+		src_rxq = dst_rxq->peer;
+		src_dev = src_rxq->dev;
+		xdp_zc_max_segs = min(xdp_zc_max_segs, src_dev->xdp_zc_max_segs);
+		/* Seed with the first queue's result, then AND in the rest. */
+		xdp_ok = netkit_xdp_supported(src_dev) &&
+			 (i == 1 ? true : xdp_ok);
+	}
+
+	netkit_expose_xdp(dev, xdp_ok, xdp_zc_max_segs);
+}
+
+/* .ndo_peer_queues: a queue of @dev was bound to a physical rx queue;
+ * recompute the exposed XSK capabilities including @rxq.
+ */
+static void netkit_peer_queues(struct net_device *dev,
+			       struct netdev_rx_queue *rxq)
+{
+	netkit_calculate_xdp(dev, rxq, false);
+}
+
+/* .ndo_unpeer_queues: @rxq is being unbound from its physical rx queue;
+ * recompute the exposed XSK capabilities with @rxq excluded.
+ */
+static void netkit_unpeer_queues(struct net_device *dev,
+				 struct netdev_rx_queue *rxq)
+{
+	netkit_calculate_xdp(dev, rxq, true);
+}
+
 static void netkit_uninit(struct net_device *dev);
 
 static const struct net_device_ops netkit_netdev_ops = {
@@ -247,6 +364,10 @@ static const struct net_device_ops netkit_netdev_ops = {
 	.ndo_get_peer_dev	= netkit_peer_dev,
 	.ndo_get_stats64	= netkit_get_stats,
 	.ndo_uninit		= netkit_uninit,
+	.ndo_peer_queues	= netkit_peer_queues,
+	.ndo_unpeer_queues	= netkit_unpeer_queues,
+	.ndo_bpf		= netkit_xsk,
+	.ndo_xsk_wakeup		= netkit_xsk_wakeup,
 	.ndo_features_check	= passthru_features_check,
 };
 
-- 
2.43.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ