lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <50d4a072-209e-4751-80c3-1929c536afcb@blackwall.org>
Date: Wed, 22 Oct 2025 17:27:20 +0300
From: Nikolay Aleksandrov <razor@...ckwall.org>
To: Daniel Borkmann <daniel@...earbox.net>, netdev@...r.kernel.org
Cc: bpf@...r.kernel.org, kuba@...nel.org, davem@...emloft.net,
 pabeni@...hat.com, willemb@...gle.com, sdf@...ichev.me,
 john.fastabend@...il.com, martin.lau@...nel.org, jordan@...fe.io,
 maciej.fijalkowski@...el.com, magnus.karlsson@...el.com, dw@...idwei.uk,
 toke@...hat.com, yangzhenze@...edance.com, wangdongdong.6@...edance.com
Subject: Re: [PATCH net-next v3 15/15] netkit: Add xsk support for af_xdp
 applications

On 10/20/25 19:23, Daniel Borkmann wrote:
> Enable support for AF_XDP applications to operate on a netkit device.
> The goal is that AF_XDP applications can natively consume AF_XDP
> from network namespaces. The use-case from Cilium side is to support
> Kubernetes KubeVirt VMs through QEMU's AF_XDP backend. KubeVirt is a
> virtual machine management add-on for Kubernetes which aims to provide
> a common ground for virtualization. KubeVirt spawns the VMs inside
> Kubernetes Pods which reside in their own network namespace just like
> regular Pods.
> 
> Raw QEMU AF_XDP backend example with eth0 being a physical device with
> 16 queues where netkit is bound to the last queue (for multi-queue RSS
> context can be used if supported by the driver):
> 
>   # ethtool -X eth0 start 0 equal 15
>   # ethtool -X eth0 start 15 equal 1 context new
>   # ethtool --config-ntuple eth0 flow-type ether \
>             src 00:00:00:00:00:00 \
>             src-mask ff:ff:ff:ff:ff:ff \
>             dst $mac dst-mask 00:00:00:00:00:00 \
>             proto 0 proto-mask 0xffff action 15
>   [ ... setup BPF/XDP prog on eth0 to steer into shared xsk map ... ]
>   # ip netns add foo
>   # ip link add numrxqueues 2 nk type netkit single
>   # ./pyynl/cli.py --spec ~/netlink/specs/netdev.yaml \
>                    --do bind-queue \
>                    --json "{\"src-ifindex\": $(ifindex eth0), \"src-queue-id\": 15, \
>                             \"dst-ifindex\": $(ifindex nk), \"queue-type\": \"rx\"}"
>   {'dst-queue-id': 1}
>   # ip link set nk netns foo
>   # ip netns exec foo ip link set lo up
>   # ip netns exec foo ip link set nk up
>   # ip netns exec foo qemu-system-x86_64 \
>           -kernel $kernel \
>           -drive file=${image_name},index=0,media=disk,format=raw \
>           -append "root=/dev/sda rw console=ttyS0" \
>           -cpu host \
>           -m $memory \
>           -enable-kvm \
>           -device virtio-net-pci,netdev=net0,mac=$mac \
>           -netdev af-xdp,ifname=nk,id=net0,mode=native,queues=1,start-queue=1,inhibit=on,map-path=$dir/xsks_map \
>           -nographic
> 
> We have tested the above against a dual-port Nvidia ConnectX-6 (mlx5)
> 100G NIC with successful network connectivity out of QEMU. An earlier
> iteration of this work was presented at LSF/MM/BPF [0].
> 
> For getting to a first starting point to connect all things with
> KubeVirt, bind mounting the xsk map from Cilium into the VM launcher
> Pod, which acts as a regular Kubernetes Pod, while not perfect, is not
> a big problem given it's out of reach from the application sitting
> inside the VM (and some of the control plane aspects are baked in
> the launcher Pod already), so the isolation barrier is still the VM.
> Eventually the goal is to have a XDP/XSK redirect extension where
> there is no need to have the xsk map, and the BPF program can just
> derive the target xsk through the queue on which traffic was
> received.
> 
> The exposure through netkit is because Cilium should not act as a
> proxy handing out xsk sockets. Existing applications expect a netdev
> from kernel side and should not need to rewrite just to implement
> against a CNI's protocol. Also, all the memory should not be accounted
> against Cilium but rather the application Pod itself which is consuming
> AF_XDP. Further, on up/downgrades we expect the data plane to be
> completely decoupled from the control plane; if Cilium would own the
> sockets that would be disruptive. Another use-case which opens up and
> is regularly asked from users would be to have DPDK applications on
> top of AF_XDP in regular Kubernetes Pods.
> 
> Signed-off-by: Daniel Borkmann <daniel@...earbox.net>
> Co-developed-by: David Wei <dw@...idwei.uk>
> Signed-off-by: David Wei <dw@...idwei.uk>
> Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0]
> ---
>  drivers/net/netkit.c | 71 +++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 70 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
> index a281b39a1047..f69abe5ec4cd 100644
> --- a/drivers/net/netkit.c
> +++ b/drivers/net/netkit.c
> @@ -12,6 +12,7 @@
>  #include <net/netdev_lock.h>
>  #include <net/netdev_queues.h>
>  #include <net/netdev_rx_queue.h>
> +#include <net/xdp_sock_drv.h>
>  #include <net/netkit.h>
>  #include <net/dst.h>
>  #include <net/tcx.h>
> @@ -235,6 +236,71 @@ static void netkit_get_stats(struct net_device *dev,
>  	stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
>  }
>  
> +static bool netkit_xsk_supported_at_phys(const struct net_device *dev)
> +{
> +	if (!dev->netdev_ops->ndo_bpf ||
> +	    !dev->netdev_ops->ndo_xdp_xmit ||
> +	    !dev->netdev_ops->ndo_xsk_wakeup)
> +		return false;
> +	if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK)
> +		return false;
> +	return true;
> +}
> +
> +static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
> +{
> +	struct netkit *nk = netkit_priv(dev);
> +	struct netdev_bpf xdp_lower;
> +	struct netdev_rx_queue *rxq;
> +	struct net_device *phys;
> +
> +	switch (xdp->command) {
> +	case XDP_SETUP_XSK_POOL:
> +		if (nk->pair == NETKIT_DEVICE_PAIR)
> +			return -EOPNOTSUPP;
> +		if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
> +			return -EINVAL;
> +
> +		rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id);
> +		if (!rxq->peer)
> +			return -EOPNOTSUPP;
> +
> +		phys = rxq->peer->dev;
> +		if (!netkit_xsk_supported_at_phys(phys))
> +			return -EOPNOTSUPP;
> +
> +		memcpy(&xdp_lower, xdp, sizeof(xdp_lower));
> +		xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->peer);
> +		break;
> +	case XDP_SETUP_PROG:
> +		return -EPERM;
> +	default:
> +		return -EINVAL;
> +	}
> +
> +	return phys->netdev_ops->ndo_bpf(phys, &xdp_lower);
> +}
> +
> +static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
> +{
> +	struct netdev_rx_queue *rxq;
> +	struct net_device *phys;
> +
> +	if (queue_id >= dev->real_num_rx_queues)
> +		return -EINVAL;
> +
> +	rxq = __netif_get_rx_queue(dev, queue_id);
> +	if (!rxq->peer)
> +		return -EOPNOTSUPP;
> +
> +	phys = rxq->peer->dev;
> +	if (!netkit_xsk_supported_at_phys(phys))
> +		return -EOPNOTSUPP;
> +
> +	return phys->netdev_ops->ndo_xsk_wakeup(phys,
> +			get_netdev_rx_queue_index(rxq->peer), flags);
> +}
> +
>  static int netkit_init(struct net_device *dev)
>  {
>  	netdev_lockdep_set_classes(dev);
> @@ -255,6 +321,8 @@ static const struct net_device_ops netkit_netdev_ops = {
>  	.ndo_get_peer_dev	= netkit_peer_dev,
>  	.ndo_get_stats64	= netkit_get_stats,
>  	.ndo_uninit		= netkit_uninit,
> +	.ndo_bpf		= netkit_xsk,
> +	.ndo_xsk_wakeup		= netkit_xsk_wakeup,
>  	.ndo_features_check	= passthru_features_check,
>  };
>  
> @@ -409,10 +477,11 @@ static void netkit_setup(struct net_device *dev)
>  	dev->hw_enc_features = netkit_features;
>  	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
>  	dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
> -
>  	dev->needs_free_netdev = true;
>  
>  	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
> +
> +	xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK);
>  }
>  
>  static struct net *netkit_get_link_net(const struct net_device *dev)

Reviewed-by: Nikolay Aleksandrov <razor@...ckwall.org>


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ