Message-ID: <50d4a072-209e-4751-80c3-1929c536afcb@blackwall.org>
Date: Wed, 22 Oct 2025 17:27:20 +0300
From: Nikolay Aleksandrov <razor@...ckwall.org>
To: Daniel Borkmann <daniel@...earbox.net>, netdev@...r.kernel.org
Cc: bpf@...r.kernel.org, kuba@...nel.org, davem@...emloft.net,
pabeni@...hat.com, willemb@...gle.com, sdf@...ichev.me,
john.fastabend@...il.com, martin.lau@...nel.org, jordan@...fe.io,
maciej.fijalkowski@...el.com, magnus.karlsson@...el.com, dw@...idwei.uk,
toke@...hat.com, yangzhenze@...edance.com, wangdongdong.6@...edance.com
Subject: Re: [PATCH net-next v3 15/15] netkit: Add xsk support for af_xdp
applications
On 10/20/25 19:23, Daniel Borkmann wrote:
> Enable support for AF_XDP applications to operate on a netkit device.
> The goal is that AF_XDP applications can natively consume AF_XDP from
> within network namespaces. The use-case from Cilium's side is to support
> Kubernetes KubeVirt VMs through QEMU's AF_XDP backend. KubeVirt is a
> virtual machine management add-on for Kubernetes which aims to provide
> a common ground for virtualization. KubeVirt spawns the VMs inside
> Kubernetes Pods which reside in their own network namespace just like
> regular Pods.
>
> Raw QEMU AF_XDP backend example with eth0 being a physical device with
> 16 queues where netkit is bound to the last queue (for multi-queue RSS
> context can be used if supported by the driver):
>
> # ethtool -X eth0 start 0 equal 15
> # ethtool -X eth0 start 15 equal 1 context new
> # ethtool --config-ntuple eth0 flow-type ether \
> src 00:00:00:00:00:00 \
> src-mask ff:ff:ff:ff:ff:ff \
> dst $mac dst-mask 00:00:00:00:00:00 \
> proto 0 proto-mask 0xffff action 15
> [ ... setup BPF/XDP prog on eth0 to steer into shared xsk map, see
>       the sketch below the example ... ]
> # ip netns add foo
> # ip link add numrxqueues 2 nk type netkit single
> # ./pyynl/cli.py --spec ~/netlink/specs/netdev.yaml \
> --do bind-queue \
> --json "{"src-ifindex": $(ifindex eth0), "src-queue-id": 15, \
> "dst-ifindex": $(ifindex nk), "queue-type": "rx"}"
> {'dst-queue-id': 1}
> # ip link set nk netns foo
> # ip netns exec foo ip link set lo up
> # ip netns exec foo ip link set nk up
> # ip netns exec foo qemu-system-x86_64 \
> -kernel $kernel \
> -drive file=${image_name},index=0,media=disk,format=raw \
> -append "root=/dev/sda rw console=ttyS0" \
> -cpu host \
> -m $memory \
> -enable-kvm \
> -device virtio-net-pci,netdev=net0,mac=$mac \
> -netdev af-xdp,ifname=nk,id=net0,mode=native,queues=1,start-queue=1,inhibit=on,map-path=$dir/xsks_map \
> -nographic
>
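> Not part of this patch, but for illustration, the steering program on
> eth0 could be as minimal as the following sketch (map name, size and
> section names are just assumptions; the map would then be pinned as
> $dir/xsks_map):
>
>   #include <linux/bpf.h>
>   #include <bpf/bpf_helpers.h>
>
>   struct {
>           __uint(type, BPF_MAP_TYPE_XSKMAP);
>           __uint(max_entries, 16);
>           __type(key, __u32);
>           __type(value, __u32);
>   } xsks_map SEC(".maps");
>
>   SEC("xdp")
>   int xsk_steer(struct xdp_md *ctx)
>   {
>           /* Redirect to the XSK bound to this rx queue; pass the
>            * frame on to the regular stack if none is attached.
>            */
>           return bpf_redirect_map(&xsks_map, ctx->rx_queue_index,
>                                   XDP_PASS);
>   }
>
>   char _license[] SEC("license") = "GPL";
>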
> We have tested the above against a dual-port Nvidia ConnectX-6 (mlx5)
> 100G NIC with successful network connectivity out of QEMU. An earlier
> iteration of this work was presented at LSF/MM/BPF [0].
>
> As a first step toward connecting everything with KubeVirt, bind
> mounting the xsk map from Cilium into the VM launcher Pod (which acts
> as a regular Kubernetes Pod), while not perfect, is not a big problem
> given it is out of reach of the application sitting inside the VM (and
> some of the control plane aspects are baked into the launcher Pod
> already), so the isolation barrier is still the VM. Eventually the
> goal is to have an XDP/XSK redirect extension where there is no need
> for the xsk map, and the BPF program can simply derive the target xsk
> from the queue on which the traffic was received.
>
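> To make the map hand-over concrete: a non-QEMU consumer inside the Pod
> could create its socket on the netkit queue and register it in the
> pinned map itself. Untested sketch using libxdp, with umem/ring setup
> omitted and pin path/queue ids taken from the example above:
>
>   #include <errno.h>
>   #include <bpf/bpf.h>
>   #include <xdp/xsk.h>
>
>   static int attach_xsk(struct xsk_socket **xsk, struct xsk_umem *umem,
>                         struct xsk_ring_cons *rx, struct xsk_ring_prod *tx)
>   {
>           /* Do not load an XDP program on nk itself; steering
>            * happens on the physical device.
>            */
>           struct xsk_socket_config cfg = {
>                   .rx_size      = XSK_RING_CONS__DEFAULT_NUM_DESCS,
>                   .tx_size      = XSK_RING_PROD__DEFAULT_NUM_DESCS,
>                   .libxdp_flags = XSK_LIBXDP_FLAGS__INHIBIT_PROG_LOAD,
>           };
>           /* Key must match what the steering program looks up, that
>            * is, the rx queue index on eth0 (15 in the example).
>            */
>           __u32 key = 15;
>           int err, map_fd, sock_fd;
>
>           err = xsk_socket__create(xsk, "nk", 1, umem, rx, tx, &cfg);
>           if (err)
>                   return err;
>           map_fd = bpf_obj_get("/sys/fs/bpf/xsks_map");
>           if (map_fd < 0)
>                   return -errno;
>           sock_fd = xsk_socket__fd(*xsk);
>           return bpf_map_update_elem(map_fd, &key, &sock_fd, 0);
>   }
>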
> The exposure through netkit is because Cilium should not act as a
> proxy handing out xsk sockets. Existing applications expect a netdev
> from the kernel side and should not need to be rewritten just to
> implement against a CNI's protocol. Also, the memory should not be
> accounted against Cilium but rather against the application Pod which
> is consuming AF_XDP. Further, on up/downgrades we expect the data
> plane to be completely decoupled from the control plane; if Cilium
> owned the sockets, that would be disruptive. Another use-case this
> opens up, and one regularly requested by users, is running DPDK
> applications on top of AF_XDP in regular Kubernetes Pods.
>
> Signed-off-by: Daniel Borkmann <daniel@...earbox.net>
> Co-developed-by: David Wei <dw@...idwei.uk>
> Signed-off-by: David Wei <dw@...idwei.uk>
> Link: https://bpfconf.ebpf.io/bpfconf2025/bpfconf2025_material/lsfmmbpf_2025_netkit_borkmann.pdf [0]
> ---
> drivers/net/netkit.c | 71 +++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 70 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c
> index a281b39a1047..f69abe5ec4cd 100644
> --- a/drivers/net/netkit.c
> +++ b/drivers/net/netkit.c
> @@ -12,6 +12,7 @@
> #include <net/netdev_lock.h>
> #include <net/netdev_queues.h>
> #include <net/netdev_rx_queue.h>
> +#include <net/xdp_sock_drv.h>
> #include <net/netkit.h>
> #include <net/dst.h>
> #include <net/tcx.h>
> @@ -235,6 +236,71 @@ static void netkit_get_stats(struct net_device *dev,
> stats->tx_dropped = DEV_STATS_READ(dev, tx_dropped);
> }
>
> +static bool netkit_xsk_supported_at_phys(const struct net_device *dev)
> +{
> + if (!dev->netdev_ops->ndo_bpf ||
> + !dev->netdev_ops->ndo_xdp_xmit ||
> + !dev->netdev_ops->ndo_xsk_wakeup)
> + return false;
> + if ((dev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK)
> + return false;
> + return true;
> +}
> +
> +static int netkit_xsk(struct net_device *dev, struct netdev_bpf *xdp)
> +{
> + struct netkit *nk = netkit_priv(dev);
> + struct netdev_bpf xdp_lower;
> + struct netdev_rx_queue *rxq;
> + struct net_device *phys;
> +
> + switch (xdp->command) {
> + case XDP_SETUP_XSK_POOL:
> + if (nk->pair == NETKIT_DEVICE_PAIR)
> + return -EOPNOTSUPP;
> + if (xdp->xsk.queue_id >= dev->real_num_rx_queues)
> + return -EINVAL;
> +
> + rxq = __netif_get_rx_queue(dev, xdp->xsk.queue_id);
> + if (!rxq->peer)
> + return -EOPNOTSUPP;
> +
> + phys = rxq->peer->dev;
> + if (!netkit_xsk_supported_at_phys(phys))
> + return -EOPNOTSUPP;
> +
> + memcpy(&xdp_lower, xdp, sizeof(xdp_lower));
> + xdp_lower.xsk.queue_id = get_netdev_rx_queue_index(rxq->peer);
> + break;
> + case XDP_SETUP_PROG:
> + return -EPERM;
> + default:
> + return -EINVAL;
> + }
> +
> + return phys->netdev_ops->ndo_bpf(phys, &xdp_lower);
> +}
> +
> +static int netkit_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
> +{
> + struct netdev_rx_queue *rxq;
> + struct net_device *phys;
> +
> + if (queue_id >= dev->real_num_rx_queues)
> + return -EINVAL;
> +
> + rxq = __netif_get_rx_queue(dev, queue_id);
> + if (!rxq->peer)
> + return -EOPNOTSUPP;
> +
> + phys = rxq->peer->dev;
> + if (!netkit_xsk_supported_at_phys(phys))
> + return -EOPNOTSUPP;
> +
> + return phys->netdev_ops->ndo_xsk_wakeup(phys,
> + get_netdev_rx_queue_index(rxq->peer), flags);
> +}
> +
> static int netkit_init(struct net_device *dev)
> {
> netdev_lockdep_set_classes(dev);
> @@ -255,6 +321,8 @@ static const struct net_device_ops netkit_netdev_ops = {
> .ndo_get_peer_dev = netkit_peer_dev,
> .ndo_get_stats64 = netkit_get_stats,
> .ndo_uninit = netkit_uninit,
> + .ndo_bpf = netkit_xsk,
> + .ndo_xsk_wakeup = netkit_xsk_wakeup,
> .ndo_features_check = passthru_features_check,
> };
>
> @@ -409,10 +477,11 @@ static void netkit_setup(struct net_device *dev)
> dev->hw_enc_features = netkit_features;
> dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
> dev->vlan_features = dev->features & ~netkit_features_hw_vlan;
> -
> dev->needs_free_netdev = true;
>
> netif_set_tso_max_size(dev, GSO_MAX_SIZE);
> +
> + xdp_set_features_flag(dev, NETDEV_XDP_ACT_XSK);
> }
>
> static struct net *netkit_get_link_net(const struct net_device *dev)
Reviewed-by: Nikolay Aleksandrov <razor@...ckwall.org>