Date: Tue, 11 Jun 2024 20:42:46 +0000
From: YiFei Zhu <zhuyifei@...gle.com>
To: netdev@...r.kernel.org, bpf@...r.kernel.org
Cc: "Björn Töpel" <bjorn@...nel.org>, Magnus Karlsson <magnus.karlsson@...el.com>, 
	Maciej Fijalkowski <maciej.fijalkowski@...el.com>, Jonathan Lemon <jonathan.lemon@...il.com>, 
	Alexei Starovoitov <ast@...nel.org>, Daniel Borkmann <daniel@...earbox.net>, 
	"David S . Miller" <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>, 
	Jesper Dangaard Brouer <hawk@...nel.org>, John Fastabend <john.fastabend@...il.com>, 
	Andrii Nakryiko <andrii@...nel.org>, Stanislav Fomichev <sdf@...gle.com>, 
	Willem de Bruijn <willemb@...gle.com>
Subject: [RFC PATCH net-next 2/3] selftests/bpf: Add xsk_hw AF_XDP
 functionality test

A UDP packet is sent once from one peer to the other, and the test
verifies that it is received before a timeout. Busy-poll mode is not
tested, since it is harder to write a pass/fail test that verifies
busy-poll mode is working as intended; adding such a test is left as
future work.
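
On the receive side, the pass/fail decision reduces to a poll() with
a timeout followed by a recv(). A minimal sketch of that pattern,
where fd and timeout_sec stand in for the test socket and the
configured receive timeout:

  struct pollfd pfd = { .fd = fd, .events = POLLIN };
  int ret;

  /* Fail the test if nothing arrives before the timeout. */
  ret = poll(&pfd, 1, timeout_sec * 1000);
  if (ret < 0)
          error(1, errno, "poll");
  if (ret == 0)
          error(1, 0, "timeout waiting for packet");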

To test TX and RX independently, the non-XDP side uses AF_PACKET to
send/recv, in order to minimize differences in the packet headers
(as opposed to AF_INET(6), where the kernel's stack constructs the
packet). However, an AF_INET(6) SOCK_DGRAM socket is still created
and bound to the port, in order to mute ICMP port unreachable
messages without having to use iptables.
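
Muting those ICMP errors only takes a bound but otherwise unused
dual-stack datagram socket, roughly as follows (port is the test
port in network byte order):

  /* Unconnected, never-read UDP socket: its only purpose is to own
   * the port, so the kernel does not answer incoming UDP with ICMP
   * port unreachable.
   */
  int udp_fd = socket(PF_INET6, SOCK_DGRAM, 0);
  struct sockaddr_in6 addr = {
          .sin6_family = AF_INET6,
          .sin6_port = port,
          .sin6_addr = in6addr_any,
  };

  if (udp_fd == -1 || bind(udp_fd, (void *)&addr, sizeof(addr)))
          error(1, errno, "udp mute socket");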

Since the setup of AF_XDP may incur variable delays on different
machines, a simple TCP connection is established between client and
server as a synchronization point before the test traffic is sent.
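
The barrier itself is just a connect()/accept() pair. The client
retries on ECONNREFUSED until the server is listening, along these
lines (sketch; fd and addr are set up elsewhere, and the real code
also bounds the number of retries):

  /* Client side of the barrier: returns once the server has reached
   * accept(), i.e. once both peers finished their AF_XDP setup.
   */
  while (connect(fd, (void *)&addr, sizeof(addr)) == -1) {
          if (errno != ECONNREFUSED)
                  error(1, errno, "connect");
          usleep(200 * 1000);     /* server not listening yet */
  }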

I'm also adding a checksum_nofold helper to network_helpers, since
I'm not sure it would be a good idea to rely on hardware offload and
XDP metadata for a basic functionality test. If needed, we can
extend this test to cover checksum offload too.
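
The helper accumulates the checksum without folding; callers fold it
to 16 bits with the existing csum_fold(), e.g. for the IPv4 header
checksum:

  /* One's-complement sum over the header, then fold to 16 bits. */
  iph->check = csum_fold(checksum_nofold(iph, sizeof(*iph), 0));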

Signed-off-by: YiFei Zhu <zhuyifei@...gle.com>
Signed-off-by: Willem de Bruijn <willemb@...gle.com>
---
 tools/testing/selftests/bpf/.gitignore        |   1 +
 tools/testing/selftests/bpf/Makefile          |   7 +-
 tools/testing/selftests/bpf/network_helpers.h |  14 +
 tools/testing/selftests/bpf/progs/xsk_hw.c    |  72 ++
 tools/testing/selftests/bpf/xsk_hw.c          | 844 ++++++++++++++++++
 5 files changed, 937 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/progs/xsk_hw.c
 create mode 100644 tools/testing/selftests/bpf/xsk_hw.c

diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore
index 5025401323af..1065e4b508c4 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -47,6 +47,7 @@ test_cpp
 *.ko
 *.tmp
 xskxceiver
+xsk_hw
 xdp_redirect_multi
 xdp_synproxy
 xdp_hw_metadata
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index e0b3887b3d2d..854979898ce0 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -144,7 +144,7 @@ TEST_GEN_PROGS_EXTENDED = test_skb_cgroup_id_user \
 	flow_dissector_load test_flow_dissector test_tcp_check_syncookie_user \
 	test_lirc_mode2_user xdping test_cpp runqslower bench bpf_testmod.ko \
 	xskxceiver xdp_redirect_multi xdp_synproxy veristat xdp_hw_metadata \
-	xdp_features bpf_test_no_cfi.ko
+	xdp_features bpf_test_no_cfi.ko xsk_hw
 
 TEST_GEN_FILES += liburandom_read.so urandom_read sign-file uprobe_multi
 
@@ -476,6 +476,7 @@ test_usdt.skel.h-deps := test_usdt.bpf.o test_usdt_multispec.bpf.o
 xsk_xdp_progs.skel.h-deps := xsk_xdp_progs.bpf.o
 xdp_hw_metadata.skel.h-deps := xdp_hw_metadata.bpf.o
 xdp_features.skel.h-deps := xdp_features.bpf.o
+xsk_hw.skel.h-deps := xsk_hw.bpf.o
 
 LINKED_BPF_SRCS := $(patsubst %.bpf.o,%.c,$(foreach skel,$(LINKED_SKELS),$($(skel)-deps)))
 
@@ -710,6 +711,10 @@ $(OUTPUT)/xdp_features: xdp_features.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xdp
 	$(call msg,BINARY,,$@)
 	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
 
+$(OUTPUT)/xsk_hw: xsk_hw.c $(OUTPUT)/network_helpers.o $(OUTPUT)/xsk.o $(OUTPUT)/xsk_hw.skel.h | $(OUTPUT)
+	$(call msg,BINARY,,$@)
+	$(Q)$(CC) $(CFLAGS) $(filter %.a %.o %.c,$^) $(LDLIBS) -o $@
+
 # Make sure we are able to include and link libbpf against c++.
 $(OUTPUT)/test_cpp: test_cpp.cpp $(OUTPUT)/test_core_extern.skel.h $(BPFOBJ)
 	$(call msg,CXX,,$@)
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index b09c3bbd5b62..732f07486162 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -85,6 +85,20 @@ struct nstoken *open_netns(const char *name);
 void close_netns(struct nstoken *token);
 int send_recv_data(int lfd, int fd, uint32_t total_bytes);
 
+static inline __wsum checksum_nofold(const void *data, size_t len, __wsum sum)
+{
+	const uint16_t *words = (const uint16_t *)data;
+	int i;
+
+	for (i = 0; i < len / 2; i++)
+		sum += words[i];
+
+	if (len & 1)
+		sum += ((const unsigned char *)data)[len - 1];
+
+	return sum;
+}
+
 static __u16 csum_fold(__u32 csum)
 {
 	csum = (csum & 0xffff) + (csum >> 16);
diff --git a/tools/testing/selftests/bpf/progs/xsk_hw.c b/tools/testing/selftests/bpf/progs/xsk_hw.c
new file mode 100644
index 000000000000..8009267ddc33
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/xsk_hw.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/bpf.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/in.h>
+#include <linux/udp.h>
+#include <stdbool.h>
+
+#include <bpf/bpf_helpers.h>
+#include <bpf/bpf_endian.h>
+
+struct {
+	__uint(type, BPF_MAP_TYPE_XSKMAP);
+	__uint(max_entries, 256);
+	__type(key, __u32);
+	__type(value, __u32);
+} xsk SEC(".maps");
+
+__u16 port;
+bool should_rx;
+
+SEC("xdp")
+int rx(struct xdp_md *ctx)
+{
+	void *data, *data_end;
+	struct ipv6hdr *ip6h;
+	struct ethhdr *eth;
+	struct iphdr *iph;
+	struct udphdr *uh;
+
+	if (!should_rx)
+		return XDP_PASS;
+
+	data = (void *)(long)ctx->data;
+	data_end = (void *)(long)ctx->data_end;
+
+	eth = data;
+	data = eth + 1;
+	if (data > data_end)
+		return XDP_PASS;
+
+	if (eth->h_proto == bpf_htons(ETH_P_IP)) {
+		iph = data;
+		data = iph + 1;
+		if (data > data_end)
+			return XDP_PASS;
+		if (iph->protocol != IPPROTO_UDP)
+			return XDP_PASS;
+	} else if (eth->h_proto == bpf_htons(ETH_P_IPV6)) {
+		ip6h = data;
+		data = ip6h + 1;
+		if (data > data_end)
+			return XDP_PASS;
+		if (ip6h->nexthdr != IPPROTO_UDP)
+			return XDP_PASS;
+	} else {
+		return XDP_PASS;
+	}
+
+	uh = data;
+	data = uh + 1;
+	if (data > data_end)
+		return XDP_PASS;
+	if (uh->dest != port)
+		return XDP_PASS;
+
+	return bpf_redirect_map(&xsk, ctx->rx_queue_index, XDP_PASS);
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/tools/testing/selftests/bpf/xsk_hw.c b/tools/testing/selftests/bpf/xsk_hw.c
new file mode 100644
index 000000000000..dd6fc7b562ba
--- /dev/null
+++ b/tools/testing/selftests/bpf/xsk_hw.c
@@ -0,0 +1,844 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/* XSK basic regression test
+ *
+ * Exercise AF_XDP (XSK) sockets in all modes
+ * - skb copy
+ * - drv copy
+ * - drv zerocopy
+ *
+ * Run:
+ *
+ * server: ./xsk_hw -i $DEV -[46] -D $CLIENT_ADDR -S $SERVER_ADDR -m $LOCAL_MAC -M $GW_MAC -h $ARGS
+ * client: ./xsk_hw -i $DEV -[46] -D $SERVER_ADDR -S $CLIENT_ADDR -m $LOCAL_MAC -M $GW_MAC $ARGS
+ *
+ * Args:
+ *
+ * - ``: no args: minimal connectivity sanity test using PF_PACKET
+ *
+ * - `-T -s -c`: test transmit, skb copy mode
+ * - `-T -d -c`: test transmit, driver copy mode
+ * - `-T -d -z`: test transmit, driver zerocopy mode
+ *
+ * - `-R -s -c`: receive, skb copy mode
+ * - `-R -d -c`: receive, driver copy mode
+ * - `-R -d -z`: receive, driver zerocopy mode
+ */
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <error.h>
+#include <linux/errqueue.h>
+#include <linux/ethtool.h>
+#include <linux/filter.h>
+#include <linux/if_ether.h>
+#include <linux/if_link.h>
+#include <linux/if_packet.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/net_tstamp.h>
+#include <linux/sockios.h>
+#include <linux/udp.h>
+#include <limits.h>
+#include <net/if.h>
+#include <poll.h>
+#include <signal.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <uapi/linux/filter.h>
+#include <unistd.h>
+
+#include <network_helpers.h>
+
+#include "xsk.h"
+#include "xsk_hw.skel.h"
+
+static int cfg_addr_len;
+static void *cfg_daddr, *cfg_saddr;
+static struct in_addr cfg_daddr4, cfg_saddr4;
+static struct in6_addr cfg_daddr6, cfg_saddr6;
+static uint16_t cfg_eth_proto;
+static int cfg_family = PF_UNSPEC;
+
+static bool cfg_host_run;
+static char *cfg_ifname = "eth0";
+static int cfg_ifindex;
+static bool cfg_fill_after_bind;
+static char *cfg_mac_dst, *cfg_mac_src;
+static int cfg_num_rxq;
+static uint16_t cfg_port = __constant_htons(8000);
+static int cfg_pkt_len;
+static const char cfg_payload[] = "aaaaaaaa";
+static int cfg_rcvtimeo = 10;
+static int cfg_send_queue_id;
+static __u32 cfg_xdp_flags = XDP_FLAGS_REPLACE;
+static __u16 cfg_xdp_bind_flags;
+static bool cfg_xdp_rx;
+static bool cfg_xdp_tx;
+static bool cfg_xdp_tx_force_attach;
+
+/* Constants usable in static array allocation; const int is not
+ * sufficient, as a const-qualified variable is not a constant expression.
+ */
+enum {
+	pkt_len_l4 = sizeof(struct udphdr) + sizeof(cfg_payload),
+	pkt_len_v4 = ETH_HLEN + sizeof(struct iphdr) + pkt_len_l4,
+	pkt_len_v6 = ETH_HLEN + sizeof(struct ipv6hdr) + pkt_len_l4,
+};
+
+static char pkt[pkt_len_v6];
+
+#define UMEM_NUM 8192
+#define UMEM_QLEN (UMEM_NUM / 2)
+#define UMEM_FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
+#define UMEM_SIZE (UMEM_FRAME_SIZE * UMEM_NUM)
+
+struct xsk {
+	void *umem_area;
+	struct xsk_umem *umem;
+	struct xsk_ring_prod fill;
+	struct xsk_ring_cons comp;
+	struct xsk_ring_prod tx;
+	struct xsk_ring_cons rx;
+	struct xsk_socket *socket;
+	__u32 tx_head;
+};
+
+static struct xsk_hw *bpf_obj;
+static struct xsk *xsks;
+
+static int pfpacket_fd;
+static int udp_fd;
+
+static void init_pkt_ipv4(struct iphdr *iph)
+{
+	struct udphdr *uh;
+
+	iph->version = 4;
+	iph->ihl = 5;
+	iph->protocol = IPPROTO_UDP;
+	iph->tot_len = htons(sizeof(*iph) + sizeof(*uh) + sizeof(cfg_payload));
+	iph->ttl = 64;
+	iph->daddr = cfg_daddr4.s_addr;
+	iph->saddr = cfg_saddr4.s_addr;
+	iph->check = csum_fold(checksum_nofold(iph, sizeof(*iph), 0));
+}
+
+static void init_pkt_ipv6(struct ipv6hdr *ip6h)
+{
+	struct udphdr *uh;
+
+	ip6h->version = 6;
+	ip6h->payload_len = htons(sizeof(*uh) + sizeof(cfg_payload));
+	ip6h->nexthdr = IPPROTO_UDP;
+	ip6h->hop_limit = 64;
+	ip6h->daddr = cfg_daddr6;
+	ip6h->saddr = cfg_saddr6;
+}
+
+static void init_pkt(void)
+{
+	struct ipv6hdr *ip6h = NULL;
+	struct iphdr *iph = NULL;
+	struct ethhdr *eth;
+	struct udphdr *uh;
+	__wsum check;
+
+	/* init mac header */
+	eth = (void *)&pkt;
+	if (sscanf(cfg_mac_dst, "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx",
+		   &eth->h_dest[0], &eth->h_dest[1], &eth->h_dest[2],
+		   &eth->h_dest[3], &eth->h_dest[4], &eth->h_dest[5]) != 6)
+		error(1, 0, "sscanf mac dst ('-M')\n");
+	if (sscanf(cfg_mac_src, "%02hhx:%02hhx:%02hhx:%02hhx:%02hhx:%02hhx",
+		   &eth->h_source[0], &eth->h_source[1], &eth->h_source[2],
+		   &eth->h_source[3], &eth->h_source[4], &eth->h_source[5]) != 6)
+		error(1, 0, "sscanf mac src ('-m')\n");
+	eth->h_proto = htons(cfg_eth_proto);
+
+	if (cfg_family == PF_INET) {
+		iph = (void *)(eth + 1);
+		uh = (void *)(iph + 1);
+		init_pkt_ipv4(iph);
+	} else {
+		ip6h = (void *)(eth + 1);
+		uh = (void *)(ip6h + 1);
+		init_pkt_ipv6(ip6h);
+	}
+
+	/* init udp header */
+	uh->source = cfg_port;
+	uh->dest = cfg_port;
+	uh->len = htons(sizeof(*uh) + sizeof(cfg_payload));
+	uh->check = 0;
+
+	/* init payload */
+	memcpy(uh + 1, cfg_payload, sizeof(cfg_payload));
+
+	/* udp checksum */
+	check = checksum_nofold(uh, sizeof(*uh) + sizeof(cfg_payload), 0);
+	if (ip6h)
+		uh->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
+					    ntohs(uh->len), IPPROTO_UDP,
+					    check);
+	else
+		uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
+					      ntohs(uh->len), IPPROTO_UDP,
+					      check);
+}
+
+static void *verify_pkt_ipv4(void *data, void *data_end)
+{
+	struct iphdr *iph = data;
+
+	data = iph + 1;
+	if (data > data_end)
+		return NULL;
+
+	if (iph->protocol != IPPROTO_UDP)
+		return NULL;
+
+	return data;
+}
+
+static void *verify_pkt_ipv6(void *data, void *data_end)
+{
+	struct ipv6hdr *ip6h = data;
+
+	data = ip6h + 1;
+	if (data > data_end)
+		return NULL;
+
+	if (ip6h->nexthdr != IPPROTO_UDP)
+		return NULL;
+
+	return data;
+}
+
+static void verify_pkt(void *data, size_t len)
+{
+	void *data_end = data + len;
+	struct ethhdr *eth;
+	struct udphdr *uh;
+
+	eth = data;
+	data = eth + 1;
+	if (data > data_end)
+		goto bad;
+	if (eth->h_proto != htons(cfg_eth_proto))
+		goto bad;
+
+	if (cfg_family == PF_INET)
+		data = verify_pkt_ipv4(data, data_end);
+	else
+		data = verify_pkt_ipv6(data, data_end);
+	if (!data)
+		goto bad;
+
+	uh = data;
+	data = uh + 1;
+	if (data > data_end)
+		goto bad;
+	if (uh->dest != cfg_port)
+		goto bad;
+
+	if (data_end - data != sizeof(cfg_payload))
+		goto bad;
+	if (memcmp(data, cfg_payload, sizeof(cfg_payload)))
+		goto bad;
+
+	return;
+bad:
+	error(1, 0, "bad packet content");
+}
+
+static void udp_bind(void)
+{
+	/* Dual-stack, as not enabling IPV6_V6ONLY */
+	struct sockaddr_in6 ip6addr = {
+		.sin6_family = AF_INET6,
+		.sin6_port = cfg_port,
+		.sin6_addr = in6addr_any,
+	};
+
+	if (bind(udp_fd, (void *)&ip6addr, sizeof(ip6addr)) == -1)
+		error(1, errno, "udp bind");
+}
+
+static void pfpacket_setfilter_ipproto(void)
+{
+	int off_proto, off_port;
+
+	if (cfg_family == PF_INET) {
+		off_proto = ETH_HLEN + offsetof(struct iphdr, protocol);
+		off_port = ETH_HLEN + sizeof(struct iphdr) + offsetof(struct udphdr, dest);
+	} else {
+		off_proto = ETH_HLEN + offsetof(struct ipv6hdr, nexthdr);
+		off_port = ETH_HLEN + sizeof(struct ipv6hdr) + offsetof(struct udphdr, dest);
+	}
+
+	struct sock_filter filter[] = {
+		BPF_STMT(BPF_LD  + BPF_B   + BPF_ABS, SKF_AD_OFF + SKF_AD_PKTTYPE),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, PACKET_HOST, 0, 5),
+		BPF_STMT(BPF_LD  + BPF_B   + BPF_ABS, off_proto),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, IPPROTO_UDP, 0, 3),
+		BPF_STMT(BPF_LD  + BPF_H   + BPF_ABS, off_port),
+		BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, ntohs(cfg_port), 0, 1),
+		BPF_STMT(BPF_RET + BPF_K, 0xFFFF),
+		BPF_STMT(BPF_RET + BPF_K, 0),
+	};
+	struct sock_fprog prog = {};
+
+	prog.filter = filter;
+	prog.len = sizeof(filter) / sizeof(struct sock_filter);
+	if (setsockopt(pfpacket_fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog)))
+		error(1, errno, "setsockopt filter");
+}
+
+static void pfpacket_bind(void)
+{
+	struct sockaddr_ll laddr = {
+		.sll_family = AF_PACKET,
+		.sll_protocol = cfg_xdp_rx ? 0 : htons(cfg_eth_proto),
+		.sll_ifindex = cfg_ifindex,
+	};
+
+	if (bind(pfpacket_fd, (void *)&laddr, sizeof(laddr)) == -1)
+		error(1, errno, "pfpacket bind");
+}
+
+static int fill_xsk(struct xsk *xsk)
+{
+	__u64 addr;
+	__u32 idx;
+	int i;
+
+	/* returns either 0 on failure or second arg, UMEM_QLEN */
+	if (!xsk_ring_prod__reserve(&xsk->fill, UMEM_QLEN, &idx))
+		return -ENOMEM;
+
+	for (i = 0; i < UMEM_QLEN; i++) {
+		addr = (UMEM_QLEN + i) * UMEM_FRAME_SIZE;
+		*xsk_ring_prod__fill_addr(&xsk->fill, i) = addr;
+	}
+	xsk_ring_prod__submit(&xsk->fill, UMEM_QLEN);
+
+	return 0;
+}
+
+static int open_xsk(struct xsk *xsk, __u32 queue_id)
+{
+	const int mmap_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
+	const struct xsk_socket_config socket_config = {
+		.rx_size = UMEM_QLEN,
+		.tx_size = UMEM_QLEN,
+		.bind_flags = cfg_xdp_bind_flags,
+	};
+	const struct xsk_umem_config umem_config = {
+		.fill_size = UMEM_QLEN,
+		.comp_size = UMEM_QLEN,
+		.frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
+	};
+	__u64 addr;
+	int ret;
+	int i;
+
+	xsk->umem_area = mmap(NULL, UMEM_SIZE, PROT_READ | PROT_WRITE, mmap_flags, -1, 0);
+	if (xsk->umem_area == MAP_FAILED)
+		return -ENOMEM;
+
+	ret = xsk_umem__create(&xsk->umem,
+			       xsk->umem_area, UMEM_SIZE,
+			       &xsk->fill,
+			       &xsk->comp,
+			       &umem_config);
+	if (ret)
+		return ret;
+
+	ret = xsk_socket__create(&xsk->socket, cfg_ifindex, queue_id,
+				 xsk->umem,
+				 &xsk->rx,
+				 &xsk->tx,
+				 &socket_config);
+	if (ret)
+		return ret;
+
+	/* First half of umem is for TX. This way address matches 1-to-1
+	 * to the completion queue index.
+	 */
+
+	for (i = 0; i < UMEM_QLEN; i++) {
+		addr = i * UMEM_FRAME_SIZE;
+		memcpy(xsk_umem__get_data(xsk->umem_area, addr),
+		       pkt, cfg_pkt_len);
+	}
+
+	/* Second half of umem is for RX. */
+	if (!cfg_fill_after_bind) {
+		if (fill_xsk(xsk))
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void release_tx(struct xsk *xsk)
+{
+	__u32 idx = 0;
+	unsigned int n;
+
+	n = xsk_ring_cons__peek(&xsk->comp, XSK_RING_CONS__DEFAULT_NUM_DESCS, &idx);
+	if (n)
+		xsk_ring_cons__release(&xsk->comp, n);
+}
+
+static void send_xsk(void)
+{
+	struct xsk *xsk = &xsks[cfg_send_queue_id];
+	struct xdp_desc *desc;
+	__u32 idx;
+
+	release_tx(xsk);
+	if (xsk_ring_prod__reserve(&xsk->tx, 1, &idx) != 1)
+		error(1, 0, "TX ring is full");
+
+	desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
+	desc->addr = (xsk->tx_head++ % UMEM_QLEN) * UMEM_FRAME_SIZE;
+	desc->len = cfg_pkt_len;
+
+	xsk_ring_prod__submit(&xsk->tx, 1);
+	sendto(xsk_socket__fd(xsk->socket), NULL, 0, MSG_DONTWAIT, NULL, 0);
+}
+
+static void refill_rx(struct xsk *xsk, __u64 addr)
+{
+	__u32 idx;
+
+	if (xsk_ring_prod__reserve(&xsk->fill, 1, &idx) == 1) {
+		*xsk_ring_prod__fill_addr(&xsk->fill, idx) = addr;
+		xsk_ring_prod__submit(&xsk->fill, 1);
+	}
+}
+
+static void recv_xsk(void)
+{
+	const struct xdp_desc *desc;
+	struct pollfd fds[cfg_num_rxq];
+	__u64 comp_addr;
+	__u64 addr;
+	__u32 idx;
+	int ret;
+	int i;
+
+	for (i = 0; i < cfg_num_rxq; i++) {
+		fds[i].fd = xsk_socket__fd(xsks[i].socket);
+		fds[i].events = POLLIN;
+		fds[i].revents = 0;
+	}
+
+	ret = poll(fds, cfg_num_rxq, cfg_rcvtimeo * 1000);
+	if (ret < 0)
+		error(1, errno, "poll");
+	if (ret == 0)
+		error(1, 0, "%s: Timeout", __func__);
+
+	for (i = 0; i < cfg_num_rxq; i++) {
+		if (fds[i].revents == 0)
+			continue;
+
+		struct xsk *xsk = &xsks[i];
+
+		/* Reading one packet at a time, because we expect only one
+		 * packet outstanding per queue at a time due to test doing
+		 * single connection request/response
+		 */
+		ret = xsk_ring_cons__peek(&xsk->rx, 1, &idx);
+		if (ret != 1)
+			continue;
+
+		desc = xsk_ring_cons__rx_desc(&xsk->rx, idx);
+		comp_addr = xsk_umem__extract_addr(desc->addr);
+		addr = xsk_umem__add_offset_to_addr(desc->addr);
+		verify_pkt(xsk_umem__get_data(xsk->umem_area, addr), desc->len);
+		xsk_ring_cons__release(&xsk->rx, 1);
+		refill_rx(xsk, comp_addr);
+	}
+}
+
+static void send_pfpacket(void)
+{
+	int ret;
+
+	ret = write(pfpacket_fd, pkt, cfg_pkt_len);
+	if (ret == -1)
+		error(1, errno, "write");
+	if (ret != cfg_pkt_len)
+		error(1, 0, "write pkt: %uB != %uB", ret, cfg_pkt_len);
+}
+
+static void recv_pfpacket(void)
+{
+	static char recv_pkt[sizeof(pkt)];
+	struct pollfd fds = {
+		.fd = pfpacket_fd,
+		.events = POLLIN,
+	};
+	int ret, pkt_len;
+
+	ret = poll(&fds, 1, cfg_rcvtimeo * 1000);
+	if (ret < 0)
+		error(1, errno, "poll");
+	if (ret == 0)
+		error(1, 0, "%s: Timeout", __func__);
+
+	pkt_len = cfg_family == PF_INET ? pkt_len_v4 : pkt_len_v6;
+
+	ret = recv(pfpacket_fd, recv_pkt, sizeof(pkt), MSG_TRUNC);
+	if (ret == -1)
+		error(1, errno, "recv");
+	if (ret != pkt_len)
+		error(1, 0, "recv pkt: %uB != %uB\n", ret, pkt_len);
+
+	verify_pkt(recv_pkt, ret);
+}
+
+static void do_send(void)
+{
+	if (cfg_xdp_tx)
+		send_xsk();
+	else
+		send_pfpacket();
+}
+
+static void do_recv(void)
+{
+	if (cfg_xdp_rx)
+		recv_xsk();
+	else
+		recv_pfpacket();
+}
+
+static bool link_is_down(void)
+{
+	char path[PATH_MAX];
+	FILE *file;
+	char status;
+
+	snprintf(path, PATH_MAX, "/sys/class/net/%s/carrier", cfg_ifname);
+	file = fopen(path, "r");
+	if (!file)
+		error(1, errno, "%s", path);
+
+	if (fread(&status, 1, 1, file) != 1)
+		error(1, errno, "fread");
+
+	fclose(file);
+
+	return status == '0';
+}
+
+static void do_sync_client(void)
+{
+	struct sockaddr_in ip4addr = {
+		.sin_family = AF_INET,
+		.sin_port = cfg_port,
+		.sin_addr = cfg_daddr4,
+	};
+	struct sockaddr_in6 ip6addr = {
+		.sin6_family = AF_INET6,
+		.sin6_port = cfg_port,
+		.sin6_addr = cfg_daddr6,
+	};
+	const int retry_sleep_ms = 200;
+	const int retries_per_sec = 1000 / retry_sleep_ms;
+	const int max_retries = cfg_rcvtimeo * retries_per_sec;
+	int fd, ret, retries = 0;
+
+	fd = socket(cfg_family, SOCK_STREAM, 0);
+	if (fd == -1)
+		error(1, errno, "socket sync client");
+
+	/* If the client calls connect before the server listens,
+	 * the connection will fail immediately and the call returns
+	 * with ECONNREFUSED. Retry up to cfg_rcvtimeo.
+	 */
+	while (true) {
+		if (cfg_family == PF_INET)
+			ret = connect(fd, (void *)&ip4addr, sizeof(ip4addr));
+		else
+			ret = connect(fd, (void *)&ip6addr, sizeof(ip6addr));
+
+		if (ret == -1 && errno != ECONNREFUSED)
+			error(1, errno, "connect sync client");
+		if (ret == 0)
+			break;
+		retries++;
+		usleep(retry_sleep_ms * 1000);
+		if (retries == max_retries)
+			error(1, 0, "connect sync client: max_retries");
+	}
+
+	if (close(fd))
+		error(1, errno, "close sync client");
+}
+
+static void do_sync_server(void)
+{
+	int fdl, fdc;
+	int *fds;
+
+	/* Dual-stack, as not enabling IPV6_V6ONLY */
+	fds = start_reuseport_server(AF_INET6, SOCK_STREAM, "::",
+				     ntohs(cfg_port), cfg_rcvtimeo * 1000, 1);
+	if (!fds)
+		error(1, errno, "start_server");
+	fdl = *fds;
+
+	fdc = accept(fdl, NULL, NULL);
+	if (fdc == -1)
+		error(1, errno, "accept sync");
+
+	if (close(fdc))
+		error(1, errno, "close sync child");
+	if (close(fdl))
+		error(1, errno, "close sync listener");
+}
+
+static void sync_barrier(void)
+{
+	if (cfg_host_run)
+		do_sync_server();
+	else
+		do_sync_client();
+}
+
+static void cleanup(void)
+{
+	LIBBPF_OPTS(bpf_xdp_attach_opts, opts);
+
+	if (bpf_obj) {
+		opts.old_prog_fd = bpf_program__fd(bpf_obj->progs.rx);
+		if (opts.old_prog_fd >= 0)
+			bpf_xdp_detach(cfg_ifindex, cfg_xdp_flags, &opts);
+	}
+}
+
+static void setup_for_ipv4(void)
+{
+	cfg_family = PF_INET;
+	cfg_eth_proto = ETH_P_IP;
+	cfg_addr_len = sizeof(struct in_addr);
+	cfg_pkt_len = pkt_len_v4;
+	cfg_daddr = &cfg_daddr4;
+	cfg_saddr = &cfg_saddr4;
+}
+
+static void setup_for_ipv6(void)
+{
+	cfg_family = PF_INET6;
+	cfg_eth_proto = ETH_P_IPV6;
+	cfg_addr_len = sizeof(struct in6_addr);
+	cfg_pkt_len = pkt_len_v6;
+	cfg_daddr = &cfg_daddr6;
+	cfg_saddr = &cfg_saddr6;
+}
+
+static void parse_opts(int argc, char **argv)
+{
+	char *daddr = NULL, *saddr = NULL;
+	int c;
+
+	while ((c = getopt(argc, argv, "46cD:dfhi:m:M:p:q:RS:sTz")) != -1) {
+		switch (c) {
+		case '4':
+			setup_for_ipv4();
+			break;
+		case '6':
+			setup_for_ipv6();
+			break;
+		case 'c':
+			cfg_xdp_bind_flags |= XDP_COPY;
+			break;
+		case 'D':
+			daddr = optarg;
+			break;
+		case 'd':
+			cfg_xdp_flags |= XDP_FLAGS_DRV_MODE;
+			break;
+		case 'f':
+			cfg_fill_after_bind = true;
+			break;
+		case 'h':
+			cfg_host_run = true;
+			break;
+		case 'i':
+			cfg_ifname = optarg;
+			break;
+		case 'm':
+			cfg_mac_src = optarg;
+			break;
+		case 'M':
+			cfg_mac_dst = optarg;
+			break;
+		case 'p':
+			cfg_port = htons(atoi(optarg));
+			break;
+		case 'q':
+			cfg_send_queue_id = atoi(optarg);
+			break;
+		case 'R':
+			cfg_xdp_rx = true;
+			break;
+		case 'S':
+			saddr = optarg;
+			break;
+		case 's':
+			cfg_xdp_flags |= XDP_FLAGS_SKB_MODE;
+			break;
+		case 'T':
+			if (cfg_xdp_tx)
+				cfg_xdp_tx_force_attach = true;
+			cfg_xdp_tx = true;
+			break;
+		case 'z':
+			cfg_xdp_bind_flags |= XDP_ZEROCOPY;
+			break;
+		default:
+			error(1, 0, "%s: parse error", argv[0]);
+		}
+	}
+
+	if (cfg_family == PF_UNSPEC)
+		error(1, 0, "select one of -4 or -6");
+
+	if (!cfg_mac_src || !cfg_mac_dst || !saddr || !daddr)
+		error(1, 0, "all MAC and IP addresses must be set");
+	if (cfg_fill_after_bind && !cfg_xdp_rx && !cfg_xdp_tx)
+		error(1, 0, "'-f' is meaningless without '-R' or '-T'");
+
+	if (inet_pton(cfg_family, daddr, cfg_daddr) != 1)
+		error(1, 0, "dst addr parse error: dst ('-D')");
+	if (inet_pton(cfg_family, saddr, cfg_saddr) != 1)
+		error(1, 0, "src addr parse error: src ('-S')");
+
+	cfg_ifindex = if_nametoindex(cfg_ifname);
+	if (!cfg_ifindex)
+		error(1, 0, "ifname invalid");
+}
+
+static void handle_signal(int sig)
+{
+	/* Signal handler (rather than default termination) needed to
+	 * make sure the atexit cleanup is invoked and XDP is detached
+	 */
+	exit(1);
+}
+
+int main(int argc, char *argv[])
+{
+	int ret;
+	int i;
+
+	parse_opts(argc, argv);
+	init_pkt();
+
+	/* A UDP socket to silence kernel-generated ICMP unreachable
+	 * without needing an iptables rule.
+	 */
+	udp_fd = socket(PF_INET6, SOCK_DGRAM, 0);
+	if (udp_fd == -1)
+		error(1, errno, "socket");
+
+	pfpacket_fd = socket(PF_PACKET, SOCK_RAW, 0);
+	if (pfpacket_fd == -1)
+		error(1, errno, "socket");
+
+	udp_bind();
+
+	pfpacket_setfilter_ipproto();
+	pfpacket_bind();
+
+	cfg_num_rxq = rxq_num(cfg_ifname);
+	if (cfg_num_rxq < 0)
+		error(1, -cfg_num_rxq, "rxq_num");
+
+	if (cfg_xdp_rx || cfg_xdp_tx_force_attach) {
+		bpf_obj = xsk_hw__open();
+		if (libbpf_get_error(bpf_obj))
+			error(1, libbpf_get_error(bpf_obj), "xsk_hw__open");
+
+		/* Not doing bpf_program__set_ifindex because it requests offload */
+
+		ret = xsk_hw__load(bpf_obj);
+		if (ret)
+			error(1, -ret, "xsk_hw__load");
+
+		bpf_obj->bss->port = cfg_port;
+		bpf_obj->bss->should_rx = cfg_xdp_rx;
+	}
+
+	if (cfg_xdp_rx || cfg_xdp_tx) {
+		xsks = calloc(cfg_num_rxq, sizeof(struct xsk));
+		if (!xsks)
+			error(1, ENOMEM, "calloc");
+
+		for (i = 0; i < cfg_num_rxq; i++) {
+			ret = open_xsk(&xsks[i], i);
+			if (ret)
+				error(1, -ret, "open_xsk");
+		}
+	}
+
+	if (cfg_xdp_rx) {
+		for (i = 0; i < cfg_num_rxq; i++) {
+			int sock_fd = xsk_socket__fd(xsks[i].socket);
+			__u32 queue_id = i;
+
+			ret = bpf_map__update_elem(bpf_obj->maps.xsk,
+						   &queue_id, sizeof(queue_id),
+						   &sock_fd, sizeof(sock_fd), 0);
+			if (ret)
+				error(1, -ret, "bpf_map__update_elem");
+		}
+	}
+
+	if (cfg_xdp_rx || cfg_xdp_tx_force_attach) {
+		ret = bpf_xdp_attach(cfg_ifindex,
+				     bpf_program__fd(bpf_obj->progs.rx),
+				     cfg_xdp_flags, NULL);
+		if (ret)
+			error(1, -ret, "bpf_xdp_attach");
+	}
+
+	/* Optionally exercise an AF_XDP API use edge case:
+	 * Bind the socket before making buffers available in the fillq.
+	 *
+	 * Peculiar behavior, but seen in practice and seen it tripping
+	 * up at least one driver that would leave networking disabled
+	 * on failing to allocate during init.
+	 */
+	if (cfg_fill_after_bind) {
+		for (i = 0; i < cfg_num_rxq; i++)
+			fill_xsk(&xsks[i]);
+	}
+
+	atexit(cleanup);
+	signal(SIGINT, handle_signal);
+	signal(SIGTERM, handle_signal);
+
+	/* XDP may need a delay for device reinitialization */
+	do {
+		usleep(100 * 1000);
+	} while (link_is_down());
+
+	sync_barrier();
+
+	if (cfg_host_run) {
+		usleep(100 * 1000);
+		do_recv();
+	} else {
+		do_send();
+	}
+}
-- 
2.45.2.505.gda0bf45e8d-goog

