[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <PH0PR11MB5830DF14E012DCAC75090010D8DFA@PH0PR11MB5830.namprd11.prod.outlook.com>
Date: Tue, 24 Oct 2023 02:19:39 +0000
From: "Song, Yoong Siang" <yoong.siang.song@...el.com>
To: Stanislav Fomichev <sdf@...gle.com>, "bpf@...r.kernel.org"
<bpf@...r.kernel.org>
CC: "ast@...nel.org" <ast@...nel.org>, "daniel@...earbox.net"
<daniel@...earbox.net>, "andrii@...nel.org" <andrii@...nel.org>,
"martin.lau@...ux.dev" <martin.lau@...ux.dev>, "song@...nel.org"
<song@...nel.org>, "yhs@...com" <yhs@...com>, "john.fastabend@...il.com"
<john.fastabend@...il.com>, "kpsingh@...nel.org" <kpsingh@...nel.org>,
"haoluo@...gle.com" <haoluo@...gle.com>, "jolsa@...nel.org"
<jolsa@...nel.org>, "kuba@...nel.org" <kuba@...nel.org>, "toke@...nel.org"
<toke@...nel.org>, "willemb@...gle.com" <willemb@...gle.com>,
"dsahern@...nel.org" <dsahern@...nel.org>, "Karlsson, Magnus"
<magnus.karlsson@...el.com>, "bjorn@...nel.org" <bjorn@...nel.org>,
"Fijalkowski, Maciej" <maciej.fijalkowski@...el.com>, "hawk@...nel.org"
<hawk@...nel.org>, "netdev@...r.kernel.org" <netdev@...r.kernel.org>,
"xdp-hints@...-project.net" <xdp-hints@...-project.net>
Subject: RE: [PATCH bpf-next v4 10/11] selftests/bpf: Add TX side to
xdp_hw_metadata
On Friday, October 20, 2023 1:50 AM Stanislav Fomichev <sdf@...gle.com> wrote:
>When we get a packet on port 9091, we swap src/dst and send it out.
>At this point we also request the timestamp and checksum offloads.
>
>Checksum offload is verified by looking at the tcpdump on the other side.
>The tool prints pseudo-header csum and the final one it expects.
>The final checksum actually matches the incoming packets checksum
>because we only flip the src/dst and don't change the payload.
>
>Some other related changes:
>- switched to zerocopy mode by default; new flag can be used to force
> old behavior
>- request fixed tx_metadata_len headroom
>- some other small fixes (umem size, fill idx+i, etc)
>
>mvbz3:~# ./xdp_hw_metadata eth3
>...
>xsk_ring_cons__peek: 1
>0x19546f8: rx_desc[0]->addr=80100 addr=80100 comp_addr=80100
>rx_hash: 0x80B7EA8B with RSS type:0x2A
>rx_timestamp: 1697580171852147395 (sec:1697580171.8521)
>HW RX-time: 1697580171852147395 (sec:1697580171.8521), delta to User RX-time sec:0.2797 (279673.082 usec)
>XDP RX-time: 1697580172131699047 (sec:1697580172.1317), delta to User RX-time sec:0.0001 (121.430 usec)
>0x19546f8: ping-pong with csum=3b8e (want d862) csum_start=54 csum_offset=6
>0x19546f8: complete tx idx=0 addr=8
>tx_timestamp: 1697580172056756493 (sec:1697580172.0568)
Hi Stanislav,
rx_timestamp is duplicating HW RX-time while tx_timestamp is duplicating HW TX-complete-time,
so, I think can remove printing of rx_timestamp and tx_timestamp to avoid confusion.
>HW TX-complete-time: 1697580172056756493 (sec:1697580172.0568), delta to User TX-complete-time sec:0.0852 (85175.537 usec)
>XDP RX-time: 1697580172131699047 (sec:1697580172.1317), delta to User TX-complete-time sec:0.0102 (10232.983 usec)
>HW RX-time: 1697580171852147395 (sec:1697580171.8521), delta to HW TX-complete-time sec:0.2046 (204609.098 usec)
>0x19546f8: complete rx idx=128 addr=80100
>
>mvbz4:~# nc -Nu -q1 ${MVBZ3_LINK_LOCAL_IP}%eth3 9091
>
>mvbz4:~# tcpdump -vvx -i eth3 udp
> tcpdump: listening on eth3, link-type EN10MB (Ethernet), snapshot length 262144
>bytes
>12:26:09.301074 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload
>length: 11) fe80::1270:fdff:fe48:1087.55807 > fe80::1270:fdff:fe48:1077.9091: [bad
>udp cksum 0x3b8e -> 0xde7e!] UDP, length 3
> 0x0000: 6003 5fa5 000b 117f fe80 0000 0000 0000
> 0x0010: 1270 fdff fe48 1087 fe80 0000 0000 0000
> 0x0020: 1270 fdff fe48 1077 d9ff 2383 000b 3b8e
> 0x0030: 7864 70
>12:26:09.301976 IP6 (flowlabel 0x35fa5, hlim 127, next-header UDP (17) payload
>length: 11) fe80::1270:fdff:fe48:1077.9091 > fe80::1270:fdff:fe48:1087.55807: [udp
>sum ok] UDP, length 3
> 0x0000: 6003 5fa5 000b 117f fe80 0000 0000 0000
> 0x0010: 1270 fdff fe48 1077 fe80 0000 0000 0000
> 0x0020: 1270 fdff fe48 1087 2383 d9ff 000b de7e
> 0x0030: 7864 70
>
>This reverts commit c3c9abc1d0c989e0be21d78cccd99076cc94ec44.
It didn't looked like this patch is reverting something.
If this is not a mistake, can you add the commit title behind the ID?
Thanks & Regards
Siang
>
>Signed-off-by: Stanislav Fomichev <sdf@...gle.com>
>---
> tools/testing/selftests/bpf/xdp_hw_metadata.c | 159 +++++++++++++++++-
> 1 file changed, 157 insertions(+), 2 deletions(-)
>
>diff --git a/tools/testing/selftests/bpf/xdp_hw_metadata.c
>b/tools/testing/selftests/bpf/xdp_hw_metadata.c
>index 057f7c145f62..d9421c5889f8 100644
>--- a/tools/testing/selftests/bpf/xdp_hw_metadata.c
>+++ b/tools/testing/selftests/bpf/xdp_hw_metadata.c
>@@ -10,7 +10,9 @@
> * - rx_hash
> *
> * TX:
>- * - TBD
>+ * - UDP 9091 packets trigger TX reply
>+ * - TX HW timestamp is requested and reported back upon completion
>+ * - TX checksum is requested
> */
>
> #include <test_progs.h>
>@@ -24,11 +26,14 @@
> #include <linux/net_tstamp.h>
> #include <linux/udp.h>
> #include <linux/sockios.h>
>+#include <linux/if_xdp.h>
> #include <sys/mman.h>
> #include <net/if.h>
> #include <ctype.h>
> #include <poll.h>
> #include <time.h>
>+#include <unistd.h>
>+#include <libgen.h>
>
> #include "xdp_metadata.h"
>
>@@ -53,6 +58,9 @@ struct xsk *rx_xsk;
> const char *ifname;
> int ifindex;
> int rxq;
>+bool skip_tx;
>+__u64 last_hw_rx_timestamp;
>+__u64 last_xdp_rx_timestamp;
>
> void test__fail(void) { /* for network_helpers.c */ }
>
>@@ -69,6 +77,7 @@ static int open_xsk(int ifindex, struct xsk *xsk, __u32 queue_id)
> .comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS,
> .frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE,
> .flags = XSK_UMEM__DEFAULT_FLAGS,
>+ .tx_metadata_len = sizeof(struct xsk_tx_metadata),
> };
> __u32 idx;
> u64 addr;
>@@ -190,6 +199,10 @@ static void verify_xdp_metadata(void *data, clockid_t
>clock_id)
> if (meta->rx_timestamp) {
> __u64 ref_tstamp = gettime(clock_id);
>
>+ /* store received timestamps to calculate a delta at tx */
>+ last_hw_rx_timestamp = meta->rx_timestamp;
>+ last_xdp_rx_timestamp = meta->xdp_timestamp;
>+
> print_tstamp_delta("HW RX-time", "User RX-time",
> meta->rx_timestamp, ref_tstamp);
> print_tstamp_delta("XDP RX-time", "User RX-time",
>@@ -242,6 +255,128 @@ static void verify_skb_metadata(int fd)
> printf("skb hwtstamp is not found!\n");
> }
>
>+static bool complete_tx(struct xsk *xsk, clockid_t clock_id)
>+{
>+ struct xsk_tx_metadata *meta;
>+ __u64 addr;
>+ void *data;
>+ __u32 idx;
>+
>+ if (!xsk_ring_cons__peek(&xsk->comp, 1, &idx))
>+ return false;
>+
>+ addr = *xsk_ring_cons__comp_addr(&xsk->comp, idx);
>+ data = xsk_umem__get_data(xsk->umem_area, addr);
>+ meta = data - sizeof(struct xsk_tx_metadata);
>+
>+ printf("%p: complete tx idx=%u addr=%llx\n", xsk, idx, addr);
>+
>+ printf("tx_timestamp: %llu (sec:%0.4f)\n", meta->completion.tx_timestamp,
>+ (double)meta->completion.tx_timestamp / NANOSEC_PER_SEC);
>+ if (meta->completion.tx_timestamp) {
>+ __u64 ref_tstamp = gettime(clock_id);
>+
>+ print_tstamp_delta("HW TX-complete-time", "User TX-complete-
>time",
>+ meta->completion.tx_timestamp, ref_tstamp);
>+ print_tstamp_delta("XDP RX-time", "User TX-complete-time",
>+ last_xdp_rx_timestamp, ref_tstamp);
>+ print_tstamp_delta("HW RX-time", "HW TX-complete-time",
>+ last_hw_rx_timestamp, meta-
>>completion.tx_timestamp);
>+ }
>+
>+ xsk_ring_cons__release(&xsk->comp, 1);
>+
>+ return true;
>+}
>+
>+#define swap(a, b, len) do { \
>+ for (int i = 0; i < len; i++) { \
>+ __u8 tmp = ((__u8 *)a)[i]; \
>+ ((__u8 *)a)[i] = ((__u8 *)b)[i]; \
>+ ((__u8 *)b)[i] = tmp; \
>+ } \
>+} while (0)
>+
>+static void ping_pong(struct xsk *xsk, void *rx_packet, clockid_t clock_id)
>+{
>+ struct xsk_tx_metadata *meta;
>+ struct ipv6hdr *ip6h = NULL;
>+ struct iphdr *iph = NULL;
>+ struct xdp_desc *tx_desc;
>+ struct udphdr *udph;
>+ struct ethhdr *eth;
>+ __sum16 want_csum;
>+ void *data;
>+ __u32 idx;
>+ int ret;
>+ int len;
>+
>+ ret = xsk_ring_prod__reserve(&xsk->tx, 1, &idx);
>+ if (ret != 1) {
>+ printf("%p: failed to reserve tx slot\n", xsk);
>+ return;
>+ }
>+
>+ tx_desc = xsk_ring_prod__tx_desc(&xsk->tx, idx);
>+ tx_desc->addr = idx % (UMEM_NUM / 2) * UMEM_FRAME_SIZE +
>sizeof(struct xsk_tx_metadata);
>+ data = xsk_umem__get_data(xsk->umem_area, tx_desc->addr);
>+
>+ meta = data - sizeof(struct xsk_tx_metadata);
>+ memset(meta, 0, sizeof(*meta));
>+ meta->flags = XDP_TX_METADATA_TIMESTAMP;
>+
>+ eth = rx_packet;
>+
>+ if (eth->h_proto == htons(ETH_P_IP)) {
>+ iph = (void *)(eth + 1);
>+ udph = (void *)(iph + 1);
>+ } else if (eth->h_proto == htons(ETH_P_IPV6)) {
>+ ip6h = (void *)(eth + 1);
>+ udph = (void *)(ip6h + 1);
>+ } else {
>+ printf("%p: failed to detect IP version for ping pong %04x\n", xsk, eth-
>>h_proto);
>+ xsk_ring_prod__cancel(&xsk->tx, 1);
>+ return;
>+ }
>+
>+ len = ETH_HLEN;
>+ if (ip6h)
>+ len += sizeof(*ip6h) + ntohs(ip6h->payload_len);
>+ if (iph)
>+ len += ntohs(iph->tot_len);
>+
>+ swap(eth->h_dest, eth->h_source, ETH_ALEN);
>+ if (iph)
>+ swap(&iph->saddr, &iph->daddr, 4);
>+ else
>+ swap(&ip6h->saddr, &ip6h->daddr, 16);
>+ swap(&udph->source, &udph->dest, 2);
>+
>+ want_csum = udph->check;
>+ if (ip6h)
>+ udph->check = ~csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr,
>+ ntohs(udph->len), IPPROTO_UDP, 0);
>+ else
>+ udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
>+ ntohs(udph->len), IPPROTO_UDP, 0);
>+
>+ meta->flags |= XDP_TX_METADATA_CHECKSUM;
>+ if (iph)
>+ meta->csum_start = sizeof(*eth) + sizeof(*iph);
>+ else
>+ meta->csum_start = sizeof(*eth) + sizeof(*ip6h);
>+ meta->csum_offset = offsetof(struct udphdr, check);
>+
>+ printf("%p: ping-pong with csum=%04x (want %04x) csum_start=%d
>csum_offset=%d\n",
>+ xsk, ntohs(udph->check), ntohs(want_csum), meta->csum_start, meta-
>>csum_offset);
>+
>+ memcpy(data, rx_packet, len); /* don't share umem chunk for simplicity */
>+ tx_desc->options |= XDP_TX_METADATA;
>+ tx_desc->len = len;
>+
>+ xsk_ring_prod__submit(&xsk->tx, 1);
>+}
>+
> static int verify_metadata(struct xsk *rx_xsk, int rxq, int server_fd, clockid_t clock_id)
> {
> const struct xdp_desc *rx_desc;
>@@ -307,6 +442,22 @@ static int verify_metadata(struct xsk *rx_xsk, int rxq, int
>server_fd, clockid_t
> verify_xdp_metadata(xsk_umem__get_data(xsk-
>>umem_area, addr),
> clock_id);
> first_seg = false;
>+
>+ if (!skip_tx) {
>+ /* mirror first chunk back */
>+ ping_pong(xsk, xsk_umem__get_data(xsk-
>>umem_area, addr),
>+ clock_id);
>+
>+ ret = kick_tx(xsk);
>+ if (ret)
>+ printf("kick_tx ret=%d\n", ret);
>+
>+ for (int j = 0; j < 500; j++) {
>+ if (complete_tx(xsk, clock_id))
>+ break;
>+ usleep(10*1000);
>+ }
>+ }
> }
>
> xsk_ring_cons__release(&xsk->rx, 1);
>@@ -442,6 +593,7 @@ static void print_usage(void)
> " -c Run in copy mode (zerocopy is default)\n"
> " -h Display this help and exit\n\n"
> " -m Enable multi-buffer XDP for larger MTU\n"
>+ " -r Don't generate AF_XDP reply (rx metadata only)\n"
> "Generate test packets on the other machine with:\n"
> " echo -n xdp | nc -u -q1 <dst_ip> 9091\n";
>
>@@ -452,7 +604,7 @@ static void read_args(int argc, char *argv[])
> {
> char opt;
>
>- while ((opt = getopt(argc, argv, "chm")) != -1) {
>+ while ((opt = getopt(argc, argv, "chmr")) != -1) {
> switch (opt) {
> case 'c':
> bind_flags &= ~XDP_USE_NEED_WAKEUP;
>@@ -465,6 +617,9 @@ static void read_args(int argc, char *argv[])
> case 'm':
> bind_flags |= XDP_USE_SG;
> break;
>+ case 'r':
>+ skip_tx = true;
>+ break;
> case '?':
> if (isprint(optopt))
> fprintf(stderr, "Unknown option: -%c\n", optopt);
>--
>2.42.0.655.g421f12c284-goog
Powered by blists - more mailing lists