netdev - Re: [RFC PATCH net-next v6 13/13] bpf: add simple bpf tests in the tx path for so

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <564d8d62-3148-41a1-ae08-ed4ad08996d3@linux.dev>
Date: Fri, 24 Jan 2025 19:07:57 -0800
From: Martin KaFai Lau <martin.lau@...ux.dev>
To: Jason Xing <kerneljasonxing@...il.com>
Cc: davem@...emloft.net, edumazet@...gle.com, kuba@...nel.org,
 pabeni@...hat.com, dsahern@...nel.org, willemdebruijn.kernel@...il.com,
 willemb@...gle.com, ast@...nel.org, daniel@...earbox.net, andrii@...nel.org,
 eddyz87@...il.com, song@...nel.org, yonghong.song@...ux.dev,
 john.fastabend@...il.com, kpsingh@...nel.org, sdf@...ichev.me,
 haoluo@...gle.com, jolsa@...nel.org, horms@...nel.org, bpf@...r.kernel.org,
 netdev@...r.kernel.org
Subject: Re: [RFC PATCH net-next v6 13/13] bpf: add simple bpf tests in the tx
 path for so_timestamping feature

On 1/20/25 5:29 PM, Jason Xing wrote:
> Only check if we pass those three key points after we enable the
> bpf extension for so_timestamping. During each point, we can choose
> whether to print the current timestamp.
> 
> Signed-off-by: Jason Xing <kerneljasonxing@...il.com>
> ---
>   .../bpf/prog_tests/so_timestamping.c          |  98 ++++++++
>   .../selftests/bpf/progs/so_timestamping.c     | 227 ++++++++++++++++++
>   2 files changed, 325 insertions(+)
>   create mode 100644 tools/testing/selftests/bpf/prog_tests/so_timestamping.c
>   create mode 100644 tools/testing/selftests/bpf/progs/so_timestamping.c
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/so_timestamping.c b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> new file mode 100644
> index 000000000000..bbfa7eb38cfb
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/prog_tests/so_timestamping.c
> @@ -0,0 +1,98 @@
> +#define _GNU_SOURCE
> +#include <sched.h>
> +#include <linux/socket.h>
> +#include <linux/tls.h>
> +#include <net/if.h>
> +
> +#include "test_progs.h"
> +#include "cgroup_helpers.h"
> +#include "network_helpers.h"
> +
> +#include "so_timestamping.skel.h"
> +
> +#define CG_NAME "/so-timestamping-test"
> +
> +static const char addr4_str[] = "127.0.0.1";
> +static const char addr6_str[] = "::1";
> +static struct so_timestamping *skel;
> +static int cg_fd;
> +
> +static int create_netns(void)

Reuse the netns_new("so_timestamping_ns", true) from test_progs.c.

> +{
> +	if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns"))
> +		return -1;
> +
> +	if (!ASSERT_OK(system("ip link set dev lo up"), "set lo up"))
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static void test_tcp(int family)
> +{
> +	struct so_timestamping__bss *bss = skel->bss;
> +	char buf[] = "testing testing";
> +	int sfd = -1, cfd = -1;
> +	int n;
> +
> +	memset(bss, 0, sizeof(*bss));
> +
> +	sfd = start_server(family, SOCK_STREAM,
> +			   family == AF_INET6 ? addr6_str : addr4_str, 0, 0);
> +	if (!ASSERT_GE(sfd, 0, "start_server"))

nit. ASSERT_OK_FD.

> +		goto out;
> +
> +	cfd = connect_to_fd(sfd, 0);
> +	if (!ASSERT_GE(cfd, 0, "connect_to_fd_server")) {

Same here. ASSERT_OK_FD.

> +		close(sfd);

This close is unnecessary. It will also cause a double close at "out:".

> +		goto out;
> +	}
> +
> +	n = write(cfd, buf, sizeof(buf));
> +	if (!ASSERT_EQ(n, sizeof(buf), "send to server"))
> +		goto out;
> +
> +	ASSERT_EQ(bss->nr_active, 1, "nr_active");
> +	ASSERT_EQ(bss->nr_snd, 2, "nr_snd");
> +	ASSERT_EQ(bss->nr_sched, 1, "nr_sched");
> +	ASSERT_EQ(bss->nr_txsw, 1, "nr_txsw");
> +	ASSERT_EQ(bss->nr_ack, 1, "nr_ack");
> +
> +out:
> +	if (sfd >= 0)
> +		close(sfd);
> +	if (cfd >= 0)
> +		close(cfd);
> +}
> +
> +void test_so_timestamping(void)
> +{
> +	cg_fd = test__join_cgroup(CG_NAME);
> +	if (cg_fd < 0)
> +		return;
> +
> +	if (create_netns())
> +		goto done;
> +
> +	skel = so_timestamping__open();

nit. so_timestamping__open_and_load()

> +	if (!ASSERT_OK_PTR(skel, "open skel"))
> +		goto done;
> +
> +	if (!ASSERT_OK(so_timestamping__load(skel), "load skel"))

Then this __load() is not needed.

> +		goto done;
> +
> +	if (!ASSERT_OK(so_timestamping__attach(skel), "attach skel"))
> +		goto done;
> +
> +	skel->links.skops_sockopt =
> +		bpf_program__attach_cgroup(skel->progs.skops_sockopt, cg_fd);
> +	if (!ASSERT_OK_PTR(skel->links.skops_sockopt, "attach cgroup"))
> +		goto done;
> +
> +	test_tcp(AF_INET6);
> +	test_tcp(AF_INET);
> +
> +done:
> +	so_timestamping__destroy(skel);
> +	close(cg_fd);
> +}
> diff --git a/tools/testing/selftests/bpf/progs/so_timestamping.c b/tools/testing/selftests/bpf/progs/so_timestamping.c
> new file mode 100644
> index 000000000000..f4708e84c243
> --- /dev/null
> +++ b/tools/testing/selftests/bpf/progs/so_timestamping.c
> @@ -0,0 +1,227 @@
> +#include "vmlinux.h"
> +#include "bpf_tracing_net.h"
> +#include <bpf/bpf_core_read.h>
> +#include <bpf/bpf_helpers.h>
> +#include <bpf/bpf_tracing.h>
> +//#include <bpf/bpf_core_read.h>
> +#include "bpf_misc.h"
> +#include "bpf_kfuncs.h"
> +
> +#define SK_BPF_CB_FLAGS 1009
> +#define SK_BPF_CB_TX_TIMESTAMPING 1
> +
> +int nr_active;
> +int nr_snd;
> +int nr_passive;
> +int nr_sched;
> +int nr_txsw;
> +int nr_ack;
> +
> +struct sockopt_test {
> +	int opt;
> +	int new;
> +};
> +
> +static const struct sockopt_test sol_socket_tests[] = {
> +	{ .opt = SK_BPF_CB_FLAGS, .new = SK_BPF_CB_TX_TIMESTAMPING, },
> +	{ .opt = 0, },
> +};
> +
> +struct loop_ctx {
> +	void *ctx;
> +	const struct sock *sk;
> +};
> +
> +struct sk_stg {
> +	__u64 sendmsg_ns;	/* record ts when sendmsg is called */
> +};
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_SK_STORAGE);
> +	__uint(map_flags, BPF_F_NO_PREALLOC);
> +	__type(key, int);
> +	__type(value, struct sk_stg);
> +} sk_stg_map SEC(".maps");
> +
> +
> +struct delay_info {
> +	u64 sendmsg_ns;		/* record ts when sendmsg is called */
> +	u32 sched_delay;	/* SCHED_OPT_CB - sendmsg_ns */
> +	u32 sw_snd_delay;	/* SW_OPT_CB - SCHED_OPT_CB */
> +	u32 ack_delay;		/* ACK_OPT_CB - SW_OPT_CB */
> +};
> +
> +struct {
> +	__uint(type, BPF_MAP_TYPE_HASH);
> +	__type(key, u32);
> +	__type(value, struct delay_info);
> +	__uint(max_entries, 1024);
> +} time_map SEC(".maps");
> +
> +static u64 delay_tolerance_nsec = 1000000000; /* 1 second as an example */
> +
> +static int bpf_test_sockopt_int(void *ctx, const struct sock *sk,
> +				const struct sockopt_test *t,
> +				int level)
> +{
> +	int new, opt, tmp;
> +
> +	opt = t->opt;
> +	new = t->new;
> +
> +	if (bpf_setsockopt(ctx, level, opt, &new, sizeof(new)))
> +		return 1;
> +
> +	if (bpf_getsockopt(ctx, level, opt, &tmp, sizeof(tmp)) ||
> +	    tmp != new) {
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static int bpf_test_socket_sockopt(__u32 i, struct loop_ctx *lc)
> +{
> +	const struct sockopt_test *t;
> +
> +	if (i >= ARRAY_SIZE(sol_socket_tests))
> +		return 1;
> +
> +	t = &sol_socket_tests[i];
> +	if (!t->opt)
> +		return 1;
> +
> +	return bpf_test_sockopt_int(lc->ctx, lc->sk, t, SOL_SOCKET);
> +}
> +
> +static int bpf_test_sockopt(void *ctx, const struct sock *sk)
> +{
> +	struct loop_ctx lc = { .ctx = ctx, .sk = sk, };
> +	int n;
> +
> +	n = bpf_loop(ARRAY_SIZE(sol_socket_tests), bpf_test_socket_sockopt, &lc, 0);
> +	if (n != ARRAY_SIZE(sol_socket_tests))
> +		return -1;
> +
> +	return 0;
> +}
> +
> +static bool bpf_test_delay(struct bpf_sock_ops *skops, const struct sock *sk)
> +{
> +	struct bpf_sock_ops_kern *skops_kern;
> +	u64 timestamp = bpf_ktime_get_ns();
> +	struct skb_shared_info *shinfo;
> +	struct delay_info dinfo = {0};
> +	struct delay_info *val;
> +	struct sk_buff *skb;
> +	struct sk_stg *stg;
> +	u32 delay, tskey;
> +	u64 prior_ts;
> +
> +	skops_kern = bpf_cast_to_kern_ctx(skops);
> +	skb = skops_kern->skb;
> +	shinfo = bpf_core_cast(skb->head + skb->end, struct skb_shared_info);
> +	tskey = shinfo->tskey;
> +	if (!tskey)
> +		return false;
> +
> +	if (skops->op == BPF_SOCK_OPS_TS_TCP_SND_CB) {
> +		stg = bpf_sk_storage_get(&sk_stg_map, (void *)sk, 0, 0);
> +		if (!stg)
> +			return false;
> +		dinfo.sendmsg_ns = stg->sendmsg_ns;
> +		val = &dinfo;

Move the map_update here instead.

		bpf_map_update_elem(&time_map, &tskey, val, BPF_ANY);

> +		goto out;
> +	}
> +
> +	val = bpf_map_lookup_elem(&time_map, &tskey);
> +	if (!val)
> +		return false;
> +
> +	switch (skops->op) {
> +	case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
> +		delay = val->sched_delay = timestamp - val->sendmsg_ns;
> +		break;
> +	case BPF_SOCK_OPS_TS_SW_OPT_CB:
> +		prior_ts = val->sched_delay + val->sendmsg_ns;
> +		delay = val->sw_snd_delay = timestamp - prior_ts;
> +		break;
> +	case BPF_SOCK_OPS_TS_ACK_OPT_CB:
> +		prior_ts = val->sw_snd_delay + val->sched_delay + val->sendmsg_ns;
> +		delay = val->ack_delay = timestamp - prior_ts;
> +		break;
> +	}
> +
> +	if (delay <= 0 || delay >= delay_tolerance_nsec)

Regarding the "delay <= 0" check, note that delay is defined as u32, so it can
never be negative.

delay_tolerance_nsec is 1 sec, which could be too short for the bpf CI. Maybe
raise it to something like 10s and only check "if (delay >= delay_tolerance_nsec)".
It would also be useful to bump a nr_long_delay counter and ASSERT on it in userspace.

Btw, the delay is in nsec; is u32 enough? (A u32 can only hold about 4.29 s
worth of nanoseconds.)


> +		return false;
> +
> +	/* Since it's the last one, remove from the map after latency check */
> +	if (skops->op == BPF_SOCK_OPS_TS_ACK_OPT_CB) {
> +		bpf_map_delete_elem(&time_map, &tskey);
> +		return true;
> +	}
> +
> +out:
> +	bpf_map_update_elem(&time_map, &tskey, val, BPF_ANY);

Then there is no need to do update_elem here for the other ops.

Overall, I think the set looks good. Only a few things left. Thanks for 
revamping the test also. The test should be pretty close to how it will be used.

Please add tests to ensure the new timestamping callbacks cannot use the helpers 
that we discussed in the earlier patch and also cannot directly read/write the 
sock fields through the bpf_sock_ops.

Please also add some details on what the UDP BPF_SOCK_OPS_TS_TCP_SND_CB (or to be
renamed to BPF_SOCK_OPS_TS_SND_CB ?) will look like. It is the only callback
for which I don't have a clear idea of how it will work for UDP.

Please tag the set to bpf-next. Then the bpf CI can pick it up automatically and
continue testing it whenever other bpf patches land.

[ I will reply to the other follow-ups later ]

> +	return true;
> +}
> +
> +SEC("fentry/tcp_sendmsg_locked")
> +int BPF_PROG(trace_tcp_sendmsg_locked, struct sock *sk, struct msghdr *msg, size_t size)
> +{
> +	u64 timestamp = bpf_ktime_get_ns();
> +	u32 flag = sk->sk_bpf_cb_flags;
> +	struct sk_stg *stg;
> +
> +	if (!flag)
> +		return 0;
> +
> +	stg = bpf_sk_storage_get(&sk_stg_map, sk, 0,
> +				 BPF_SK_STORAGE_GET_F_CREATE);
> +	if (!stg)
> +		return 0;
> +
> +	stg->sendmsg_ns = timestamp;
> +	nr_snd += 1;
> +	return 0;
> +}
> +
> +SEC("sockops")
> +int skops_sockopt(struct bpf_sock_ops *skops)
> +{
> +	struct bpf_sock *bpf_sk = skops->sk;
> +	const struct sock *sk;
> +
> +	if (!bpf_sk)
> +		return 1;
> +
> +	sk = (struct sock *)bpf_skc_to_tcp_sock(bpf_sk);
> +	if (!sk)
> +		return 1;
> +
> +	switch (skops->op) {
> +	case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
> +		nr_active += !bpf_test_sockopt(skops, sk);
> +		break;
> +	case BPF_SOCK_OPS_TS_TCP_SND_CB:
> +		if (bpf_test_delay(skops, sk))
> +			nr_snd += 1;
> +		break;
> +	case BPF_SOCK_OPS_TS_SCHED_OPT_CB:
> +		if (bpf_test_delay(skops, sk))
> +			nr_sched += 1;
> +		break;
> +	case BPF_SOCK_OPS_TS_SW_OPT_CB:
> +		if (bpf_test_delay(skops, sk))
> +			nr_txsw += 1;
> +		break;
> +	case BPF_SOCK_OPS_TS_ACK_OPT_CB:
> +		if (bpf_test_delay(skops, sk))
> +			nr_ack += 1;
> +		break;
> +	}
> +
> +	return 1;
> +}
> +
> +char _license[] SEC("license") = "GPL";