[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20191224013140.ibn33unj77mtbkne@kafai-mbp>
Date: Tue, 24 Dec 2019 01:31:44 +0000
From: Martin Lau <kafai@...com>
To: Andrii Nakryiko <andrii.nakryiko@...il.com>
CC: bpf <bpf@...r.kernel.org>, Alexei Starovoitov <ast@...nel.org>,
"Daniel Borkmann" <daniel@...earbox.net>,
David Miller <davem@...emloft.net>,
"Kernel Team" <Kernel-team@...com>,
Networking <netdev@...r.kernel.org>
Subject: Re: [PATCH bpf-next v2 11/11] bpf: Add bpf_dctcp example
On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@...com> wrote:
> >
> > This patch adds a bpf_dctcp example. It currently does not do
> > no-ECN fallback but the same could be done through the cgrp2-bpf.
> >
> > Signed-off-by: Martin KaFai Lau <kafai@...com>
> > ---
> > tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> > .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 218 +++++++++++++++++
> > tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> > 3 files changed, 656 insertions(+)
> > create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> > create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> >
> > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > new file mode 100644
> > index 000000000000..7ba8c1b4157a
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > @@ -0,0 +1,228 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __BPF_TCP_HELPERS_H
> > +#define __BPF_TCP_HELPERS_H
> > +
> > +#include <stdbool.h>
> > +#include <linux/types.h>
> > +#include <bpf_helpers.h>
> > +#include <bpf_core_read.h>
> > +#include "bpf_trace_helpers.h"
> > +
> > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
>
> Should we try to put those BPF programs into some section that would
> indicate they are used with struct opts? libbpf doesn't use or enforce
> that (even though it could to derive and enforce that they are
> STRUCT_OPS programs). So something like
> SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> convention is very useful for consistency and to do a quick ELF dump
> and see what is where. WDYT?
I did not use it here because I don't want any misperception that it is
a required convention by libbpf.
Sure, I can prefix it here and comment that it is just a
convention but not a libbpf requirement.
>
> > +
> > +struct sock_common {
> > + unsigned char skc_state;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct sock {
> > + struct sock_common __sk_common;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_sock {
> > + struct sock sk;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_connection_sock {
> > + struct inet_sock icsk_inet;
> > + __u8 icsk_ca_state:6,
> > + icsk_ca_setsockopt:1,
> > + icsk_ca_dst_locked:1;
> > + struct {
> > + __u8 pending;
> > + } icsk_ack;
> > + __u64 icsk_ca_priv[104 / sizeof(__u64)];
> > +} __attribute__((preserve_access_index));
> > +
> > +struct tcp_sock {
> > + struct inet_connection_sock inet_conn;
> > +
> > + __u32 rcv_nxt;
> > + __u32 snd_nxt;
> > + __u32 snd_una;
> > + __u8 ecn_flags;
> > + __u32 delivered;
> > + __u32 delivered_ce;
> > + __u32 snd_cwnd;
> > + __u32 snd_cwnd_cnt;
> > + __u32 snd_cwnd_clamp;
> > + __u32 snd_ssthresh;
> > + __u8 syn_data:1, /* SYN includes data */
> > + syn_fastopen:1, /* SYN includes Fast Open option */
> > + syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
> > + syn_fastopen_ch:1, /* Active TFO re-enabling probe */
> > + syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
> > + save_syn:1, /* Save headers of SYN packet */
> > + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
> > + syn_smc:1; /* SYN includes SMC */
> > + __u32 max_packets_out;
> > + __u32 lsndtime;
> > + __u32 prior_cwnd;
> > +} __attribute__((preserve_access_index));
> > +
> > +static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
> > +{
> > + return (struct inet_connection_sock *)sk;
> > +}
> > +
> > +static __always_inline void *inet_csk_ca(const struct sock *sk)
> > +{
> > + return (void *)inet_csk(sk)->icsk_ca_priv;
> > +}
> > +
> > +static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
> > +{
> > + return (struct tcp_sock *)sk;
> > +}
> > +
> > +static __always_inline bool before(__u32 seq1, __u32 seq2)
> > +{
> > + return (__s32)(seq1-seq2) < 0;
> > +}
> > +#define after(seq2, seq1) before(seq1, seq2)
> > +
> > +#define TCP_ECN_OK 1
> > +#define TCP_ECN_QUEUE_CWR 2
> > +#define TCP_ECN_DEMAND_CWR 4
> > +#define TCP_ECN_SEEN 8
> > +
> > +enum inet_csk_ack_state_t {
> > + ICSK_ACK_SCHED = 1,
> > + ICSK_ACK_TIMER = 2,
> > + ICSK_ACK_PUSHED = 4,
> > + ICSK_ACK_PUSHED2 = 8,
> > + ICSK_ACK_NOW = 16 /* Send the next ACK immediately (once) */
> > +};
> > +
> > +enum tcp_ca_event {
> > + CA_EVENT_TX_START = 0,
> > + CA_EVENT_CWND_RESTART = 1,
> > + CA_EVENT_COMPLETE_CWR = 2,
> > + CA_EVENT_LOSS = 3,
> > + CA_EVENT_ECN_NO_CE = 4,
> > + CA_EVENT_ECN_IS_CE = 5,
> > +};
> > +
> > +enum tcp_ca_state {
> > + TCP_CA_Open = 0,
> > + TCP_CA_Disorder = 1,
> > + TCP_CA_CWR = 2,
> > + TCP_CA_Recovery = 3,
> > + TCP_CA_Loss = 4
> > +};
> > +
> > +struct ack_sample {
> > + __u32 pkts_acked;
> > + __s32 rtt_us;
> > + __u32 in_flight;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct rate_sample {
> > + __u64 prior_mstamp; /* starting timestamp for interval */
> > + __u32 prior_delivered; /* tp->delivered at "prior_mstamp" */
> > + __s32 delivered; /* number of packets delivered over interval */
> > + long interval_us; /* time for tp->delivered to incr "delivered" */
> > + __u32 snd_interval_us; /* snd interval for delivered packets */
> > + __u32 rcv_interval_us; /* rcv interval for delivered packets */
> > + long rtt_us; /* RTT of last (S)ACKed packet (or -1) */
> > + int losses; /* number of packets marked lost upon ACK */
> > + __u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */
> > + __u32 prior_in_flight; /* in flight before this ACK */
> > + bool is_app_limited; /* is sample from packet with bubble in pipe? */
> > + bool is_retrans; /* is sample from retransmission? */
> > + bool is_ack_delayed; /* is this (likely) a delayed ACK? */
> > +} __attribute__((preserve_access_index));
> > +
> > +#define TCP_CA_NAME_MAX 16
> > +#define TCP_CONG_NEEDS_ECN 0x2
> > +
> > +struct tcp_congestion_ops {
> > + __u32 flags;
> > +
> > + /* initialize private data (optional) */
> > + void (*init)(struct sock *sk);
> > + /* cleanup private data (optional) */
> > + void (*release)(struct sock *sk);
> > +
> > + /* return slow start threshold (required) */
> > + __u32 (*ssthresh)(struct sock *sk);
> > + /* do new cwnd calculation (required) */
> > + void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
> > + /* call before changing ca_state (optional) */
> > + void (*set_state)(struct sock *sk, __u8 new_state);
> > + /* call when cwnd event occurs (optional) */
> > + void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
> > + /* call when ack arrives (optional) */
> > + void (*in_ack_event)(struct sock *sk, __u32 flags);
> > + /* new value of cwnd after loss (required) */
> > + __u32 (*undo_cwnd)(struct sock *sk);
> > + /* hook for packet ack accounting (optional) */
> > + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
> > + /* override sysctl_tcp_min_tso_segs */
> > + __u32 (*min_tso_segs)(struct sock *sk);
> > + /* returns the multiplier used in tcp_sndbuf_expand (optional) */
> > + __u32 (*sndbuf_expand)(struct sock *sk);
> > + /* call when packets are delivered to update cwnd and pacing rate,
> > + * after all the ca_state processing. (optional)
> > + */
> > + void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
> > +
> > + char name[TCP_CA_NAME_MAX];
> > +};
>
> Can all of these types come from vmlinux.h instead of being duplicated here?
It can but I prefer leaving it as is in bpf_tcp_helpers.h like another
existing test in kfree_skb.c. Without directly using the same struct in
vmlinux.h, I think it is a good test for libbpf.
That reminds me to shuffle the member ordering a little in tcp_congestion_ops
here.
>
> > +
> > +#define min(a, b) ((a) < (b) ? (a) : (b))
> > +#define max(a, b) ((a) > (b) ? (a) : (b))
> > +#define min_not_zero(x, y) ({ \
> > + typeof(x) __x = (x); \
> > + typeof(y) __y = (y); \
> > + __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
> > +
>
> [...]
>
> > +static struct bpf_object *load(const char *filename, const char *map_name,
> > + struct bpf_link **link)
> > +{
> > + struct bpf_object *obj;
> > + struct bpf_map *map;
> > + struct bpf_link *l;
> > + int err;
> > +
> > + obj = bpf_object__open(filename);
> > + if (CHECK(IS_ERR(obj), "bpf_obj__open_file", "obj:%ld\n",
> > + PTR_ERR(obj)))
> > + return obj;
> > +
> > + err = bpf_object__load(obj);
> > + if (CHECK(err, "bpf_object__load", "err:%d\n", err)) {
> > + bpf_object__close(obj);
> > + return ERR_PTR(err);
> > + }
> > +
> > + map = bpf_object__find_map_by_name(obj, map_name);
> > + if (CHECK(!map, "bpf_object__find_map_by_name", "%s not found\n",
> > + map_name)) {
> > + bpf_object__close(obj);
> > + return ERR_PTR(-ENOENT);
> > + }
> > +
>
> use skeleton instead?
Will give it a spin.
>
> > + l = bpf_map__attach_struct_ops(map);
> > + if (CHECK(IS_ERR(l), "bpf_struct_ops_map__attach", "err:%ld\n",
> > + PTR_ERR(l))) {
> > + bpf_object__close(obj);
> > + return (void *)l;
> > + }
> > +
> > + *link = l;
> > +
> > + return obj;
> > +}
> > +
Powered by blists - more mailing lists