Date:   Tue, 24 Dec 2019 01:31:44 +0000
From:   Martin Lau <kafai@...com>
To:     Andrii Nakryiko <andrii.nakryiko@...il.com>
CC:     bpf <bpf@...r.kernel.org>, Alexei Starovoitov <ast@...nel.org>,
        "Daniel Borkmann" <daniel@...earbox.net>,
        David Miller <davem@...emloft.net>,
        "Kernel Team" <Kernel-team@...com>,
        Networking <netdev@...r.kernel.org>
Subject: Re: [PATCH bpf-next v2 11/11] bpf: Add bpf_dctcp example

On Mon, Dec 23, 2019 at 03:26:50PM -0800, Andrii Nakryiko wrote:
> On Fri, Dec 20, 2019 at 10:26 PM Martin KaFai Lau <kafai@...com> wrote:
> >
> > This patch adds a bpf_dctcp example.  It currently does not fall back
> > to non-ECN, but the same could be done through cgrp2-bpf.
> >
> > Signed-off-by: Martin KaFai Lau <kafai@...com>
> > ---
> >  tools/testing/selftests/bpf/bpf_tcp_helpers.h | 228 ++++++++++++++++++
> >  .../selftests/bpf/prog_tests/bpf_tcp_ca.c     | 218 +++++++++++++++++
> >  tools/testing/selftests/bpf/progs/bpf_dctcp.c | 210 ++++++++++++++++
> >  3 files changed, 656 insertions(+)
> >  create mode 100644 tools/testing/selftests/bpf/bpf_tcp_helpers.h
> >  create mode 100644 tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c
> >  create mode 100644 tools/testing/selftests/bpf/progs/bpf_dctcp.c
> >
> > diff --git a/tools/testing/selftests/bpf/bpf_tcp_helpers.h b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > new file mode 100644
> > index 000000000000..7ba8c1b4157a
> > --- /dev/null
> > +++ b/tools/testing/selftests/bpf/bpf_tcp_helpers.h
> > @@ -0,0 +1,228 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef __BPF_TCP_HELPERS_H
> > +#define __BPF_TCP_HELPERS_H
> > +
> > +#include <stdbool.h>
> > +#include <linux/types.h>
> > +#include <bpf_helpers.h>
> > +#include <bpf_core_read.h>
> > +#include "bpf_trace_helpers.h"
> > +
> > +#define BPF_TCP_OPS_0(fname, ret_type, ...) BPF_TRACE_x(0, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_1(fname, ret_type, ...) BPF_TRACE_x(1, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_2(fname, ret_type, ...) BPF_TRACE_x(2, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_3(fname, ret_type, ...) BPF_TRACE_x(3, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_4(fname, ret_type, ...) BPF_TRACE_x(4, #fname"_sec", fname, ret_type, __VA_ARGS__)
> > +#define BPF_TCP_OPS_5(fname, ret_type, ...) BPF_TRACE_x(5, #fname"_sec", fname, ret_type, __VA_ARGS__)
> 
> Should we try to put those BPF programs into some section that would
> indicate they are used with struct_ops? libbpf doesn't use or enforce
> that (even though it could, to derive and enforce that they are
> STRUCT_OPS programs). So something like
> SEC("struct_ops/<ideally-operation-name-here>"). I think having this
> convention is very useful for consistency and for doing a quick ELF dump
> to see what is where. WDYT?
I did not use it here because I don't want any misperception that it is
a convention required by libbpf.

Sure, I can prefix it here and comment that it is just a
convention and not a libbpf requirement.
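
For illustration only (a sketch, not part of this patch): the convention
could be wired in by changing the section-name string that the
BPF_TCP_OPS_x macros pass to BPF_TRACE_x, e.g.:

/* Hypothetical variant: prefix the ELF section with "struct_ops/"
 * instead of suffixing "_sec", so an object dump of the .o shows e.g.
 * "struct_ops/dctcp_init".  Purely a naming convention; libbpf does
 * not require it.
 */
#define BPF_TCP_OPS_1(fname, ret_type, ...) \
	BPF_TRACE_x(1, "struct_ops/"#fname, fname, ret_type, __VA_ARGS__)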

> 
> > +
> > +struct sock_common {
> > +       unsigned char   skc_state;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct sock {
> > +       struct sock_common      __sk_common;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_sock {
> > +       struct sock             sk;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct inet_connection_sock {
> > +       struct inet_sock          icsk_inet;
> > +       __u8                      icsk_ca_state:6,
> > +                                 icsk_ca_setsockopt:1,
> > +                                 icsk_ca_dst_locked:1;
> > +       struct {
> > +               __u8              pending;
> > +       } icsk_ack;
> > +       __u64                     icsk_ca_priv[104 / sizeof(__u64)];
> > +} __attribute__((preserve_access_index));
> > +
> > +struct tcp_sock {
> > +       struct inet_connection_sock     inet_conn;
> > +
> > +       __u32   rcv_nxt;
> > +       __u32   snd_nxt;
> > +       __u32   snd_una;
> > +       __u8    ecn_flags;
> > +       __u32   delivered;
> > +       __u32   delivered_ce;
> > +       __u32   snd_cwnd;
> > +       __u32   snd_cwnd_cnt;
> > +       __u32   snd_cwnd_clamp;
> > +       __u32   snd_ssthresh;
> > +       __u8    syn_data:1,     /* SYN includes data */
> > +               syn_fastopen:1, /* SYN includes Fast Open option */
> > +               syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
> > +               syn_fastopen_ch:1, /* Active TFO re-enabling probe */
> > +               syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
> > +               save_syn:1,     /* Save headers of SYN packet */
> > +               is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */
> > +               syn_smc:1;      /* SYN includes SMC */
> > +       __u32   max_packets_out;
> > +       __u32   lsndtime;
> > +       __u32   prior_cwnd;
> > +} __attribute__((preserve_access_index));
> > +
> > +static __always_inline struct inet_connection_sock *inet_csk(const struct sock *sk)
> > +{
> > +       return (struct inet_connection_sock *)sk;
> > +}
> > +
> > +static __always_inline void *inet_csk_ca(const struct sock *sk)
> > +{
> > +       return (void *)inet_csk(sk)->icsk_ca_priv;
> > +}
> > +
> > +static __always_inline struct tcp_sock *tcp_sk(const struct sock *sk)
> > +{
> > +       return (struct tcp_sock *)sk;
> > +}
> > +
> > +static __always_inline bool before(__u32 seq1, __u32 seq2)
> > +{
> > +       return (__s32)(seq1-seq2) < 0;
> > +}
> > +#define after(seq2, seq1)      before(seq1, seq2)
> > +
> > +#define        TCP_ECN_OK              1
> > +#define        TCP_ECN_QUEUE_CWR       2
> > +#define        TCP_ECN_DEMAND_CWR      4
> > +#define        TCP_ECN_SEEN            8
> > +
> > +enum inet_csk_ack_state_t {
> > +       ICSK_ACK_SCHED  = 1,
> > +       ICSK_ACK_TIMER  = 2,
> > +       ICSK_ACK_PUSHED = 4,
> > +       ICSK_ACK_PUSHED2 = 8,
> > +       ICSK_ACK_NOW = 16       /* Send the next ACK immediately (once) */
> > +};
> > +
> > +enum tcp_ca_event {
> > +       CA_EVENT_TX_START = 0,
> > +       CA_EVENT_CWND_RESTART = 1,
> > +       CA_EVENT_COMPLETE_CWR = 2,
> > +       CA_EVENT_LOSS = 3,
> > +       CA_EVENT_ECN_NO_CE = 4,
> > +       CA_EVENT_ECN_IS_CE = 5,
> > +};
> > +
> > +enum tcp_ca_state {
> > +       TCP_CA_Open = 0,
> > +       TCP_CA_Disorder = 1,
> > +       TCP_CA_CWR = 2,
> > +       TCP_CA_Recovery = 3,
> > +       TCP_CA_Loss = 4
> > +};
> > +
> > +struct ack_sample {
> > +       __u32 pkts_acked;
> > +       __s32 rtt_us;
> > +       __u32 in_flight;
> > +} __attribute__((preserve_access_index));
> > +
> > +struct rate_sample {
> > +       __u64  prior_mstamp; /* starting timestamp for interval */
> > +       __u32  prior_delivered; /* tp->delivered at "prior_mstamp" */
> > +       __s32  delivered;               /* number of packets delivered over interval */
> > +       long interval_us;       /* time for tp->delivered to incr "delivered" */
> > +       __u32 snd_interval_us;  /* snd interval for delivered packets */
> > +       __u32 rcv_interval_us;  /* rcv interval for delivered packets */
> > +       long rtt_us;            /* RTT of last (S)ACKed packet (or -1) */
> > +       int  losses;            /* number of packets marked lost upon ACK */
> > +       __u32  acked_sacked;    /* number of packets newly (S)ACKed upon ACK */
> > +       __u32  prior_in_flight; /* in flight before this ACK */
> > +       bool is_app_limited;    /* is sample from packet with bubble in pipe? */
> > +       bool is_retrans;        /* is sample from retransmission? */
> > +       bool is_ack_delayed;    /* is this (likely) a delayed ACK? */
> > +} __attribute__((preserve_access_index));
> > +
> > +#define TCP_CA_NAME_MAX                16
> > +#define TCP_CONG_NEEDS_ECN     0x2
> > +
> > +struct tcp_congestion_ops {
> > +       __u32 flags;
> > +
> > +       /* initialize private data (optional) */
> > +       void (*init)(struct sock *sk);
> > +       /* cleanup private data  (optional) */
> > +       void (*release)(struct sock *sk);
> > +
> > +       /* return slow start threshold (required) */
> > +       __u32 (*ssthresh)(struct sock *sk);
> > +       /* do new cwnd calculation (required) */
> > +       void (*cong_avoid)(struct sock *sk, __u32 ack, __u32 acked);
> > +       /* call before changing ca_state (optional) */
> > +       void (*set_state)(struct sock *sk, __u8 new_state);
> > +       /* call when cwnd event occurs (optional) */
> > +       void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev);
> > +       /* call when ack arrives (optional) */
> > +       void (*in_ack_event)(struct sock *sk, __u32 flags);
> > +       /* new value of cwnd after loss (required) */
> > +       __u32  (*undo_cwnd)(struct sock *sk);
> > +       /* hook for packet ack accounting (optional) */
> > +       void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample);
> > +       /* override sysctl_tcp_min_tso_segs */
> > +       __u32 (*min_tso_segs)(struct sock *sk);
> > +       /* returns the multiplier used in tcp_sndbuf_expand (optional) */
> > +       __u32 (*sndbuf_expand)(struct sock *sk);
> > +       /* call when packets are delivered to update cwnd and pacing rate,
> > +        * after all the ca_state processing. (optional)
> > +        */
> > +       void (*cong_control)(struct sock *sk, const struct rate_sample *rs);
> > +
> > +       char            name[TCP_CA_NAME_MAX];
> > +};
> 
> Can all of these types come from vmlinux.h instead of being duplicated here?
It can, but I prefer leaving it as is in bpf_tcp_helpers.h, like the
existing test in kfree_skb.c.  Not directly using the same structs as
vmlinux.h also makes this a good test for libbpf.
That reminds me to shuffle the member ordering a little in
tcp_congestion_ops here.
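
Side note (a sketch of the mechanism, not code from the patch): since the
structs above are marked __attribute__((preserve_access_index)), plain
field accesses compile into CO-RE relocations, so libbpf patches the
offsets against the kernel's BTF at load time even though these local
definitions only carry a handful of the real fields:

/* Sketch: tcp_sk() and snd_cwnd are the helper/field defined above.
 * The access below is recorded as a CO-RE relocation, so the offset of
 * snd_cwnd is fixed up to match the running kernel's struct tcp_sock,
 * not this truncated local definition.
 */
static __always_inline __u32 read_snd_cwnd(const struct sock *sk)
{
	return tcp_sk(sk)->snd_cwnd;
}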

> 
> > +
> > +#define min(a, b) ((a) < (b) ? (a) : (b))
> > +#define max(a, b) ((a) > (b) ? (a) : (b))
> > +#define min_not_zero(x, y) ({                  \
> > +       typeof(x) __x = (x);                    \
> > +       typeof(y) __y = (y);                    \
> > +       __x == 0 ? __y : ((__y == 0) ? __x : min(__x, __y)); })
> > +
> 
> [...]
> 
> > +static struct bpf_object *load(const char *filename, const char *map_name,
> > +                              struct bpf_link **link)
> > +{
> > +       struct bpf_object *obj;
> > +       struct bpf_map *map;
> > +       struct bpf_link *l;
> > +       int err;
> > +
> > +       obj = bpf_object__open(filename);
> > +       if (CHECK(IS_ERR(obj), "bpf_obj__open_file", "obj:%ld\n",
> > +                 PTR_ERR(obj)))
> > +               return obj;
> > +
> > +       err = bpf_object__load(obj);
> > +       if (CHECK(err, "bpf_object__load", "err:%d\n", err)) {
> > +               bpf_object__close(obj);
> > +               return ERR_PTR(err);
> > +       }
> > +
> > +       map = bpf_object__find_map_by_name(obj, map_name);
> > +       if (CHECK(!map, "bpf_object__find_map_by_name", "%s not found\n",
> > +                   map_name)) {
> > +               bpf_object__close(obj);
> > +               return ERR_PTR(-ENOENT);
> > +       }
> > +
> 
> use skeleton instead?
Will give it a spin.
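
For reference, a rough sketch of what the skeleton-based flow could look
like (the bpf_dctcp__* names and the maps.dctcp field are assumed to be
what "bpftool gen skeleton" would generate for bpf_dctcp.c; they are not
defined by this patch):

/* Hypothetical skeleton-based replacement for the load() helper above.
 * Assumes a generated bpf_dctcp.skel.h and a struct_ops map named
 * "dctcp" in bpf_dctcp.c.
 */
#include "bpf_dctcp.skel.h"

static void test_dctcp(void)
{
	struct bpf_dctcp *skel;
	struct bpf_link *link;

	skel = bpf_dctcp__open_and_load();
	if (CHECK(!skel, "bpf_dctcp__open_and_load", "failed\n"))
		return;

	link = bpf_map__attach_struct_ops(skel->maps.dctcp);
	if (CHECK(IS_ERR(link), "bpf_map__attach_struct_ops", "err:%ld\n",
		  PTR_ERR(link))) {
		bpf_dctcp__destroy(skel);
		return;
	}

	/* ... exercise the connection as before ... */

	bpf_link__destroy(link);
	bpf_dctcp__destroy(skel);
}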

> 
> > +       l = bpf_map__attach_struct_ops(map);
> > +       if (CHECK(IS_ERR(l), "bpf_struct_ops_map__attach", "err:%ld\n",
> > +                 PTR_ERR(l))) {
> > +               bpf_object__close(obj);
> > +               return (void *)l;
> > +       }
> > +
> > +       *link = l;
> > +
> > +       return obj;
> > +}
> > +
