[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <bb218f08-79ea-5029-e0a2-57d8a6d07d51@fb.com>
Date: Tue, 17 Dec 2019 17:36:13 +0000
From: Yonghong Song <yhs@...com>
To: Martin Lau <kafai@...com>,
"bpf@...r.kernel.org" <bpf@...r.kernel.org>
CC: Alexei Starovoitov <ast@...nel.org>,
Daniel Borkmann <daniel@...earbox.net>,
David Miller <davem@...emloft.net>,
Kernel Team <Kernel-team@...com>,
"netdev@...r.kernel.org" <netdev@...r.kernel.org>
Subject: Re: [PATCH bpf-next 07/13] bpf: tcp: Support tcp_congestion_ops in
bpf
On 12/13/19 4:47 PM, Martin KaFai Lau wrote:
> This patch makes "struct tcp_congestion_ops" to be the first user
> of BPF STRUCT_OPS. It allows implementing a tcp_congestion_ops
> in bpf.
>
> The BPF implemented tcp_congestion_ops can be used like
> regular kernel tcp-cc through sysctl and setsockopt. e.g.
> [root@...h-fb-vm1 bpf]# sysctl -a | egrep congestion
> net.ipv4.tcp_allowed_congestion_control = reno cubic bpf_cubic
> net.ipv4.tcp_available_congestion_control = reno bic cubic bpf_cubic
> net.ipv4.tcp_congestion_control = bpf_cubic
>
> There has been attempt to move the TCP CC to the user space
> (e.g. CCP in TCP). The common arguments are faster turn around,
> get away from long-tail kernel versions in production...etc,
> which are legit points.
>
> BPF has been the continuous effort to join both kernel and
> userspace upsides together (e.g. XDP to gain the performance
> advantage without bypassing the kernel). The recent BPF
> advancements (in particular BTF-aware verifier, BPF trampoline,
> BPF CO-RE...) made implementing kernel struct ops (e.g. tcp cc)
> possible in BPF. It allows a faster turnaround for testing algorithm
> in the production while leveraging the existing (and continue growing)
> BPF feature/framework instead of building one specifically for
> userspace TCP CC.
>
> This patch allows write access to a few fields in tcp-sock
> (in bpf_tcp_ca_btf_struct_access()).
>
> The optional "get_info" is unsupported now. It can be added
> later. One possible way is to output the info with a btf-id
> to describe the content.
>
> Signed-off-by: Martin KaFai Lau <kafai@...com>
> ---
> include/linux/filter.h | 2 +
> include/net/tcp.h | 1 +
> kernel/bpf/bpf_struct_ops_types.h | 7 +-
> net/core/filter.c | 2 +-
> net/ipv4/Makefile | 4 +
> net/ipv4/bpf_tcp_ca.c | 225 ++++++++++++++++++++++++++++++
> net/ipv4/tcp_cong.c | 14 +-
> net/ipv4/tcp_ipv4.c | 6 +-
> net/ipv4/tcp_minisocks.c | 4 +-
> net/ipv4/tcp_output.c | 4 +-
> 10 files changed, 254 insertions(+), 15 deletions(-)
> create mode 100644 net/ipv4/bpf_tcp_ca.c
>
> diff --git a/include/linux/filter.h b/include/linux/filter.h
> index 37ac7025031d..7c22c5e6528d 100644
> --- a/include/linux/filter.h
> +++ b/include/linux/filter.h
> @@ -844,6 +844,8 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog);
> int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog,
> bpf_aux_classic_check_t trans, bool save_orig);
> void bpf_prog_destroy(struct bpf_prog *fp);
> +const struct bpf_func_proto *
> +bpf_base_func_proto(enum bpf_func_id func_id);
>
> int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
> int sk_attach_bpf(u32 ufd, struct sock *sk);
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 86b9a8766648..fd87fa1df603 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -1007,6 +1007,7 @@ enum tcp_ca_ack_event_flags {
> #define TCP_CONG_NON_RESTRICTED 0x1
> /* Requires ECN/ECT set on all packets */
> #define TCP_CONG_NEEDS_ECN 0x2
> +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
>
> union tcp_cc_info;
>
> diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h
> index 7bb13ff49ec2..066d83ea1c99 100644
> --- a/kernel/bpf/bpf_struct_ops_types.h
> +++ b/kernel/bpf/bpf_struct_ops_types.h
> @@ -1,4 +1,9 @@
> /* SPDX-License-Identifier: GPL-2.0 */
> /* internal file - do not include directly */
>
> -/* To be filled in a later patch */
> +#ifdef CONFIG_BPF_JIT
> +#ifdef CONFIG_INET
> +#include <net/tcp.h>
> +BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
> +#endif
> +#endif
> diff --git a/net/core/filter.c b/net/core/filter.c
> index a411f7835dee..fbb3698026bd 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -5975,7 +5975,7 @@ bool bpf_helper_changes_pkt_data(void *func)
> return false;
> }
>
> -static const struct bpf_func_proto *
> +const struct bpf_func_proto *
> bpf_base_func_proto(enum bpf_func_id func_id)
> {
> switch (func_id) {
> diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
> index d57ecfaf89d4..7360d9b3eaad 100644
> --- a/net/ipv4/Makefile
> +++ b/net/ipv4/Makefile
> @@ -65,3 +65,7 @@ obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
>
> obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
> xfrm4_output.o xfrm4_protocol.o
> +
> +ifeq ($(CONFIG_BPF_SYSCALL),y)
> +obj-$(CONFIG_BPF_JIT) += bpf_tcp_ca.o
> +endif
> diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c
> new file mode 100644
> index 000000000000..967af987bc26
> --- /dev/null
> +++ b/net/ipv4/bpf_tcp_ca.c
> @@ -0,0 +1,225 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/* Copyright (c) 2019 Facebook */
> +
> +#include <linux/types.h>
> +#include <linux/bpf_verifier.h>
> +#include <linux/bpf.h>
> +#include <linux/btf.h>
> +#include <linux/filter.h>
> +#include <net/tcp.h>
> +
> +static u32 optional_ops[] = {
> + offsetof(struct tcp_congestion_ops, init),
> + offsetof(struct tcp_congestion_ops, release),
> + offsetof(struct tcp_congestion_ops, set_state),
> + offsetof(struct tcp_congestion_ops, cwnd_event),
> + offsetof(struct tcp_congestion_ops, in_ack_event),
> + offsetof(struct tcp_congestion_ops, pkts_acked),
> + offsetof(struct tcp_congestion_ops, min_tso_segs),
> + offsetof(struct tcp_congestion_ops, sndbuf_expand),
> + offsetof(struct tcp_congestion_ops, cong_control),
> +};
> +
> +static u32 unsupported_ops[] = {
> + offsetof(struct tcp_congestion_ops, get_info),
> +};
Could you elaborate by adding some comments at least how
required fields are handled?
> +
> +static const struct btf_type *tcp_sock_type;
> +static u32 tcp_sock_id, sock_id;
> +
> +static int bpf_tcp_ca_init(struct btf *_btf_vmlinux)
> +{
> + s32 type_id;
> +
> + type_id = btf_find_by_name_kind(_btf_vmlinux, "sock", BTF_KIND_STRUCT);
> + if (type_id < 0)
> + return -EINVAL;
> + sock_id = type_id;
> +
> + type_id = btf_find_by_name_kind(_btf_vmlinux, "tcp_sock",
> + BTF_KIND_STRUCT);
> + if (type_id < 0)
> + return -EINVAL;
> + tcp_sock_id = type_id;
> + tcp_sock_type = btf_type_by_id(_btf_vmlinux, tcp_sock_id);
> +
> + return 0;
> +}
> +
> +static bool check_optional(u32 member_offset)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < ARRAY_SIZE(optional_ops); i++) {
> + if (member_offset == optional_ops[i])
> + return true;
> + }
> +
> + return false;
> +}
> +
> +static bool check_unsupported(u32 member_offset)
> +{
> + unsigned int i;
> +
> + for (i = 0; i < ARRAY_SIZE(unsupported_ops); i++) {
> + if (member_offset == unsupported_ops[i])
> + return true;
> + }
> +
> + return false;
> +}
> +
[...]
> +
> +static const struct bpf_verifier_ops bpf_tcp_ca_verifier_ops = {
> + .get_func_proto = bpf_tcp_ca_get_func_proto,
> + .is_valid_access = bpf_tcp_ca_is_valid_access,
> + .btf_struct_access = bpf_tcp_ca_btf_struct_access,
> +};
> +
> +static int bpf_tcp_ca_init_member(const struct btf_type *t,
> + const struct btf_member *member,
> + void *kdata, const void *udata)
> +{
> + const struct tcp_congestion_ops *utcp_ca;
> + struct tcp_congestion_ops *tcp_ca;
> + size_t tcp_ca_name_len;
> + int prog_fd;
> + u32 moff;
> +
> + utcp_ca = (const struct tcp_congestion_ops *)udata;
> + tcp_ca = (struct tcp_congestion_ops *)kdata;
> +
> + moff = btf_member_bit_offset(t, member) / 8;
> + switch (moff) {
> + case offsetof(struct tcp_congestion_ops, flags):
> + if (utcp_ca->flags & ~TCP_CONG_MASK)
> + return -EINVAL;
> + tcp_ca->flags = utcp_ca->flags;
> + return 1;
> + case offsetof(struct tcp_congestion_ops, name):
> + tcp_ca_name_len = strnlen(utcp_ca->name, sizeof(utcp_ca->name));
> + if (!tcp_ca_name_len ||
> + tcp_ca_name_len == sizeof(utcp_ca->name))
> + return -EINVAL;
> + memcpy(tcp_ca->name, utcp_ca->name, sizeof(tcp_ca->name));
> + return 1;
> + }
> +
> + if (!btf_type_resolve_func_ptr(btf_vmlinux, member->type, NULL))
> + return 0;
> +
> + prog_fd = (int)(*(unsigned long *)(udata + moff));
> + if (!prog_fd && !check_optional(moff) && !check_unsupported(moff))
> + return -EINVAL;
So if a member is option or unsupported, we will return -EINVAL?
I probably miss something here.
> +
> + return 0;
> +}
> +
> +static int bpf_tcp_ca_check_member(const struct btf_type *t,
> + const struct btf_member *member)
> +{
> + if (check_unsupported(btf_member_bit_offset(t, member) / 8))
> + return -ENOTSUPP;
> + return 0;
> +}
> +
> +static int bpf_tcp_ca_reg(void *kdata)
> +{
> + return tcp_register_congestion_control(kdata);
> +}
> +
> +static void bpf_tcp_ca_unreg(void *kdata)
> +{
> + tcp_unregister_congestion_control(kdata);
> +}
> +
[...]
Powered by blists - more mailing lists