netdev - Re: [RFC PATCH net-next v2 10/15] bpf: Add support for changing congestion control

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Sun, 18 Jun 2017 02:39:07 +0000
From:   Lawrence Brakmo <brakmo@...com>
To:     Daniel Borkmann <daniel@...earbox.net>,
        netdev <netdev@...r.kernel.org>
CC:     Kernel Team <Kernel-team@...com>, Blake Matheny <bmatheny@...com>,
        "Alexei Starovoitov" <ast@...com>,
        David Ahern <dsa@...ulusnetworks.com>
Subject: Re: [RFC PATCH net-next v2 10/15] bpf: Add support for changing
 congestion control


On 6/16/17, 6:58 AM, "Daniel Borkmann" <daniel@...earbox.net> wrote:

    On 06/15/2017 10:08 PM, Lawrence Brakmo wrote:
    > Added support for changing congestion control for SOCKET_OPS bps
    > programs through the setsockopt bpf helper function. It also adds
    > a new SOCKET_OPS op, BPF_SOCKET_OPS_NEEDS_ECN, that is needed for
    > congestion controls, like dctcp, that need to enable ECN in the
    > SYN packets.
    >
    > Signed-off-by: Lawrence Brakmo <brakmo@...com>
    
    I like the use-case! ;)

I’m trying to generalize somethings I hardcoded in our kernels
    
    > ---
    >   include/net/tcp.h        |  9 ++++++++-
    >   include/uapi/linux/bpf.h |  3 +++
    >   net/core/filter.c        | 11 +++++++++--
    >   net/ipv4/tcp.c           |  2 +-
    >   net/ipv4/tcp_cong.c      | 15 ++++++++++-----
    >   net/ipv4/tcp_input.c     |  3 ++-
    >   net/ipv4/tcp_output.c    |  8 +++++---
    >   7 files changed, 38 insertions(+), 13 deletions(-)
    >
    > diff --git a/include/net/tcp.h b/include/net/tcp.h
    > index 29c27dc..371b1bd 100644
    > --- a/include/net/tcp.h
    > +++ b/include/net/tcp.h
    > @@ -1001,7 +1001,9 @@ void tcp_get_default_congestion_control(char *name);
    >   void tcp_get_available_congestion_control(char *buf, size_t len);
    >   void tcp_get_allowed_congestion_control(char *buf, size_t len);
    >   int tcp_set_allowed_congestion_control(char *allowed);
    > -int tcp_set_congestion_control(struct sock *sk, const char *name);
    > +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load);
    > +void tcp_reinit_congestion_control(struct sock *sk,
    > +				   const struct tcp_congestion_ops *ca);
    >   u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
    >   void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
    >
    > @@ -2039,4 +2041,9 @@ static inline u32 tcp_rwnd_init_bpf(struct sock *sk, bool is_req_sock)
    >   		rwnd = 0;
    >   	return rwnd;
    >   }
    > +
    > +static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk)
    > +{
    > +	return (tcp_call_bpf(sk, true, BPF_SOCKET_OPS_NEEDS_ECN) == 1);
    > +}
    >   #endif	/* _TCP_H */
    > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
    > index c3490d3..8d1d2b7 100644
    > --- a/include/uapi/linux/bpf.h
    > +++ b/include/uapi/linux/bpf.h
    > @@ -776,6 +776,9 @@ enum {
    >   						 * passive connection is
    >   						 * established
    >   						 */
    > +	BPF_SOCKET_OPS_NEEDS_ECN,	/* If connection's congestion control
    > +					 * needs ECN
    > +					 */
    >   };
    >
    >   #endif /* _UAPI__LINUX_BPF_H__ */
    > diff --git a/net/core/filter.c b/net/core/filter.c
    > index 9ff567c..4325aba 100644
    > --- a/net/core/filter.c
    > +++ b/net/core/filter.c
    > @@ -2716,8 +2716,15 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_socket_ops_kern *, bpf_socket,
    >   		}
    >   	} else if (level == SOL_TCP &&
    >   		   bpf_socket->sk->sk_prot->setsockopt == tcp_setsockopt) {
    > -		/* Place holder */
    > -		ret = -EINVAL;
    > +		if (optname == TCP_CONGESTION) {
    > +			ret = tcp_set_congestion_control(sk, optval, false);
    > +			if (!ret && bpf_socket->op > BPF_SOCKET_OPS_NEEDS_ECN)
    > +				/* replacing an existing ca */
    > +				tcp_reinit_congestion_control(sk,
    > +					inet_csk(sk)->icsk_ca_ops);
    
    Can you elaborate on the sematics of BPF_SOCKET_OPS_NEEDS_ECN?
    
    It's called from tcp_bpf_ca_needs_ecn(), so if someone sets a cong ctl
    from that callback, or any other bpf_socket->op > BPF_SOCKET_OPS_NEEDS_ECN
    then we call tcp_reinit_congestion_control()? Why 'bpf_socket->op > ..'?

Sorry, this is not very clear. This is to deal with the issue that TCP_CONGESTION can
be called before there is an initialized congestion control, when there is no need to
reinit congestion control, or after one has been initialized, so we need to call reinit.

It just happens that ops <= BPF_SOCKET_OPS_NEEDS_ECN are called before
initialization, so no need to reinit.
    
    Could the needs_ecn implementation detail be hidden altogether from the
    BPF prog?

Yes, but it would be somewhat messy. The NEEDS_ECN happens early in the
connection establishment. We could ask instead of SET_CONG, where the
BPF program could return NULL (no change) or the name (ex: dctcp) then
the kernel would need to determine if ECN is needed for it. But there would
need to be another call later on to actually set it. Overall more overhead.

I would rather add more documentation to the example to make it
clearer. 
    
    > +		} else {
    > +			ret = -EINVAL;
    > +		}
    >   	} else {
    >   		ret = -EINVAL;
    >   	}
    > diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
    > index cc8fd8b..07984ea 100644
    > --- a/net/ipv4/tcp.c
    > +++ b/net/ipv4/tcp.c
    > @@ -2478,7 +2478,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
    >   		name[val] = 0;
    >
    >   		lock_sock(sk);
    > -		err = tcp_set_congestion_control(sk, name);
    > +		err = tcp_set_congestion_control(sk, name, true);
    >   		release_sock(sk);
    >   		return err;
    >   	}
    > diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
    > index 324c9bc..e6f3717 100644
    > --- a/net/ipv4/tcp_cong.c
    > +++ b/net/ipv4/tcp_cong.c
    > @@ -189,8 +189,8 @@ void tcp_init_congestion_control(struct sock *sk)
    >   		INET_ECN_dontxmit(sk);
    >   }
    >
    > -static void tcp_reinit_congestion_control(struct sock *sk,
    > -					  const struct tcp_congestion_ops *ca)
    > +void tcp_reinit_congestion_control(struct sock *sk,
    > +				   const struct tcp_congestion_ops *ca)
    >   {
    >   	struct inet_connection_sock *icsk = inet_csk(sk);
    >
    > @@ -334,7 +334,7 @@ int tcp_set_allowed_congestion_control(char *val)
    >   }
    >
    >   /* Change congestion control for socket */
    > -int tcp_set_congestion_control(struct sock *sk, const char *name)
    > +int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
    >   {
    >   	struct inet_connection_sock *icsk = inet_csk(sk);
    >   	const struct tcp_congestion_ops *ca;
    > @@ -344,7 +344,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
    >   		return -EPERM;
    >
    >   	rcu_read_lock();
    > -	ca = __tcp_ca_find_autoload(name);
    > +	if (!load)
    > +		ca = tcp_ca_find(name);
    > +	else
    > +		ca = __tcp_ca_find_autoload(name);
    
     From BPF program side, we call with !load since we're not allowed
    to sleep under RCU, that's correct ...
    
    >   	/* No change asking for existing value */
    >   	if (ca == icsk->icsk_ca_ops) {
    >   		icsk->icsk_ca_setsockopt = 1;
    > @@ -352,8 +355,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name)
    >   	}
    >   	if (!ca)
    >   		err = -ENOENT;
    > +	else if (!load)
    > +		icsk->icsk_ca_ops = ca;
    
    ... but don't we also need to hold a module ref in this case as done
    below?
    
    Meaning, tcp_ca_find() could return a ca that was previously loaded
    to the tcp_cong_list as module, then resulting in ref count imbalance
    when set from BPF?
    
As I mentioned above, this can be called before congestion has been
initialized (op <= BPF_SOCKET_OPS_NEEDS_ECN) in which case 
tcp_init_congestion_control will be called later. If op > ..OPS_NEEDS_ECN
then bpf_setsockopt() will call the reinit_congestion_control().

But this points to an issue where someone else could call 
tcp_set_congestion_control() with load == false not knowing they
need to call either init or reinit. I will add a comment to the function
to make it clear.
 
    >   	else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
    > -		   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
    > +			   ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)))
    
    Nit: slipped in?

Fixed.
    
    >   		err = -EPERM;
    >   	else if (!try_module_get(ca->owner))
    >   		err = -EBUSY;
    > diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
    > index e0d688a..14c889f 100644
    > --- a/net/ipv4/tcp_input.c
    > +++ b/net/ipv4/tcp_input.c
    > @@ -6192,7 +6192,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
    >   	ecn_ok = net->ipv4.sysctl_tcp_ecn || ecn_ok_dst;
    >
    >   	if ((!ect && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
    > -	    (ecn_ok_dst & DST_FEATURE_ECN_CA))
    > +	    (ecn_ok_dst & DST_FEATURE_ECN_CA) ||
    > +	    tcp_bpf_ca_needs_ecn((struct sock *)req))
    >   		inet_rsk(req)->ecn_ok = 1;
    >   }
    >
    > diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
    > index 9124d3d..30660ee 100644
    > --- a/net/ipv4/tcp_output.c
    > +++ b/net/ipv4/tcp_output.c
    > @@ -316,7 +316,8 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
    >   	TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
    >   	if (!(tp->ecn_flags & TCP_ECN_OK))
    >   		TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
    > -	else if (tcp_ca_needs_ecn(sk))
    > +	else if (tcp_ca_needs_ecn(sk) ||
    > +		 tcp_bpf_ca_needs_ecn(sk))
    >   		INET_ECN_xmit(sk);
    >   }
    >
    > @@ -324,8 +325,9 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
    >   static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
    >   {
    >   	struct tcp_sock *tp = tcp_sk(sk);
    > +	bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
    >   	bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
    > -		       tcp_ca_needs_ecn(sk);
    > +		tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
    >
    >   	if (!use_ecn) {
    >   		const struct dst_entry *dst = __sk_dst_get(sk);
    > @@ -339,7 +341,7 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
    >   	if (use_ecn) {
    >   		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
    >   		tp->ecn_flags = TCP_ECN_OK;
    > -		if (tcp_ca_needs_ecn(sk))
    > +		if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
    >   			INET_ECN_xmit(sk);
    >   	}
    >   }
    >