lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1399938695.7973.37.camel@edumazet-glaptop2.roam.corp.google.com>
Date:	Mon, 12 May 2014 16:51:35 -0700
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	Florian Westphal <fw@...len.de>
Cc:	netdev@...r.kernel.org, Daniel Borkmann <dborkman@...hat.com>,
	Glenn Judd <glenn.judd@...ganstanley.com>
Subject: Re: [PATCH 2/5] net: tcp: add flag for ca to indicate that ECN is
 required

On Mon, 2014-05-12 at 22:59 +0200, Florian Westphal wrote:
> From: Daniel Borkmann <dborkman@...hat.com>
> 
> This patch adds a flag to TCP congestion algorithms that allows
> for requesting to mark IPv4/IPv6 sockets with transport as ECN
> capable, that is, ECT(0), when required by a congestion algorithm.
> 
> It is currently used and needed in DataCenter TCP (DCTCP), as it
> requires both peers to assert ECT on all IP packets sent - it
> uses ECN feedback (i.e. CE, Congestion Encountered) from switches
> inside the data center to derive feedback to the end hosts.
> 
> Therefore, simply add a new flag to icsk_ca_ops. Note that DCTCP's
> algorithm/behaviour slightly diverges from RFC3168, therefore this
> is only (!) enabled iff the assigned congestion control ops module
> has requested this. By that, we can tightly couple this logic really
> only to provided congestion control ops.
> 
> Joint work with Florian Westphal and Glenn Judd.
> 
> Signed-off-by: Daniel Borkmann <dborkman@...hat.com>
> Signed-off-by: Glenn Judd <glenn.judd@...ganstanley.com>
> Signed-off-by: Florian Westphal <fw@...len.de>
> ---
>  include/net/tcp.h     | 45 ++++++++++++++++++++++++++++-----------------
>  net/ipv4/tcp_ipv4.c   |  2 +-
>  net/ipv4/tcp_output.c |  9 ++++++++-
>  net/ipv6/tcp_ipv6.c   |  2 +-
>  4 files changed, 38 insertions(+), 20 deletions(-)
> 
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 8ae165f..92d1600 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -734,23 +734,6 @@ struct tcp_skb_cb {
>  
>  #define TCP_SKB_CB(__skb)	((struct tcp_skb_cb *)&((__skb)->cb[0]))
>  
> -/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
> - *
> - * If we receive a SYN packet with these bits set, it means a network is
> - * playing bad games with TOS bits. In order to avoid possible false congestion
> - * notifications, we disable TCP ECN negociation.
> - */
> -static inline void
> -TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
> -		struct net *net)
> -{
> -	const struct tcphdr *th = tcp_hdr(skb);
> -
> -	if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
> -	    INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield))
> -		inet_rsk(req)->ecn_ok = 1;
> -}
> -
>  /* Due to TSO, an SKB can be composed of multiple actual
>   * packets.  To keep these tracked properly, we use this.
>   */
> @@ -783,6 +766,7 @@ enum tcp_ca_event {
>  #define TCP_CA_BUF_MAX	(TCP_CA_NAME_MAX*TCP_CA_MAX)
>  
>  #define TCP_CONG_NON_RESTRICTED 0x1
> +#define TCP_CONG_NEEDS_ECN	0x2
>  
>  struct tcp_congestion_ops {
>  	struct list_head	list;
> @@ -848,6 +832,13 @@ static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state)
>  	icsk->icsk_ca_state = ca_state;
>  }
>  
> +static inline bool tcp_ca_needs_ecn(const struct sock *sk)
> +{
> +	const struct inet_connection_sock *icsk = inet_csk(sk);
> +
> +	return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
> +}
> +
>  static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
>  {
>  	const struct inet_connection_sock *icsk = inet_csk(sk);
> @@ -856,6 +847,26 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
>  		icsk->icsk_ca_ops->cwnd_event(sk, event);
>  }
>  
> +/* RFC3168 : 6.1.1 SYN packets must not have ECT/ECN bits set
> + *
> + * If we receive a SYN packet with these bits set, it means a network is
> + * playing bad games with TOS bits. In order to avoid possible false congestion
> + * notifications, we disable TCP ECN negociation.
> + */

looks like comment needs a change.

> +static inline void
> +TCP_ECN_create_request(struct request_sock *req, const struct sk_buff *skb,
> +		       const struct sock *listen_sk)
> +{
> +	const struct tcphdr *th = tcp_hdr(skb);
> +	const struct net *net = sock_net(listen_sk);
> +
> +	if (net->ipv4.sysctl_tcp_ecn && th->ece && th->cwr &&
> +	    (INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield) ||
> +	     tcp_ca_needs_ecn(listen_sk))) {
> +		inet_rsk(req)->ecn_ok = 1;
> +	}
> +}
> +
>  /* These functions determine how the current flow behaves in respect of SACK
>   * handling. SACK is negotiated with the peer, and therefore it can vary
>   * between different flows.
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index ad166dc..d7d9c54 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1512,7 +1512,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
>  		goto drop_and_free;
>  
>  	if (!want_cookie || tmp_opt.tstamp_ok)
> -		TCP_ECN_create_request(req, skb, sock_net(sk));
> +		TCP_ECN_create_request(req, skb, sk);
>  
>  	if (want_cookie) {
>  		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 694711a..1f983dd 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -331,7 +331,8 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
>  	struct tcp_sock *tp = tcp_sk(sk);
>  
>  	tp->ecn_flags = 0;
> -	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1) {
> +	if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
> +	    tcp_ca_needs_ecn(sk)) {
>  		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
>  		tp->ecn_flags = TCP_ECN_OK;
>  	}
> @@ -953,6 +954,9 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
>  	if (likely((tcb->tcp_flags & TCPHDR_SYN) == 0))
>  		TCP_ECN_send(sk, skb, tcp_header_size);
>  
> +	if (tcp_ca_needs_ecn(sk))
> +		INET_ECN_xmit(sk);

Why do we need this every time we send a packet ?

Its normally done in TCP_ECN_send(), where it probably makes sense to
change the logic and add a comment why DCTCP sets ECT even for rtx
packets ?


> +
>  #ifdef CONFIG_TCP_MD5SIG
>  	/* Calculate the MD5 hash, as we have all we need now */
>  	if (md5) {
> @@ -2860,6 +2864,9 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
>  	th->doff = (tcp_header_size >> 2);
>  	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_OUTSEGS);
>  
> +	if (tcp_ca_needs_ecn(sk))
> +		INET_ECN_xmit(sk);

Likewise, please do not change this listener field for every SYNACK,
this is going to hurt. This probably should be done _once_.

> +
>  #ifdef CONFIG_TCP_MD5SIG
>  	/* Okay, we have all we need - do the md5 hash if needed */
>  	if (md5) {
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index 7fa6743..b1c8765 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1013,7 +1013,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
>  	ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
>  	ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
>  	if (!want_cookie || tmp_opt.tstamp_ok)
> -		TCP_ECN_create_request(req, skb, sock_net(sk));
> +		TCP_ECN_create_request(req, skb, sk);
>  
>  	ireq->ir_iif = sk->sk_bound_dev_if;
>  


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ