netdev - Re: [PATCH] revert TCP retransmission backoff on ICMP destination unreachable

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <alpine.DEB.2.00.0908181514560.10654@wel-95.cs.helsinki.fi>
Date:	Tue, 18 Aug 2009 16:45:02 +0300 (EEST)
From:	"Ilpo Järvinen" <ilpo.jarvinen@...sinki.fi>
To:	Damian Lukowski <damian@....rwth-aachen.de>
cc:	David Miller <davem@...emloft.net>, Netdev <netdev@...r.kernel.org>
Subject: Re: [PATCH] revert TCP retransmission backoff on ICMP destination
 unreachable

On Fri, 14 Aug 2009, Damian Lukowski wrote:

>> Longer than 80 columns, and use an inline function instead
>> of a macro in order to get proper type checking.
>> [...]
>> Do not break up the function local variables with spurious new lines
>> like this, please.
>> [...]
>> The indentation and tabbing is messed up in all of the code you are
>> adding, please fix it up to be consistent with the surrounding code
>> and the rest of the TCP stack.
>>
>> Do not use C++ style // comments.
>
> Better?

Please, include the changelog message on resubmits too next time.

> Signed-off-by: Damian Lukowski <damian@....rwth-aachen.de>


> diff -Naur linux-2.6.30.4/include/net/tcp.h linux-2.6.30.4-tcp-icmp/include/net/tcp.h
> --- linux-2.6.30.4/include/net/tcp.h	2009-07-31 00:34:47.000000000 +0200
> +++ linux-2.6.30.4-tcp-icmp/include/net/tcp.h	2009-08-14 12:18:30.846060685 +0200
> @@ -1220,6 +1220,14 @@
>  #define tcp_for_write_queue_from_safe(skb, tmp, sk)			\
>  	skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
> 
> +static inline bool retrans_overstepped(const struct sock *sk,
> +					unsigned int boundary)
> +{
> +	return 	inet_csk(sk)->icsk_retransmits &&
> +		(tcp_time_stamp - tcp_sk(sk)->retrans_stamp) >=
> +			TCP_RTO_MIN*(2 << boundary);
> +}
> +
>  static inline struct sk_buff *tcp_send_head(struct sock *sk)
>  {
>  	return sk->sk_send_head;
> diff -Naur linux-2.6.30.4/net/ipv4/tcp_ipv4.c linux-2.6.30.4-tcp-icmp/net/ipv4/tcp_ipv4.c
> --- linux-2.6.30.4/net/ipv4/tcp_ipv4.c	2009-07-31 00:34:47.000000000 +0200
> +++ linux-2.6.30.4-tcp-icmp/net/ipv4/tcp_ipv4.c	2009-08-14 13:19:48.841598908 +0200
> @@ -332,11 +332,13 @@
>  {
>  	struct iphdr *iph = (struct iphdr *)skb->data;
>  	struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
> +	struct inet_connection_sock *icsk;
>  	struct tcp_sock *tp;
>  	struct inet_sock *inet;
>  	const int type = icmp_hdr(skb)->type;
>  	const int code = icmp_hdr(skb)->code;
>  	struct sock *sk;
> +	struct sk_buff *skb_r;

I'd make this called "skb", and change the old skb to icmp_skb.

>  	__u32 seq;
>  	int err;
>  	struct net *net = dev_net(skb->dev);
> @@ -367,6 +369,7 @@
>  	if (sk->sk_state == TCP_CLOSE)
>  		goto out;
> 
> +	icsk = inet_csk(sk);
>  	tp = tcp_sk(sk);
>  	seq = ntohl(th->seq);
>  	if (sk->sk_state != TCP_LISTEN &&
> @@ -393,6 +396,41 @@
>  		}
>
>  		err = icmp_err_convert[code].errno;
> +		/* check if ICMP unreachable messages allow revert of backoff */
> +		if ((code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH) ||
> +		    seq != tp->snd_una  || !icsk->icsk_retransmits ||
> +		    !icsk->icsk_backoff)

I'd recommend you break this into two if()\nbreak's, one for filtering
other icmps and other for filtering mismatching against the socket's 
state.

> +			break;
> +
> +		icsk->icsk_backoff--;
> +		icsk->icsk_rto >>= 1;

Are you absolute sure that we don't go to zero here when phase of the 
moon is just right? I'd put a max(..., 1) guard there.

> +
> +		skb_r = skb_peek(&sk->sk_write_queue);

tcp_write_queue_head(sk)

> +		BUG_ON(!skb_r);
> +
> +		if (sock_owned_by_user(sk)) {
> +			/* Deferring retransmission clocked by ICMP
> +			 * due to locked socket. */
> +			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
> +			min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL),

??? (besides having indent wrong). What makes you think HZ/2 is right 
value if icsk_rto is large?

> +			TCP_RTO_MAX);

Perhaps you lack here something to exit?

> +		}
> +
> +		if (tcp_time_stamp - TCP_SKB_CB(skb_r)->when >
> +		    inet_csk(sk)->icsk_rto) {

icsk->icsk_rto !?!

> +			/* RTO revert clocked out retransmission. */
> +			tcp_retransmit_skb(sk, skb_r);

This is plain wrong, tcp_sock counters will get messed up if 
TCPCB_SACKED_RETRANS is already set while calling tcp_retransmit_skb. As a 
sidenote, it would be probably useful to move the check + clear of that 
bit before doing the retransmission to some common place one day.

> +			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
> +						  icsk->icsk_rto, TCP_RTO_MAX);

RFC 2988, step 5.5 missing? Have you verified this patch for real?

> +		} else {
> +			/* RTO revert shortened timer. */
> +			inet_csk_reset_xmit_timer(
> +				sk, ICSK_TIME_RETRANS,
> +				icsk->icsk_rto-
> +				(tcp_time_stamp-TCP_SKB_CB(skb_r)->when),

Spacing.

> +				TCP_RTO_MAX);
> +		}
> +

How about:

 		u32 remaining;

 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 						 tcp_time_stamp - TCP_SKB_CB(skb_r)->when));
 		if (!remaining) {
 			tcp_retransmit_skb(...);
 			remaining = icsk->icsk_rto;
 		}
 		inet_csk_reset_xmit_timer(..., remaining);

But check my note about step 5.5 (see above) to get remaining set right in 
the if branch.

>  		break;
>  	case ICMP_TIME_EXCEEDED:
>  		err = EHOSTUNREACH;
> diff -Naur linux-2.6.30.4/net/ipv4/tcp_timer.c linux-2.6.30.4-tcp-icmp/net/ipv4/tcp_timer.c
> --- linux-2.6.30.4/net/ipv4/tcp_timer.c	2009-07-31 00:34:47.000000000 +0200
> +++ linux-2.6.30.4-tcp-icmp/net/ipv4/tcp_timer.c	2009-08-14 13:22:18.068666329 +0200
> @@ -143,7 +143,7 @@
>  			dst_negative_advice(&sk->sk_dst_cache);
>  		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
>  	} else {
> -		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
> +		if (retrans_overstepped(sk, sysctl_tcp_retries1)) {

??? Could you justify these changes and explain their relation to the 
draft and icmp messages? If not, please drop them and try with proper 
description and description for them in a separate patch. I fail to see 
what you're trying to achieve here. You just ended up redefining the 
sysctl meaning too I think...

>  			/* Black hole detection */
>  			tcp_mtu_probing(icsk, sk);
> 
> @@ -156,12 +156,14 @@
>
>  			retry_until = tcp_orphan_retries(sk, alive);
> 
> -			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
> +			if (tcp_out_of_resources(
> +				sk, alive ||
> +				    !retrans_overstepped(sk, retry_until)))

???

>  				return 1;
>  		}
>  	}
> 
> -	if (icsk->icsk_retransmits >= retry_until) {
> +	if (retrans_overstepped(sk, retry_until)) {

...

>  		/* Has it gone just too far? */
>  		tcp_write_err(sk);
>  		return 1;
> @@ -385,7 +387,7 @@
>  out_reset_timer:
>  	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
>  	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
> -	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
> +	if (retrans_overstepped(sk, sysctl_tcp_retries1))

...I guess none of these retrans_overstepped are related to the icmp stuff 
at all?

>  		__sk_dst_reset(sk);
>
>  out:;

-- 
  i.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html