lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1282630819-23104-1-git-send-email-hkchu@google.com>
Date:	Mon, 23 Aug 2010 23:20:19 -0700
From:	"H.K. Jerry Chu" <hkchu@...gle.com>
To:	ilpo.jarvinen@...sinki.fi, davem@...emloft.net
Cc:	netdev@...r.kernel.org, Jerry Chu <hkchu@...gle.com>
Subject: [PATCH] TCP_FAILFAST: a new socket option to timeout/abort a connection quicker

From: Jerry Chu <hkchu@...gle.com>

This is a TCP-level socket option that takes an unsigned int specifying
how long, in ms, TCP should keep resending a lost data packet before giving
up and returning ETIMEDOUT. The normal TCP retry/abort timeout limit still
applies. In other words, this option is only meant for those applications
that need to "fail faster" than the default TCP timeout. The latter
may take up to 20 minutes in a normal WAN environment.

The option is disabled when set to 0, which is the default. It also does
not apply during the connection establishment phase.

Signed-off-by: H.K. Jerry Chu <hkchu@...gle.com>
---
 include/linux/tcp.h                |    1 +
 include/net/inet_connection_sock.h |    1 +
 net/ipv4/tcp.c                     |   11 ++++++++-
 net/ipv4/tcp_timer.c               |   42 +++++++++++++++++++++++++++++++----
 4 files changed, 49 insertions(+), 6 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a778ee0..60b7244 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -105,6 +105,7 @@ enum {
 #define TCP_COOKIE_TRANSACTIONS	15	/* TCP Cookie Transactions */
 #define TCP_THIN_LINEAR_TIMEOUTS 16      /* Use linear timeouts for thin streams*/
 #define TCP_THIN_DUPACK         17      /* Fast retrans. after 1 dupack */
+#define TCP_FAILFAST		18	/* Abort connection in loss retry sooner*/
 
 /* for TCP_INFO socket option */
 #define TCPI_OPT_TIMESTAMPS	1
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index b6d3b55..6553921 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -125,6 +125,7 @@ struct inet_connection_sock {
 		int		  probe_size;
 	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
+	u32			  icsk_max_timeout;
 #define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 176e11a..ddb548a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2391,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		err = tp->af_specific->md5_parse(sk, optval, optlen);
 		break;
 #endif
-
+	case TCP_FAILFAST:
+		/* Cap the max timeout in ms TCP will retry/retrans
+		 * before giving up and aborting (ETIMEDOUT) a connection.
+		 */
+		icsk->icsk_max_timeout = msecs_to_jiffies(val);
+		break;
 	default:
 		err = -ENOPROTOOPT;
 		break;
@@ -2610,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 	case TCP_THIN_DUPACK:
 		val = tp->thin_dupack;
 		break;
+
+	case TCP_FAILFAST:
+		val = jiffies_to_msecs(icsk->icsk_max_timeout);
+		break;
 	default:
 		return -ENOPROTOOPT;
 	}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 808bb92..95c2548 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -138,7 +138,8 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk)
  * retransmissions with an initial RTO of TCP_RTO_MIN.
  */
 static bool retransmits_timed_out(struct sock *sk,
-				  unsigned int boundary)
+				  unsigned int boundary,
+				  unsigned int max_timeout)
 {
 	unsigned int timeout, linear_backoff_thresh;
 	unsigned int start_ts;
@@ -159,6 +160,9 @@ static bool retransmits_timed_out(struct sock *sk,
 		timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN +
 			  (boundary - linear_backoff_thresh) * TCP_RTO_MAX;
 
+	if (max_timeout != 0 && timeout > max_timeout)
+		timeout = max_timeout;
+
 	return (tcp_time_stamp - start_ts) >= timeout;
 }
 
@@ -174,7 +178,7 @@ static int tcp_write_timeout(struct sock *sk)
 			dst_negative_advice(sk);
 		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
 	} else {
-		if (retransmits_timed_out(sk, sysctl_tcp_retries1)) {
+		if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) {
 			/* Black hole detection */
 			tcp_mtu_probing(icsk, sk);
 
@@ -187,14 +191,16 @@ static int tcp_write_timeout(struct sock *sk)
 
 			retry_until = tcp_orphan_retries(sk, alive);
 			do_reset = alive ||
-				   !retransmits_timed_out(sk, retry_until);
+				   !retransmits_timed_out(sk, retry_until, 0);
 
 			if (tcp_out_of_resources(sk, do_reset))
 				return 1;
 		}
 	}
 
-	if (retransmits_timed_out(sk, retry_until)) {
+	if (retransmits_timed_out(sk, retry_until,
+	    (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) ? 0 :
+	    icsk->icsk_max_timeout)) {
 		/* Has it gone just too far? */
 		tcp_write_err(sk);
 		return 1;
@@ -434,9 +440,35 @@ out_reset_timer:
 	} else {
 		/* Use normal (exponential) backoff */
 		icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
+		if (icsk->icsk_max_timeout &&
+		    ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) == 0) {
+			int ts;
+			unsigned int base_rto =
+			    min(__tcp_set_rto(tp), TCP_RTO_MAX);
+
+			if (unlikely(!tp->retrans_stamp))
+				ts = (int)TCP_SKB_CB(tcp_write_queue_head(sk))->when;
+			else
+				ts = (int)tp->retrans_stamp;
+			ts = icsk->icsk_max_timeout - (tcp_time_stamp - ts) -
+				base_rto-1;
+			/*
+			 * Adjust rto so that the total timeout is not far off
+			 * the max_timeout range. Also if the total # of
+			 * retries would be less than 6, allow one more shot.
+			 */
+			if (icsk->icsk_rto > ts && icsk->icsk_retransmits < 6)
+				icsk->icsk_rto >>= 1;
+			if ((int)(icsk->icsk_rto) > ts) {
+				if (ts < (int)base_rto)
+					icsk->icsk_rto = base_rto;
+				else
+					icsk->icsk_rto = ts;
+			}
+		}
 	}
 	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
-	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1))
+	if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0))
 		__sk_dst_reset(sk);
 
 out:;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ