Message-Id: <1376940568-16512-3-git-send-email-christoph.paasch@uclouvain.be>
Date: Mon, 19 Aug 2013 21:29:28 +0200
From: Christoph Paasch <christoph.paasch@...ouvain.be>
To: Eric Dumazet <eric.dumazet@...il.com>
Cc: netdev@...r.kernel.org, Phil Oester <kernel@...uxace.com>,
Corey Hickey <bugfood-ml@...ooh.org>,
Benjamin Hesmans <benjamin.hesmans@...ouvain.be>
Subject: [RFC 2/2] Account acked_out when the SACK block is invalid

In case of a sequence-number rewriting firewall (e.g., a Cisco firewall:
https://supportforums.cisco.com/docs/DOC-12668#TCP_Sequence_Number_Randomization_and_SACK)
the SACK blocks are not rewritten.
When we receive these ACK packets, the SACK block is marked as invalid in
tcp_is_sackblock_valid() and sacked_out is not incremented.
Thus, fast retransmit will not trigger and the stack times out after an
RTO. This completely kills the performance across such firewalls.
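
For illustration only (a sketch, not part of the patch; before()/after()
are the sequence-number comparators from <net/tcp.h>), the first sanity
check of tcp_is_sackblock_valid() is roughly:

	/* A block beyond snd_nxt, or a reversed block, is invalid.
	 * Behind a sequence-number rewriting firewall, the un-rewritten
	 * end_seq is effectively random relative to the rewritten
	 * sequence space, so this check fails and sacked_out is never
	 * incremented.
	 */
	static bool sackblock_in_window(const struct tcp_sock *tp,
					u32 start_seq, u32 end_seq)
	{
		return !after(end_seq, tp->snd_nxt) &&
		       before(start_seq, end_seq);
	}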

This patch allows acked_out to increase when a SACK block is marked as
invalid. Further, it takes acked_out into account in the loss-recovery
accounting even when SACK has been enabled on the connection.
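
The intended effect, roughly (a sketch assuming that tcp_add_reno_sack()
in this series increments tp->acked_out rather than tp->sacked_out, and
that the dupack threshold tp->reordering is 3):

	dupack #1 with invalid SACK  ->  tp->acked_out == 1
	dupack #2 with invalid SACK  ->  tp->acked_out == 2
	dupack #3 with invalid SACK  ->  tp->acked_out == 3

The loss-recovery heuristics then count these like classic NewReno
duplicate ACKs via tcp_acked_out(), so fast retransmit fires instead of
waiting for the RTO. The patch also disables FACK at that point, since
the SACK information can no longer be trusted.
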
Tested-by: Benjamin Hesmans <benjamin.hesmans@...ouvain.be>
Signed-off-by: Christoph Paasch <christoph.paasch@...ouvain.be>
---
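Note for reviewers: tcp_acked_out() and the tp->acked_out field come from
patch 1/2 of this series and are not shown here. A minimal sketch of the
assumed helper:

	/* Sketch only -- the real helper is introduced in patch 1/2.
	 * Assumption: acked_out counts duplicate ACKs whose SACK blocks
	 * could not be validated, so the total number of ACKed-but-not-
	 * cumulatively-ACKed segments is the sum of both counters.
	 */
	static inline unsigned int tcp_acked_out(const struct tcp_sock *tp)
	{
		return tp->sacked_out + tp->acked_out;
	}
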
net/ipv4/tcp_input.c | 59 ++++++++++++++++++++++++++++++++++++++-------------
net/ipv4/tcp_output.c | 4 ++--
2 files changed, 46 insertions(+), 17 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index af764d0..ea99017 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -75,6 +75,8 @@
#include <asm/unaligned.h>
#include <net/netdma.h>
+static void tcp_add_reno_sack(struct sock *sk);
+
int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
@@ -1542,7 +1544,7 @@ static int tcp_sack_cache_ok(const struct tcp_sock *tp, const struct tcp_sack_bl
static int
tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
- u32 prior_snd_una, s32 *sack_rtt)
+ u32 prior_snd_una, s32 *sack_rtt, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
const unsigned char *ptr = (skb_transport_header(ack_skb) +
@@ -1554,7 +1556,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
struct sk_buff *skb;
int num_sacks = min(TCP_NUM_SACKS, (ptr[1] - TCPOLEN_SACK_BASE) >> 3);
int used_sacks;
- bool found_dup_sack = false;
+ bool found_dup_sack = false, first = true;
int i, j;
int first_sack_index;
@@ -1562,7 +1564,7 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
state.reord = tp->packets_out;
state.rtt = -1;
- if (!tp->sacked_out) {
+ if (!tp->sacked_out && !tp->acked_out) {
if (WARN_ON(tp->fackets_out))
tp->fackets_out = 0;
tcp_highest_sack_reset(sk);
@@ -1612,6 +1614,29 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb,
NET_INC_STATS_BH(sock_net(sk), mib_idx);
if (i == 0)
first_sack_index = -1;
+
+ /* 1. Only account the dup-ack once.
+ *
+ * 2. If we are coming from tcp_ack with an old_ack, we
+ * should not increase acked_out, as the ack is old :)
+ *
+ * 3. Only increase acked_out if this is really a
+ * duplicate ack. The check is similar to the one in
+ * tcp_ack when setting is_dupack.
+ *
+ * 4. We have to reproduce the first check of
+ * tcp_is_sackblock_valid(), because we don't want to
+ * account dup-acks if the SACK block is corrupted.
+ */
+ if (first &&
+ !before(TCP_SKB_CB(ack_skb)->ack_seq, prior_snd_una) &&
+ !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)) &&
+ before(sp[used_sacks].start_seq, sp[used_sacks].end_seq)) {
+ first = false;
+ /* Disable FACK, as we can't trust SACKs anymore */
+ tcp_disable_fack(tp);
+ tcp_add_reno_sack(sk);
+ }
continue;
}
@@ -1740,6 +1765,7 @@ out:
#if FASTRETRANS_DEBUG > 0
WARN_ON((int)tp->sacked_out < 0);
+ WARN_ON((int)tp->acked_out < 0);
WARN_ON((int)tp->lost_out < 0);
WARN_ON((int)tp->retrans_out < 0);
WARN_ON((int)tcp_packets_in_flight(tp) < 0);
@@ -1852,7 +1878,7 @@ void tcp_enter_loss(struct sock *sk, int how)
tcp_clear_retrans_partial(tp);
- if (tcp_is_reno(tp))
+ if (tcp_is_reno(tp) || tp->acked_out)
tcp_reset_reno_sack(tp);
tp->undo_marker = tp->snd_una;
@@ -2189,7 +2215,7 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
lost = 1;
tcp_mark_head_lost(sk, lost, 0);
} else {
- int sacked_upto = tp->sacked_out - tp->reordering;
+ int sacked_upto = tcp_acked_out(tp) - tp->reordering;
if (sacked_upto >= 0)
tcp_mark_head_lost(sk, sacked_upto, 0);
else if (fast_rexmit)
@@ -2551,7 +2577,7 @@ void tcp_simple_retransmit(struct sock *sk)
if (prior_lost == tp->lost_out)
return;
- if (tcp_is_reno(tp))
+ if (tcp_is_reno(tp) || tp->acked_out)
tcp_limit_reno_sacked(tp);
tcp_verify_left_out(tp);
@@ -2634,11 +2660,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
}
if (flag & FLAG_DATA_ACKED)
icsk->icsk_retransmits = 0;
- if (tcp_is_reno(tp)) {
+ if (tcp_is_reno(tp) || tp->acked_out) {
/* A Reno DUPACK means new data in F-RTO step 2.b above are
* delivered. Lower inflight to clock out (re)tranmissions.
*/
- if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
+ if (tcp_is_reno(tp) && after(tp->snd_nxt, tp->high_seq) && is_dupack)
tcp_add_reno_sack(sk);
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
@@ -2739,7 +2765,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
break;
case TCP_CA_Recovery:
- if (tcp_is_reno(tp))
+ if (tcp_is_reno(tp) || tp->acked_out)
tcp_reset_reno_sack(tp);
if (tcp_try_undo_recovery(sk))
return;
@@ -2772,10 +2798,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
return;
/* Fall through to processing in Open state. */
default:
- if (tcp_is_reno(tp)) {
+ if (tcp_is_reno(tp) || tp->acked_out) {
if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
- if (is_dupack)
+ if (is_dupack && tcp_is_reno(tp))
tcp_add_reno_sack(sk);
}
@@ -2952,7 +2978,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
int flag = 0;
u32 pkts_acked = 0;
u32 reord = tp->packets_out;
- u32 prior_sacked = tp->sacked_out;
+ u32 prior_sacked = tcp_acked_out(tp);
s32 seq_rtt = -1;
s32 ca_seq_rtt = -1;
ktime_t last_ackt = net_invalid_timestamp();
@@ -3050,12 +3076,15 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
} else {
int delta;
+ if (tp->acked_out)
+ tcp_remove_reno_sacks(sk, pkts_acked);
+
/* Non-retransmitted hole got filled? That's reordering */
if (reord < prior_fackets)
tcp_update_reordering(sk, tp->fackets_out - reord, 0);
delta = tcp_is_fack(tp) ? pkts_acked :
- prior_sacked - tp->sacked_out;
+ prior_sacked - tcp_acked_out(tp);
tp->lost_cnt_hint -= min(tp->lost_cnt_hint, delta);
}
@@ -3340,7 +3369,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (TCP_SKB_CB(skb)->sacked)
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt);
+ &sack_rtt, flag);
if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
flag |= FLAG_ECE;
@@ -3413,7 +3442,7 @@ old_ack:
*/
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
- &sack_rtt);
+ &sack_rtt, flag);
tcp_fastretrans_alert(sk, acked, prior_unsacked,
is_dupack, flag);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 17ebbff..2ecb943 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1008,7 +1008,7 @@ static void tcp_adjust_fackets_out(struct sock *sk, const struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (!tp->sacked_out || tcp_is_reno(tp))
+ if ((!tp->sacked_out && !tp->acked_out) || tcp_is_reno(tp))
return;
if (after(tcp_highest_sack_seq(tp), TCP_SKB_CB(skb)->seq))
@@ -1032,7 +1032,7 @@ static void tcp_adjust_pcount(struct sock *sk, const struct sk_buff *skb, int de
tp->lost_out -= decr;
/* Reno case is special. Sigh... */
- if (tcp_is_reno(tp) && decr > 0)
+ if ((tcp_is_reno(tp) || tp->acked_out) && decr > 0)
tp->acked_out -= min_t(u32, tp->acked_out, decr);
tcp_adjust_fackets_out(sk, skb, decr);
--
1.8.1.2
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists