lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <11740704661086-git-send-email-ilpo.jarvinen@helsinki.fi>
Date:	Fri, 16 Mar 2007 20:41:05 +0200
From:	"Ilpo Järvinen" <ilpo.jarvinen@...sinki.fi>
To:	netdev@...r.kernel.org
Cc:	David Miller <davem@...emloft.net>,
	"Ilpo Järvinen" <ilpo.jarvinen@...sinki.fi>
Subject: [RFC PATCHv5 4/5] [TCP]: new LOST marker optimizations

1) A couple of skb states are mutually exclusive

An skb cannot be in both the S (SACKed) and L (LOST) states at
the same time. Therefore the sum of sacked_out, lost_out and the
count of skbs below the highest SACK that are neither S nor L can
be compared against fackets_out to see whether anything below the
current skb can still be marked with L. If they're equal (or
fackets_out is smaller), the next skb will have L set for sure.

2) Create a fastpath for the new LOST marker

The fastpath takes advantage of the fact that the latest ACK
very likely contains the globally highest SACK block. By
verifying that this block is larger than tp->reordering, the
whole SACK block can be skipped while counting not-to-be-marked
skbs until tp->reordering of them is encountered. Since TCP then
knows that such a block exists as the highest one, the marking
can begin right below it. If the latest ACK does not contain a
SACK block that reaches all the way to the highest_sack (e.g.,
due to reordering), a slow path is used.

Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@...sinki.fi>
---
 net/ipv4/tcp_input.c |  139 +++++++++++++++++++++++++++++++++-----------------
 1 files changed, 92 insertions(+), 47 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index f2b3f68..d34636b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -936,7 +936,8 @@ #endif
  * account for retransmits accurately.
  */
 static int
-tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una)
+tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
+			u32 prior_snd_una, u32 *mark_lost_entry_seq)
 {
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -957,7 +958,8 @@ tcp_sacktag_write_queue(struct sock *sk,
 	if (!tp->sacked_out) {
 		tp->fackets_out = 0;
 		tp->highest_sack = tp->snd_una;
-	}
+	} else
+		*mark_lost_entry_seq = tp->highest_sack;
 	prior_fackets = tp->fackets_out;
 
 	/* Check for D-SACK. */
@@ -1212,6 +1214,19 @@ tcp_sacktag_write_queue(struct sock *sk,
 				tp->retransmit_skb_hint = NULL;
 			}
 		}
+
+		/* Prepare non-reno LOST marking fast path entry point, the
+		 * last ACK must have the globally highest SACK to use
+		 * fastpath, if the highest SACK block is larger than
+		 * tp->reordering, just skip collecting reord_count from
+		 * it when marking LOSTs later.
+		 */
+		if (!before(end_seq, tp->highest_sack)) {
+			if ((end_seq - start_seq) >= tp->reordering * tp->mss_cache)
+				*mark_lost_entry_seq = start_seq;
+			else
+				*mark_lost_entry_seq = tp->highest_sack;
+		}
 	}
 
 	/* Check for lost retransmit. This superb idea is
@@ -1815,7 +1830,7 @@ static void tcp_mark_head_lost_single(st
  *       walk. Basically the entry point will be next skb after highest_sack
  *       or high_seq (if TCP did the skip).
  */
-static void tcp_update_scoreboard_fack(struct sock *sk)
+static void tcp_update_scoreboard_fack(struct sock *sk, u32 entry_seq)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int timedout_continue = 1;
@@ -1829,54 +1844,72 @@ static void tcp_update_scoreboard_fack(s
 		not_marked_skb = tcp_write_queue_next(sk, not_marked_skb);
 
 	} else {
-		unsigned int reord_count = 0;
+		unsigned int holes_seen = 0;
+		int reentry_to_highest_sack = 0;
 
-		skb = tcp_write_queue_find(sk, tp->highest_sack);
+		skb = tcp_write_queue_find(sk, entry_seq);
 		/* If this ever becomes expensive, it can be delayed */
 		not_marked_skb = tcp_write_queue_next(sk, skb);
+		if (entry_seq != tp->highest_sack) {
+			/* Not interested in "the last" SACKed one we got */
+			/* RFC: find_below could help here too */
+			skb = tcp_write_queue_prev(sk, skb);
+			/* Delay lookup because it might turn out unnecessary! */
+			reentry_to_highest_sack = 1;
+		} else {
+			unsigned int reord_count = 0;
 
-		/* Phase I: Search until TCP can mark */
-		tcp_for_write_queue_backwards_from(skb, sk) {
-			if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
-				break;
-
-			if (tcp_skb_timedout(sk, skb))
-				break;
-			else
-				timedout_continue = 0;
+			/* Phase I: Search until TCP can mark */
+			tcp_for_write_queue_backwards_from(skb, sk) {
+				if ((tp->fackets_out <= tp->sacked_out +
+							tp->lost_out +
+							holes_seen) ||
+				    (TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
+					goto backwards_walk_done;
 
-			/*
-			 * Isn't marked, thus a possible entrypoint (last skb
-			 * before LOST edge but TCP doesn't know for sure yet)
-			 */
-			if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
-				not_marked_skb = skb;
-
-			reord_count += tcp_skb_pcount(skb);
-			if (reord_count > tp->reordering) {
-				if (after(TCP_SKB_CB(skb)->seq, tp->high_seq)) {
-					/* RFC: should we have find_below? */
-					skb = tcp_write_queue_find(sk, tp->high_seq);
-					not_marked_skb = skb;
-					skb = tcp_write_queue_prev(sk, skb);
-					/* Timedout top is again uncertain? */
-					if (tcp_skb_timedout(sk, skb))
-						timedout_continue = 1;
+				if (tcp_skb_timedout(sk, skb))
+					break;
+				else {
+					timedout_continue = 0;
+					reentry_to_highest_sack = 0;
 				}
-				/* ...else:
-				 * RFC: Might have to handle skb fragmentation
-				 * here if reord_count > tp->reordering, which
-				 * can be caused by pcount > 1 in the last
-				 * skb... Original does not handle it, btw.
+
+				/*
+				 * Isn't marked, thus a possible entrypoint
+				 * (last skb before LOST edge but TCP doesn't
+				 * know for sure yet)
 				 */
-				break;
+				if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+					not_marked_skb = skb;
+
+				reord_count += tcp_skb_pcount(skb);
+				if (reord_count > tp->reordering)
+					break;
+
+				if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+					holes_seen += tcp_skb_pcount(skb);
 			}
 		}
 
+		if (!tcp_skb_timedout(sk, skb) &&
+		    after(TCP_SKB_CB(skb)->seq, tp->high_seq)) {
+			/* RFC: should we have find_below? */
+			skb = tcp_write_queue_find(sk, tp->high_seq);
+			not_marked_skb = skb;
+			skb = tcp_write_queue_prev(sk, skb);
+			/* Timedout top is again uncertain? */
+			if (tcp_skb_timedout(sk, skb))
+				timedout_continue = 1;
+		}
+		/* RFC: ...else if (!tcp_skb_timedout) do skb fragmentation? */
+
 		/* Phase II: Marker */
 		tcp_for_write_queue_backwards_from(skb, sk) {
-			if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
-				break;
+			if ((tp->fackets_out <= tp->sacked_out + tp->lost_out +
+						holes_seen) ||
+			    (TCP_SKB_CB(skb)->sacked & TCPCB_LOST))
+				goto backwards_walk_done;
+
 			if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
 				TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
 				tp->lost_out += tcp_skb_pcount(skb);
@@ -1887,6 +1920,13 @@ static void tcp_update_scoreboard_fack(s
 		/* Phase III: Nothing is still marked?, mark head then */
 		if (!tp->lost_out)
 			tcp_mark_head_lost_single(sk);
+
+backwards_walk_done:
+		if (timedout_continue && reentry_to_highest_sack) {
+			/* ...do the delayed lookup */
+			skb = tcp_write_queue_find(sk, tp->highest_sack);
+			not_marked_skb = tcp_write_queue_next(sk, skb);
+		}
 	}
 
 	/* Continue with timedout work */
@@ -1897,10 +1937,11 @@ static void tcp_update_scoreboard_fack(s
 }
 
 /* Account newly detected lost packet(s) */
-static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp,
+				  u32 sack_entry_seq)
 {
 	if (!IsReno(tp))
-		tcp_update_scoreboard_fack(sk);
+		tcp_update_scoreboard_fack(sk, sack_entry_seq);
 	else
 		tcp_mark_head_lost_single(sk);
 }
@@ -2170,7 +2211,7 @@ static void tcp_mtup_probe_success(struc
  */
 static void
 tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
-		      int prior_packets, int flag)
+		      int prior_packets, int flag, u32 mark_lost_entry_seq)
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -2198,7 +2239,7 @@ tcp_fastretrans_alert(struct sock *sk, u
 	    before(tp->snd_una, tp->high_seq) &&
 	    icsk->icsk_ca_state != TCP_CA_Open &&
 	    tp->fackets_out > tp->reordering) {
-	    	tcp_update_scoreboard_fack(sk);
+	    	tcp_update_scoreboard_fack(sk, mark_lost_entry_seq);
 		NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
 	}
 
@@ -2324,7 +2365,7 @@ tcp_fastretrans_alert(struct sock *sk, u
 	}
 
 	if (is_dupack || tcp_head_timedout(sk, tp))
-		tcp_update_scoreboard(sk, tp);
+		tcp_update_scoreboard(sk, tp, mark_lost_entry_seq);
 	tcp_cwnd_down(sk);
 	tcp_xmit_retransmit_queue(sk);
 }
@@ -2810,6 +2851,7 @@ static int tcp_ack(struct sock *sk, stru
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	u32 prior_in_flight;
+	u32 mark_lost_entry_seq = tp->snd_una;
 	s32 seq_rtt;
 	int prior_packets;
 	int frto_cwnd = 0;
@@ -2852,7 +2894,8 @@ static int tcp_ack(struct sock *sk, stru
 		flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq);
 
 		if (TCP_SKB_CB(skb)->sacked)
-			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+			flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+							&mark_lost_entry_seq);
 
 		if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb)))
 			flag |= FLAG_ECE;
@@ -2882,7 +2925,8 @@ static int tcp_ack(struct sock *sk, stru
 		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd &&
 		    tcp_may_raise_cwnd(sk, flag))
 			tcp_cong_avoid(sk, ack,  seq_rtt, prior_in_flight, 0);
-		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
+		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag,
+				      mark_lost_entry_seq);
 	} else {
 		if ((flag & FLAG_DATA_ACKED) && !frto_cwnd)
 			tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1);
@@ -2906,7 +2950,8 @@ no_queue:
 
 old_ack:
 	if (TCP_SKB_CB(skb)->sacked)
-		tcp_sacktag_write_queue(sk, skb, prior_snd_una);
+		tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+					&mark_lost_entry_seq);
 
 uninteresting_ack:
 	SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
-- 
1.4.2

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ