[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Pine.LNX.4.64.0703080703210.27671@kivilampi-30.cs.helsinki.fi>
Date: Thu, 8 Mar 2007 07:05:41 +0200 (EET)
From: "Ilpo Järvinen" <ilpo.jarvinen@...sinki.fi>
To: Baruch Even <baruch@...en.org>
cc: David Miller <davem@...emloft.net>, netdev@...r.kernel.org
Subject: [RFC PATCH 2/2] [TCP]: Reworked recovery's TCPCB_LOST marking
functions
Complete rewrite for update_scoreboard and mark_head_lost. Couple
of hints became unnecessary because of this change, NewReno code
was greatly simplified. I changed !TCPCB_TAGBITS check from the
original to a !(S|L) check but it shouldn't make a difference,
and if there ever is a R only skb TCP will mark it as LOST too
that is a correct behavior (IMHO). The algorithm uses some ideas
presented by David Miller and Baruch Even.
Seqno lookups require fast lookups that are provided using
RB-tree patch (which is nowadays included into the latest
net-2.6.22).
Only compile testing after v1, I'll probably try v3 on Friday
as I should have spare time in the testnet work then.
v2-to-v3 changes:
- Replace of skipping made worries in it obsolete and now really
one skb less is traversed (was promised in v2)
v1-to-v2 changes:
- Changes non-fack SACK to something less broken; not a complete
set of changes (other functions should also be fixed)
- Fixed NewReno bugs
- More comments & added RFCs to places I'm unsure of
- Delayed reord_count increment (traverses now one skb less)
- Copied retransmit_hint clearing from the original but I
think it's superfluous
- Separated highest_sack to own patch
- Fixed off-by-one bug in skipping
Signed-off-by: Ilpo Järvinen <ilpo.jarvinen@...sinki.fi>
---
include/linux/tcp.h | 4 -
include/net/tcp.h | 11 ++-
net/ipv4/tcp_input.c | 213 +++++++++++++++++++++++++++++++++++---------------
3 files changed, 156 insertions(+), 72 deletions(-)
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 807fc96..9e706bd 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -323,15 +323,11 @@ #endif
u32 highest_sack; /* Start seq of globally highest revd SACK (valid only in slowpath) */
/* from STCP, retrans queue hinting */
- struct sk_buff* lost_skb_hint;
-
- struct sk_buff *scoreboard_skb_hint;
struct sk_buff *retransmit_skb_hint;
struct sk_buff *forward_skb_hint;
struct sk_buff *fastpath_skb_hint;
int fastpath_cnt_hint;
- int lost_cnt_hint;
int retransmit_cnt_hint;
int forward_cnt_hint;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 2599d98..73ae872 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1047,8 +1047,6 @@ static inline void tcp_mib_init(void)
/*from STCP */
static inline void clear_all_retrans_hints(struct tcp_sock *tp){
- tp->lost_skb_hint = NULL;
- tp->scoreboard_skb_hint = NULL;
tp->retransmit_skb_hint = NULL;
tp->forward_skb_hint = NULL;
tp->fastpath_skb_hint = NULL;
@@ -1194,6 +1192,11 @@ static inline struct sk_buff *tcp_write_
return skb->next;
}
+static inline struct sk_buff *tcp_write_queue_prev(struct sock *sk, struct sk_buff *skb)
+{
+ return skb->prev;
+}
+
#define tcp_for_write_queue(skb, sk) \
for (skb = (sk)->sk_write_queue.next; \
(skb != (struct sk_buff *)&(sk)->sk_write_queue); \
@@ -1203,6 +1206,10 @@ #define tcp_for_write_queue_from(skb, sk
for (; (skb != (struct sk_buff *)&(sk)->sk_write_queue);\
skb = skb->next)
+#define tcp_for_write_queue_backwards_from(skb, sk) \
+ for (; (skb != (struct sk_buff *)&(sk)->sk_write_queue);\
+ skb = skb->prev)
+
static inline struct sk_buff *tcp_send_head(struct sock *sk)
{
return sk->sk_send_head;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d738f8e..3d7bd08 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -1730,98 +1730,179 @@ static inline void tcp_reset_reno_sack(s
tp->left_out = tp->lost_out;
}
-/* Mark head of queue up as lost. */
-static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp,
- int packets, u32 high_seq)
+/* RFC: This is from the original, I doubt that this is necessary at all:
+ * clear xmit_retrans hint if seq of this skb is beyond hint. How could we
+ * retransmitted past LOST markings in the first place? I'm not fully sure
+ * about undo and end of connection cases, which can cause R without L?
+ */
+static void tcp_verify_retransmit_hint(struct tcp_sock *tp,
+ struct sk_buff *skb)
{
- struct sk_buff *skb;
- int cnt;
+ if (tp->retransmit_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq,
+ TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
+ tp->retransmit_skb_hint = skb;
+}
- BUG_TRAP(packets <= tp->packets_out);
- if (tp->lost_skb_hint) {
- skb = tp->lost_skb_hint;
- cnt = tp->lost_cnt_hint;
- } else {
- skb = tcp_write_queue_head(sk);
- cnt = 0;
- }
+/* Forward walk starting from until a not timedout skb is encountered, timeout
+ * everything on the way
+ */
+static void tcp_timedout_mark_forward(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
- /* TODO: do this better */
- /* this is not the most efficient way to do this... */
- tp->lost_skb_hint = skb;
- tp->lost_cnt_hint = cnt;
- cnt += tcp_skb_pcount(skb);
- if (cnt > packets || after(TCP_SKB_CB(skb)->end_seq, high_seq))
+ if (skb == tcp_send_head(sk) || !tcp_skb_timedout(sk, skb))
break;
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ /*
+ * Could be lost already from a previous timedout check? I
+ * think R skbs should never be encountered here.
+ */
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST)) {
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
-
- /* clear xmit_retransmit_queue hints
- * if this is beyond hint */
- if(tp->retransmit_skb_hint != NULL &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) {
-
- tp->retransmit_skb_hint = NULL;
- }
+ tcp_verify_retransmit_hint(tp, skb);
}
}
- tcp_sync_left_out(tp);
}
-/* Account newly detected lost packet(s) */
-
-static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+/* Algorithm details
+ *
+ * I) walk backwards until the first previously LOST marked skb is found (or
+ * snd_una is encountered) starting from highest_sack->seq
+ * 1) Skip skbs until reord_count > tp->reordering, then continue from
+ * min(skb_now->seq, tp->high_seq)
+ * 2a) Mark non-SACKed skbs LOST
+ * 2b) If TCP can timeout non-SACKed skb safely, mark it LOST
+ * 3) If LOST was not set for a non-SACKed skb, it's possible an
+ * entrypoint for the step II (store it as such).
+ *
+ * II) call forward walk that marks timedout skbs as LOST starting from an
+ * entrypoint provided by the backwards walk, that is, min(LOST_edge,
+ * tp->high_seq)
+ *
+ * Implements both FACK and RFC3517 SACK. Only FACK does per skb timeouts.
+ * Key difference is that FACK uses both SACK blocks and holes, while RFC3517
+ * is using only SACK blocks to mark (TODO: RFC3517 SACK requires a changes to
+ * elsewhere too).
+ */
+static void tcp_update_scoreboard_fack(struct sock *sk)
{
- if (IsFack(tp)) {
- int lost = tp->fackets_out - tp->reordering;
- if (lost <= 0)
- lost = 1;
- tcp_mark_head_lost(sk, tp, lost, tp->high_seq);
- } else {
- tcp_mark_head_lost(sk, tp, 1, tp->high_seq);
- }
-
- /* New heuristics: it is possible only after we switched
- * to restart timer each time when something is ACKed.
- * Hence, we can detect timed out packets during fast
- * retransmit without falling to slow start.
- */
- if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
- struct sk_buff *skb;
+ struct tcp_sock *tp = tcp_sk(sk);
+ int reord_count = 0;
+ int ts_linear = 0;
+ struct sk_buff *skb, *reentry_skb = NULL;
- skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
- : tcp_write_queue_head(sk);
+ skb = tcp_write_queue_find(sk, tp->highest_sack);
+ reentry_skb = tcp_write_queue_next(sk, skb);
- tcp_for_write_queue_from(skb, sk) {
- if (skb == tcp_send_head(sk))
- break;
+ tcp_for_write_queue_backwards_from(skb, sk) {
+ /*
+ * BUG-to-solve: Sadly enough, still we might not know here
+ * what timedout thingie must do something (not necessarily
+ * encountered the highest SACKED_RETRANS).
+ * RFC: there is a contradiction between the original code and
+ * DaveM's interpretation of the skb_timedout loop which would
+ * be nice to resolve first before fixing this. What should be
+ * do with the discontinuity of timestamps at SACKED_RETRANS
+ * edge when LOST is set for it, should timedout of a later
+ * skb be postponed due to it?
+ */
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+ break;
+ /*
+ * Could have S|R skb, or thus checking this here.
+ * BUG: EVER_RETRANS edge somewhere when !tp->retrans
+ * (significance is minor, requires undo or so to occur).
+ */
+ if (IsFack(tp) && !ts_linear &&
+ (!tp->retrans_out ||
+ (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS))) {
+ ts_linear = 1;
+ /* At SACKED_RETRANS edge, nothing to do past it? */
if (!tcp_skb_timedout(sk, skb))
- break;
+ reentry_skb = NULL;
+ }
- if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) {
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) {
+ if (((reord_count >= tp->reordering) ||
+ (ts_linear && tcp_skb_timedout(sk, skb)))) {
+ /*
+ * RFC: Might have to handle skb fragmentation
+ * here if the pcount > 1? Original does not
+ * handle it, btw.
+ */
TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
tp->lost_out += tcp_skb_pcount(skb);
+ tcp_verify_retransmit_hint(tp, skb);
- /* clear xmit_retrans hint */
- if (tp->retransmit_skb_hint &&
- before(TCP_SKB_CB(skb)->seq,
- TCP_SKB_CB(tp->retransmit_skb_hint)->seq))
-
- tp->retransmit_skb_hint = NULL;
+ } else if (reentry_skb != NULL) {
+ /*
+ * ...wasn't marked, thus a possible entry
+ * point (possibly last skb before
+ * TCPCB_LOST edge comes).
+ */
+ reentry_skb = skb;
}
}
- tp->scoreboard_skb_hint = skb;
+ /* For RFC3517, do increment only if SACKED_ACKED is set */
+ if (IsFack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
+ reord_count += tcp_skb_pcount(skb);
+ /*
+ * Skip to high_seq if enough SACK blocks gathered (and no
+ * timedout work has begun)
+ */
+ if (!ts_linear && (reord_count >= tp->reordering) &&
+ after(TCP_SKB_CB(skb)->seq, tp->high_seq)) {
+ /*
+ * reord_count will be out-of-sync after this but
+ * that's an non-issue because it is definately
+ * larger than tp->reordering still!
+ * Search sends us too far, the loop will fix that
+ */
+ skb = tcp_write_queue_find(sk, tp->high_seq);
+ if (reentry_skb != NULL)
+ reentry_skb = skb;
+ }
+ }
+ /*
+ * RFC: Might have to handle the lost <= 0 condition here if nothing
+ * got marked in the loop
+ */
+
+ /* Continue with timedout work */
+ if (IsFack(tp) && (reentry_skb != NULL))
+ tcp_timedout_mark_forward(sk, reentry_skb);
+
+ tcp_sync_left_out(tp);
+}
+/* Simple NewReno thing: Mark head LOST if it wasn't yet and it's below
+ * high_seq, stop. That's all.
+ */
+static void tcp_update_scoreboard_newreno(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct sk_buff *skb = tcp_write_queue_head(sk);
+
+ if (!(TCP_SKB_CB(skb)->sacked & TCPCB_LOST) &&
+ before(tp->snd_una, tp->high_seq)) {
+ TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
+ tp->lost_out += tcp_skb_pcount(skb);
tcp_sync_left_out(tp);
}
}
+/* Account newly detected lost packet(s) */
+static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
+{
+ if (!IsReno(tp))
+ tcp_update_scoreboard_fack(sk);
+ else
+ tcp_update_scoreboard_newreno(sk);
+}
+
/* CWND moderation, preventing bursts due to too big ACKs
* in dubious situations.
*/
@@ -2115,7 +2196,7 @@ tcp_fastretrans_alert(struct sock *sk, u
before(tp->snd_una, tp->high_seq) &&
icsk->icsk_ca_state != TCP_CA_Open &&
tp->fackets_out > tp->reordering) {
- tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq);
+ tcp_update_scoreboard_fack(sk);
NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
}
--
1.4.2
Powered by blists - more mailing lists