Message-ID: <46F2E5D9.9080704@gmail.com>
Date: Thu, 20 Sep 2007 14:27:53 -0700
From: Tom Quetchenbach <virtualphtn@...il.com>
To: netdev@...r.kernel.org
Subject: [PATCH 1/2 revised] David Miller's rbtree patches for 2.6.22.6
> Patch 1: David Miller's red-black tree code, tweaked for 2.6.22.6,
> with some bugfixes
oops...
The original patch can cause a kernel panic if tcp_sacktag_write_queue()
is called when the write queue is empty. Use this one instead.
-Tom
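(For reference: the panic was a NULL dereference. With an empty write
queue, tcp_write_queue_head() returns NULL, and the previous revision
read the fack_count base through it unconditionally in
tcp_sacktag_write_queue(). Below is a minimal user-space sketch of the
guard this revision adds; the types and helper names are simplified
stand-ins, not kernel code.)

	#include <stdio.h>
	#include <stddef.h>

	struct seg { unsigned int fack_count; };

	/* stand-in for tcp_write_queue_head(): NULL when the queue is empty */
	static struct seg *queue_head(struct seg *head) { return head; }

	static unsigned int fack_count_base(struct seg *head)
	{
		unsigned int base = 0;

		/* the fix: only dereference the head if one exists */
		if (head)
			base = head->fack_count;
		return base;
	}

	int main(void)
	{
		/* an empty queue no longer oopses; the base stays 0 */
		printf("base = %u\n", fack_count_base(queue_head(NULL)));
		return 0;
	}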
---
diff -ur linux-2.6.22.6/include/linux/skbuff.h linux-2.6.22.6-rbtree-davem-fixed/include/linux/skbuff.h
--- linux-2.6.22.6/include/linux/skbuff.h 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/linux/skbuff.h 2007-09-13 18:23:16.000000000 -0700
@@ -18,6 +18,7 @@
#include <linux/compiler.h>
#include <linux/time.h>
#include <linux/cache.h>
+#include <linux/rbtree.h>
#include <asm/atomic.h>
#include <asm/types.h>
@@ -242,6 +243,8 @@
struct sk_buff *next;
struct sk_buff *prev;
+ struct rb_node rb;
+
struct sock *sk;
ktime_t tstamp;
struct net_device *dev;
diff -ur linux-2.6.22.6/include/linux/tcp.h linux-2.6.22.6-rbtree-davem-fixed/include/linux/tcp.h
--- linux-2.6.22.6/include/linux/tcp.h 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/linux/tcp.h 2007-09-13 18:23:16.000000000 -0700
@@ -174,6 +174,7 @@
#include <linux/skbuff.h>
#include <linux/dmaengine.h>
+#include <linux/rbtree.h>
#include <net/sock.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
@@ -321,6 +322,7 @@
u32 snd_cwnd_used;
u32 snd_cwnd_stamp;
+ struct rb_root write_queue_rb;
struct sk_buff_head out_of_order_queue; /* Out of order segments go here */
u32 rcv_wnd; /* Current receiver window */
@@ -339,9 +341,7 @@
struct sk_buff *scoreboard_skb_hint;
struct sk_buff *retransmit_skb_hint;
struct sk_buff *forward_skb_hint;
- struct sk_buff *fastpath_skb_hint;
- int fastpath_cnt_hint;
int lost_cnt_hint;
int retransmit_cnt_hint;
int forward_cnt_hint;
diff -ur linux-2.6.22.6/include/net/tcp.h linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h
--- linux-2.6.22.6/include/net/tcp.h 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h 2007-09-19 17:36:07.000000000 -0700
@@ -540,6 +540,7 @@
__u32 seq; /* Starting sequence number */
__u32 end_seq; /* SEQ + FIN + SYN + datalen */
__u32 when; /* used to compute rtt's */
+ unsigned int fack_count; /* speed up SACK processing */
__u8 flags; /* TCP header flags. */
/* NOTE: These must match up to the flags byte in a
@@ -1043,12 +1044,12 @@
}
/*from STCP */
-static inline void clear_all_retrans_hints(struct tcp_sock *tp){
+static inline void clear_all_retrans_hints(struct tcp_sock *tp)
+{
tp->lost_skb_hint = NULL;
tp->scoreboard_skb_hint = NULL;
tp->retransmit_skb_hint = NULL;
tp->forward_skb_hint = NULL;
- tp->fastpath_skb_hint = NULL;
}
/* MD5 Signature */
@@ -1166,6 +1167,7 @@
while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL)
sk_stream_free_skb(sk, skb);
+ tcp_sk(sk)->write_queue_rb = RB_ROOT;
sk_stream_mem_reclaim(sk);
}
@@ -1208,7 +1210,7 @@
{
struct tcp_sock *tp = tcp_sk(sk);
- sk->sk_send_head = skb->next;
+ sk->sk_send_head = tcp_write_queue_next(sk, skb);
if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
sk->sk_send_head = NULL;
/* Don't override Nagle indefinately with F-RTO */
@@ -1227,9 +1229,67 @@
sk->sk_send_head = NULL;
}
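+/* Find the first skb on the write queue whose end_seq is after seq,
+ * i.e. the skb covering seq or the next one beyond it. Returns NULL
+ * if no such skb exists (for example, when the queue is empty). */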
+static inline struct sk_buff *tcp_write_queue_find(struct sock *sk, __u32 seq)
+{
+ struct rb_node *rb_node = tcp_sk(sk)->write_queue_rb.rb_node;
+ struct sk_buff *skb = NULL;
+
+ while (rb_node) {
+ struct sk_buff *tmp = rb_entry(rb_node, struct sk_buff, rb);
+ if (TCP_SKB_CB(tmp)->end_seq > seq) {
+ skb = tmp;
+ if (TCP_SKB_CB(tmp)->seq <= seq)
+ break;
+ rb_node = rb_node->rb_left;
+ } else
+ rb_node = rb_node->rb_right;
+ }
+ return skb;
+}
+
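+/* Insert skb into the write queue rbtree, keyed by TCP_SKB_CB(skb)->seq.
+ * Overlap with an existing node indicates a bug. */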
+static inline void tcp_rb_insert(struct sk_buff *skb, struct rb_root *root)
+{
+ struct rb_node **rb_link, *rb_parent;
+ __u32 seq = TCP_SKB_CB(skb)->seq;
+
+ rb_link = &root->rb_node;
+ rb_parent = NULL;
+ while (*rb_link != NULL) {
+ struct sk_buff *tmp = rb_entry(*rb_link, struct sk_buff, rb);
+ rb_parent = *rb_link;
+ if (TCP_SKB_CB(tmp)->end_seq > seq) {
+ BUG_ON(TCP_SKB_CB(tmp)->seq <= seq);
+ rb_link = &rb_parent->rb_left;
+ } else {
+ rb_link = &rb_parent->rb_right;
+ }
+ }
+ rb_link_node(&skb->rb, rb_parent, rb_link);
+ rb_insert_color(&skb->rb, root);
+}
+
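+/* Remove skb from the write queue rbtree. */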
+static inline void tcp_rb_unlink(struct sk_buff *skb, struct rb_root *root)
+{
+ rb_erase(&skb->rb, root);
+}
+
static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
+ struct sk_buff *tail = tcp_write_queue_tail(sk);
+ unsigned int fc = 0;
+
+ if (tail)
+ fc = TCP_SKB_CB(tail)->fack_count + tcp_skb_pcount(tail);
+ TCP_SKB_CB(skb)->fack_count = fc;
__skb_queue_tail(&sk->sk_write_queue, skb);
+ tcp_rb_insert(skb, &tcp_sk(sk)->write_queue_rb);
}
static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
@@ -1241,9 +1295,35 @@
sk->sk_send_head = skb;
}
+/* This is only used for tcp_send_synack(), so the write queue should
+ * be empty. If that stops being true, the fack_count assignment
+ * will need to be more elaborate.
+ */
static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *skb)
{
+ BUG_ON(!skb_queue_empty(&sk->sk_write_queue));
__skb_queue_head(&sk->sk_write_queue, skb);
+ TCP_SKB_CB(skb)->fack_count = 0;
+ tcp_rb_insert(skb, &tcp_sk(sk)->write_queue_rb);
+}
+
+/* An insert into the middle of the write queue causes the fack
+ * counts in subsequent packets to become invalid, fix them up.
+ */
+static inline void tcp_reset_fack_counts(struct sock *sk, struct sk_buff *first)
+{
+ struct sk_buff *prev = first->prev;
+ unsigned int fc = 0;
+
+ if (prev != (struct sk_buff *) &sk->sk_write_queue)
+ fc = TCP_SKB_CB(prev)->fack_count + tcp_skb_pcount(prev);
+
+ while (first != (struct sk_buff *)&sk->sk_write_queue) {
+ TCP_SKB_CB(first)->fack_count = fc;
+
+ fc += tcp_skb_pcount(first);
+ first = first->next;
+ }
}
/* Insert buff after skb on the write queue of sk. */
@@ -1252,19 +1332,24 @@
struct sock *sk)
{
__skb_append(skb, buff, &sk->sk_write_queue);
+ tcp_reset_fack_counts(sk, buff);
+ tcp_rb_insert(buff, &tcp_sk(sk)->write_queue_rb);
}
-/* Insert skb between prev and next on the write queue of sk. */
+/* Insert new before skb on the write queue of sk. */
static inline void tcp_insert_write_queue_before(struct sk_buff *new,
struct sk_buff *skb,
struct sock *sk)
{
__skb_insert(new, skb->prev, skb, &sk->sk_write_queue);
+ tcp_reset_fack_counts(sk, new);
+ tcp_rb_insert(new, &tcp_sk(sk)->write_queue_rb);
}
static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
__skb_unlink(skb, &sk->sk_write_queue);
+ tcp_rb_unlink(skb, &tcp_sk(sk)->write_queue_rb);
}
static inline int tcp_skb_is_last(const struct sock *sk,
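(Aside, not part of the patch: the search invariant in
tcp_write_queue_find() is easy to check in user space. In this sketch a
sorted array stands in for the rbtree, but the predicate is the same:
return the leftmost segment with end_seq > seq, or NULL.)

	#include <assert.h>
	#include <stdio.h>
	#include <stddef.h>

	struct seg { unsigned int seq, end_seq; };

	/* same invariant as tcp_write_queue_find(), binary-search form */
	static struct seg *find(struct seg *q, int n, unsigned int seq)
	{
		struct seg *hit = NULL;
		int lo = 0, hi = n - 1;

		while (lo <= hi) {
			int mid = lo + (hi - lo) / 2;

			if (q[mid].end_seq > seq) {
				hit = &q[mid];
				if (q[mid].seq <= seq)
					break;		/* seq inside this segment */
				hi = mid - 1;	/* look for an earlier match */
			} else {
				lo = mid + 1;
			}
		}
		return hit;
	}

	int main(void)
	{
		struct seg q[] = { { 0, 100 }, { 100, 200 }, { 200, 300 } };

		assert(find(q, 3, 150) == &q[1]);	/* inside second segment */
		assert(find(q, 3, 100) == &q[1]);	/* end_seq is exclusive */
		assert(find(q, 3, 300) == NULL);	/* past the tail */
		assert(find(q, 0, 0) == NULL);		/* empty queue: no oops */
		printf("lookup invariant holds\n");
		return 0;
	}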
diff -ur linux-2.6.22.6/net/ipv4/tcp_input.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c
--- linux-2.6.22.6/net/ipv4/tcp_input.c 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c 2007-09-20 14:10:26.000000000 -0700
@@ -947,14 +947,13 @@
unsigned char *ptr = (skb_transport_header(ack_skb) +
TCP_SKB_CB(ack_skb)->sacked);
struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
- struct sk_buff *cached_skb;
int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
int reord = tp->packets_out;
int prior_fackets;
u32 lost_retrans = 0;
int flag = 0;
int found_dup_sack = 0;
- int cached_fack_count;
+ int fack_count_base = 0;
int i;
int first_sack_index;
@@ -1020,7 +1019,6 @@
num_sacks = 1;
else {
int j;
- tp->fastpath_skb_hint = NULL;
/* order SACK blocks to allow in order walk of the retrans queue */
for (i = num_sacks-1; i > 0; i--) {
@@ -1045,13 +1043,8 @@
/* clear flag as used for different purpose in following code */
flag = 0;
- /* Use SACK fastpath hint if valid */
- cached_skb = tp->fastpath_skb_hint;
- cached_fack_count = tp->fastpath_cnt_hint;
- if (!cached_skb) {
- cached_skb = tcp_write_queue_head(sk);
- cached_fack_count = 0;
- }
+ if (likely(tcp_write_queue_head(sk)))
+ fack_count_base = TCP_SKB_CB(tcp_write_queue_head(sk))->fack_count;
for (i=0; i<num_sacks; i++, sp++) {
struct sk_buff *skb;
@@ -1060,8 +1053,10 @@
int fack_count;
int dup_sack = (found_dup_sack && (i == first_sack_index));
- skb = cached_skb;
- fack_count = cached_fack_count;
+ skb = tcp_write_queue_find(sk, start_seq);
+ if (!skb)
+ continue;
+ fack_count = TCP_SKB_CB(skb)->fack_count - fack_count_base;
/* Event "B" in the comment above. */
if (after(end_seq, tp->high_seq))
@@ -1074,13 +1069,6 @@
if (skb == tcp_send_head(sk))
break;
- cached_skb = skb;
- cached_fack_count = fack_count;
- if (i == first_sack_index) {
- tp->fastpath_skb_hint = skb;
- tp->fastpath_cnt_hint = fack_count;
- }
-
/* The retransmission queue is always in order, so
* we can short-circuit the walk early.
*/
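(Why subtract fack_count_base: fack_count is assigned cumulatively when
an skb is queued and is not rewound as acked skbs are freed from the
head, so the absolute values grow over the connection's lifetime.
Subtracting the current head's counter gives the fack count relative to
the present queue. A toy example with made-up numbers:)

	#include <stdio.h>

	int main(void)
	{
		/* cumulative fack_count values assigned at queue time,
		 * each skb having tcp_skb_pcount() == 2 */
		unsigned int fack[] = { 0, 2, 4, 6 };
		unsigned int base = fack[2];	/* first two skbs acked, freed */

		/* relative count the SACK walk uses for the last skb */
		printf("fack_count = %u\n", fack[3] - base);	/* prints 2 */
		return 0;
	}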
diff -ur linux-2.6.22.6/net/ipv4/tcp_ipv4.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_ipv4.c
--- linux-2.6.22.6/net/ipv4/tcp_ipv4.c 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_ipv4.c 2007-09-13 18:23:16.000000000 -0700
@@ -1845,6 +1845,7 @@
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ tp->write_queue_rb = RB_ROOT;
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);
diff -ur linux-2.6.22.6/net/ipv4/tcp_minisocks.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_minisocks.c
--- linux-2.6.22.6/net/ipv4/tcp_minisocks.c 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_minisocks.c 2007-09-13 18:23:16.000000000 -0700
@@ -421,6 +421,7 @@
tcp_set_ca_state(newsk, TCP_CA_Open);
tcp_init_xmit_timers(newsk);
+ newtp->write_queue_rb = RB_ROOT;
skb_queue_head_init(&newtp->out_of_order_queue);
newtp->write_seq = treq->snt_isn + 1;
newtp->pushed_seq = newtp->write_seq;
diff -ur linux-2.6.22.6/net/ipv4/tcp_output.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_output.c
--- linux-2.6.22.6/net/ipv4/tcp_output.c 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_output.c 2007-09-13 18:23:16.000000000 -0700
@@ -1278,11 +1278,13 @@
sk_charge_skb(sk, nskb);
skb = tcp_send_head(sk);
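+ /* Set seq/end_seq before the insert below: tcp_rb_insert() keys
+ * the rbtree on TCP_SKB_CB(nskb)->seq. */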
+ TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+ TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
tcp_insert_write_queue_before(nskb, skb, sk);
tcp_advance_send_head(sk, skb);
- TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
- TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
TCP_SKB_CB(nskb)->sacked = 0;
nskb->csum = 0;
diff -ur linux-2.6.22.6/net/ipv6/tcp_ipv6.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv6/tcp_ipv6.c
--- linux-2.6.22.6/net/ipv6/tcp_ipv6.c 2007-08-30 23:21:01.000000000 -0700
+++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv6/tcp_ipv6.c 2007-09-13 18:23:16.000000000 -0700
@@ -1900,6 +1900,7 @@
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ tp->write_queue_rb = RB_ROOT;
skb_queue_head_init(&tp->out_of_order_queue);
tcp_init_xmit_timers(sk);
tcp_prequeue_init(tp);