netdev - [RFC PATCH] tcp: limit data skbs in qdisc layer

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <1341396687.2583.1757.camel@edumazet-glaptop>
Date:	Wed, 04 Jul 2012 12:11:27 +0200
From:	Eric Dumazet <eric.dumazet@...il.com>
To:	David Miller <davem@...emloft.net>
Cc:	Yuchung Cheng <ycheng@...gle.com>, Dave Taht <dave.taht@...il.com>,
	netdev <netdev@...r.kernel.org>, codel@...ts.bufferbloat.net,
	Tom Herbert <therbert@...gle.com>,
	Matt Mathis <mattmathis@...gle.com>,
	Nandita Dukkipati <nanditad@...gle.com>,
	Neal Cardwell <ncardwell@...gle.com>,
	Andrew McGregor <andrewmcgr@...il.com>
Subject: [RFC PATCH] tcp: limit data skbs in qdisc layer

On Fri, 2012-06-29 at 06:51 +0200, Eric Dumazet wrote:

> My long term plan is to reduce number of skbs queued in Qdisc for TCP
> stack, to reduce RTT (removing the artificial RTT bias because of local
> queues)

preliminar patch to give the rough idea :

sk->sk_wmem_alloc not allowed to grow above a given limit,
allowing no more than ~4 segments [1] per tcp socket in qdisc layer at a
given time. (if TSO is enabled, then a single TSO packet hits the limit)

This means we divert sock_wfree() to a tcp_wfree() handler, to
queue/send following frames when skb_orphan() is called for the already
queued skbs.

Note : While this lowers artificial cwnd and rtt, this means more work
has to be done by tx completion handler (softirq instead of preemptable
process context)

To reduce this possible source of latency, we can skb_try_orphan(skb) in
NIC drivers ndo_start_xmit() instead of waiting TX completion, but do
this orphaning only on BQL enabled drivers, or in drivers known to
possibly delay TX completion (NIU is an example, as David had to revert
BQL because of this delaying)

results on my dev machine (tg3 nic) are really impressive, using
standard pfifo_fast, and with or without TSO/GSO

I no longer have 3MBytes backlogged in qdisc by a single netperf
session, and both side socket autotuning no longer use 4 Mbytes.

tcp_write_xmit() call is probably very naive and lacks proper tests.

Note :

[1] because sk_wmem_alloc accounts skb truesize, mss*4 test allow less
then 4 frames, but it seems ok.

 drivers/net/ethernet/broadcom/tg3.c |    1 
 include/linux/skbuff.h              |    6 +++
 include/net/sock.h                  |    1 
 net/ipv4/tcp_output.c               |   44 +++++++++++++++++++++++++-
 4 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
index e47ff8b..ce0ca96 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -7020,6 +7020,7 @@ static netdev_tx_t tg3_start_xmit(struct sk_buff *skb, struct net_device *dev)
 
 	skb_tx_timestamp(skb);
 	netdev_tx_sent_queue(txq, skb->len);
+	skb_try_orphan(skb);
 
 	/* Sync BD data before updating mailbox */
 	wmb();
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 885c9bd..c4d4d15 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1667,6 +1667,12 @@ static inline void skb_orphan(struct sk_buff *skb)
 	skb->sk		= NULL;
 }
 
+static inline void skb_try_orphan(struct sk_buff *skb)
+{
+	if (!skb_shinfo(skb)->tx_flags)
+		skb_orphan(skb);
+}
+
 /**
  *	__skb_queue_purge - empty a list
  *	@list: list to empty
diff --git a/include/net/sock.h b/include/net/sock.h
index 640432a..2ce17b1 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1389,6 +1389,7 @@ extern void release_sock(struct sock *sk);
 
 /* BH context may only use the following locking interface. */
 #define bh_lock_sock(__sk)	spin_lock(&((__sk)->sk_lock.slock))
+#define bh_trylock_sock(__sk)	spin_trylock(&((__sk)->sk_lock.slock))
 #define bh_lock_sock_nested(__sk) \
 				spin_lock_nested(&((__sk)->sk_lock.slock), \
 				SINGLE_DEPTH_NESTING)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c465d3e..4e6ef82 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -65,6 +65,8 @@ int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
 EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
 
+static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
+			   int push_one, gfp_t gfp);
 
 /* Account for new data that has been sent to the network. */
 static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb)
@@ -783,6 +785,34 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
 	return size;
 }
 
+/*
+ * Write buffer destructor automatically called from kfree_skb.
+ */
+void tcp_wfree(struct sk_buff *skb)
+{
+	struct sock *sk = skb->sk;
+	unsigned int len = skb->truesize;
+
+	atomic_sub(len - 1, &sk->sk_wmem_alloc);
+	if (bh_trylock_sock(sk)) {
+		if (!sock_owned_by_user(sk)) {
+			if ((1 << sk->sk_state) &
+			    (TCPF_CLOSE_WAIT | TCPF_ESTABLISHED))
+				tcp_write_xmit(sk,
+					       tcp_current_mss(sk),
+					       0, 0,
+					       GFP_ATOMIC);
+		} else {
+			/* TODO : might signal something here
+			 * so that user thread can call tcp_write_xmit()
+			 */
+		}
+		bh_unlock_sock(sk);
+	}
+
+	sk_free(sk);	
+}
+
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
  * transmission and possible later retransmissions.
@@ -844,7 +874,11 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 
 	skb_push(skb, tcp_header_size);
 	skb_reset_transport_header(skb);
-	skb_set_owner_w(skb, sk);
+
+	skb_orphan(skb);
+	skb->sk = sk;
+	skb->destructor = tcp_wfree;
+	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
 
 	/* Build TCP header and checksum it. */
 	th = tcp_hdr(skb);
@@ -1780,6 +1814,14 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
 	while ((skb = tcp_send_head(sk))) {
 		unsigned int limit;
 
+		/* yes, sk_wmem_alloc accounts skb truesize, so mss_now * 4
+		 * is a lazy approximation of our needs, but it seems ok
+		 */
+		if (atomic_read(&sk->sk_wmem_alloc) >= mss_now * 4) {
+			pr_debug("here we stop sending frame because sk_wmem_alloc %d\n",
+				atomic_read(&sk->sk_wmem_alloc));
+			break;
+		}
 		tso_segs = tcp_init_tso_segs(sk, skb, mss_now);
 		BUG_ON(!tso_segs);
 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html