Date:   Mon, 30 Dec 2019 06:06:19 -0800
From:   Eric Dumazet <edumazet@...gle.com>
To:     "David S . Miller" <davem@...emloft.net>
Cc:     netdev <netdev@...r.kernel.org>,
        Eric Dumazet <edumazet@...gle.com>,
        Eric Dumazet <eric.dumazet@...il.com>,
        Soheil Hassas Yeganeh <soheil@...gle.com>,
        Neal Cardwell <ncardwell@...gle.com>,
        Yuchung Cheng <ycheng@...gle.com>,
        Martin KaFai Lau <kafai@...com>
Subject: [PATCH net] tcp_cubic: refactor code to perform a divide only when needed

Neal Cardwell suggested not changing ca->delay_min
and instead applying the ack delay cushion only while
the Hystart ACK train heuristic is still under
consideration. This should avoid a 64-bit divide
unless it is actually needed.
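
For reference, a minimal userspace sketch of the cushion arithmetic that
hystart_ack_delay() performs is shown below (the 64KB GSO_MAX_SIZE value
and the 10Gbit pacing rate are assumptions for the example, not values
taken from this patch):

	#include <stdint.h>
	#include <stdio.h>

	static uint64_t ack_delay_cushion_us(uint64_t pacing_rate_bytes_per_sec)
	{
		uint64_t cushion;

		if (!pacing_rate_bytes_per_sec)
			return 0;
		/* time to transmit 4 * GSO_MAX_SIZE (assumed 64KB) bytes
		 * at the pacing rate, capped at 1 ms
		 */
		cushion = 65536ULL * 4 * 1000000 / pacing_rate_bytes_per_sec;
		return cushion < 1000 ? cushion : 1000;
	}

	int main(void)
	{
		/* assumed 10Gbit pacing rate = 1.25e9 bytes/sec -> ~209 usec */
		printf("%llu usec\n",
		       (unsigned long long)ack_delay_cushion_us(1250000000ULL));
		return 0;
	}

With this patch the divide only runs from hystart_update(), i.e. while
the ACK train heuristic is still active during slow start, rather than
on every ca->delay_min update in bictcp_acked().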

Tested:

40Gbit(mlx4) testbed (with sch_fq as packet scheduler)

$ echo -n 'file tcp_cubic.c +p'  >/sys/kernel/debug/dynamic_debug/control
$ nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpaa24 -l -4000000; done;nstat|egrep "Hystart"
  14815
  16280
  15293
  15563
  11574
  15145
  14789
  18548
  16972
  12520
TcpExtTCPHystartTrainDetect     10                 0.0
TcpExtTCPHystartTrainCwnd       1396               0.0
$ dmesg | tail -10
[ 4873.951350] hystart_ack_train (116 > 93) delay_min 24 (+ ack_delay 69) cwnd 80
[ 4875.155379] hystart_ack_train (55 > 50) delay_min 21 (+ ack_delay 29) cwnd 160
[ 4876.333921] hystart_ack_train (69 > 62) delay_min 23 (+ ack_delay 39) cwnd 130
[ 4877.519037] hystart_ack_train (69 > 60) delay_min 22 (+ ack_delay 38) cwnd 130
[ 4878.701559] hystart_ack_train (87 > 63) delay_min 24 (+ ack_delay 39) cwnd 160
[ 4879.844597] hystart_ack_train (93 > 50) delay_min 21 (+ ack_delay 29) cwnd 216
[ 4880.956650] hystart_ack_train (74 > 67) delay_min 20 (+ ack_delay 47) cwnd 108
[ 4882.098500] hystart_ack_train (61 > 57) delay_min 23 (+ ack_delay 34) cwnd 130
[ 4883.262056] hystart_ack_train (72 > 67) delay_min 21 (+ ack_delay 46) cwnd 130
[ 4884.418760] hystart_ack_train (74 > 67) delay_min 29 (+ ack_delay 38) cwnd 152

10Gbit(bnx2x) testbed (with sch_fq as packet scheduler)

$ echo -n 'file tcp_cubic.c +p'  >/sys/kernel/debug/dynamic_debug/control
$ nstat -n;for f in {1..10}; do ./super_netperf 1 -H lpk52 -l -4000000; done;nstat|egrep "Hystart"
   7050
   7065
   7100
   6900
   7202
   7263
   7189
   6869
   7463
   7034
TcpExtTCPHystartTrainDetect     10                 0.0
TcpExtTCPHystartTrainCwnd       3199               0.0
$ dmesg | tail -10
[  176.920012] hystart_ack_train (161 > 141) delay_min 83 (+ ack_delay 58) cwnd 264
[  179.144645] hystart_ack_train (164 > 159) delay_min 120 (+ ack_delay 39) cwnd 444
[  181.354527] hystart_ack_train (214 > 168) delay_min 125 (+ ack_delay 43) cwnd 436
[  183.539565] hystart_ack_train (170 > 147) delay_min 96 (+ ack_delay 51) cwnd 326
[  185.727309] hystart_ack_train (177 > 160) delay_min 61 (+ ack_delay 99) cwnd 128
[  187.947142] hystart_ack_train (184 > 167) delay_min 123 (+ ack_delay 44) cwnd 367
[  190.166680] hystart_ack_train (230 > 153) delay_min 116 (+ ack_delay 37) cwnd 444
[  192.327285] hystart_ack_train (210 > 206) delay_min 86 (+ ack_delay 120) cwnd 152
[  194.511392] hystart_ack_train (173 > 151) delay_min 94 (+ ack_delay 57) cwnd 239
[  196.736023] hystart_ack_train (149 > 146) delay_min 105 (+ ack_delay 41) cwnd 399

Fixes: 42f3a8aaae66 ("tcp_cubic: tweak Hystart detection for short RTT flows")
Signed-off-by: Eric Dumazet <edumazet@...gle.com>
Reported-by: Neal Cardwell <ncardwell@...gle.com>
Link: https://www.spinics.net/lists/netdev/msg621886.html
Link: https://www.spinics.net/lists/netdev/msg621797.html
---
 net/ipv4/tcp_cubic.c | 51 ++++++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 23 deletions(-)

diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c
index d02bb283c6890e1692e714e053515c6e4981d83a..8f8eefd3a3ce116aa8fa2b7ef85c7eb503fa8da7 100644
--- a/net/ipv4/tcp_cubic.c
+++ b/net/ipv4/tcp_cubic.c
@@ -372,6 +372,26 @@ static void bictcp_state(struct sock *sk, u8 new_state)
 	}
 }
 
+/* Account for TSO/GRO delays.
+ * Otherwise short RTT flows could get too small ssthresh, since during
+ * slow start we begin with small TSO packets and ca->delay_min would
+ * not account for long aggregation delay when TSO packets get bigger.
+ * Ideally even with a very small RTT we would like to have at least one
+ * TSO packet being sent and received by GRO, and another one in qdisc layer.
+ * We apply another 100% factor because @rate is doubled at this point.
+ * We cap the cushion to 1ms.
+ */
+static u32 hystart_ack_delay(struct sock *sk)
+{
+	unsigned long rate;
+
+	rate = READ_ONCE(sk->sk_pacing_rate);
+	if (!rate)
+		return 0;
+	return min_t(u64, USEC_PER_MSEC,
+		     div64_ul((u64)GSO_MAX_SIZE * 4 * USEC_PER_SEC, rate));
+}
+
 static void hystart_update(struct sock *sk, u32 delay)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -385,7 +405,8 @@ static void hystart_update(struct sock *sk, u32 delay)
 		if ((s32)(now - ca->last_ack) <= hystart_ack_delta_us) {
 			ca->last_ack = now;
 
-			threshold = ca->delay_min;
+			threshold = ca->delay_min + hystart_ack_delay(sk);
+
 			/* Hystart ack train triggers if we get ack past
 			 * ca->delay_min/2.
 			 * Pacing might have delayed packets up to RTT/2
@@ -396,6 +417,9 @@ static void hystart_update(struct sock *sk, u32 delay)
 
 			if ((s32)(now - ca->round_start) > threshold) {
 				ca->found = 1;
+				pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n",
+					 now - ca->round_start, threshold,
+					 ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd);
 				NET_INC_STATS(sock_net(sk),
 					      LINUX_MIB_TCPHYSTARTTRAINDETECT);
 				NET_ADD_STATS(sock_net(sk),
@@ -447,30 +471,11 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample)
 		delay = 1;
 
 	/* first time call or link delay decreases */
-	if (ca->delay_min == 0 || ca->delay_min > delay) {
-		unsigned long rate = READ_ONCE(sk->sk_pacing_rate);
-
-		/* Account for TSO/GRO delays.
-		 * Otherwise short RTT flows could get too small ssthresh,
-		 * since during slow start we begin with small TSO packets
-		 * and could lower ca->delay_min too much.
-		 * Ideally even with a very small RTT we would like to have
-		 * at least one TSO packet being sent and received by GRO,
-		 * and another one in qdisc layer.
-		 * We apply another 100% factor because @rate is doubled at
-		 * this point.
-		 * We cap the cushion to 1ms.
-		 */
-		if (rate)
-			delay += min_t(u64, USEC_PER_MSEC,
-				       div64_ul((u64)GSO_MAX_SIZE *
-						4 * USEC_PER_SEC, rate));
-		if (ca->delay_min == 0 || ca->delay_min > delay)
-			ca->delay_min = delay;
-	}
+	if (ca->delay_min == 0 || ca->delay_min > delay)
+		ca->delay_min = delay;
 
 	/* hystart triggers when cwnd is larger than some threshold */
-	if (!ca->found && hystart && tcp_in_slow_start(tp) &&
+	if (!ca->found && tcp_in_slow_start(tp) && hystart &&
 	    tp->snd_cwnd >= hystart_low_window)
 		hystart_update(sk, delay);
 }
-- 
2.24.1.735.g03f4e72817-goog
