netdev - [PATCH net-next] tcp: add tcp_reset_xmit

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20181023185416.182102-1-edumazet@google.com>
Date:   Tue, 23 Oct 2018 11:54:16 -0700
From:   Eric Dumazet <edumazet@...gle.com>
To:     "David S . Miller" <davem@...emloft.net>,
        Neal Cardwell <ncardwell@...gle.com>,
        Yuchung Cheng <ycheng@...gle.com>,
        Soheil Hassas Yeganeh <soheil@...gle.com>,
        Van Jacobson <vanj@...gle.com>
Cc:     netdev <netdev@...r.kernel.org>,
        Eric Dumazet <edumazet@...gle.com>,
        Eric Dumazet <eric.dumazet@...il.com>
Subject: [PATCH net-next] tcp: add tcp_reset_xmit_timer() helper

With EDT model, SRTT no longer is inflated by pacing delays.

This means that RTO and some other xmit timers might be setup
incorrectly. This is particularly visible with either :

- Very small enforced pacing rates (SO_MAX_PACING_RATE)
- Reduced rto (from the default 200 ms)

This can lead to TCP flows aborts in the worst case,
or spurious retransmits in other cases.

For example, this session gets far more throughput
than the requested 80kbit :

$ netperf -H 127.0.0.2 -l 100 -- -q 10000
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.2 () port 0 AF_INET
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

540000 262144 262144    104.00      2.66

With the fix :

$ netperf -H 127.0.0.2 -l 100 -- -q 10000
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 127.0.0.2 () port 0 AF_INET
Recv   Send    Send
Socket Socket  Message  Elapsed
Size   Size    Size     Time     Throughput
bytes  bytes   bytes    secs.    10^6bits/sec

540000 262144 262144    104.00      0.12

EDT allows for better control of rtx timers, since TCP has
a better idea of the earliest departure time of each skb
in the rtx queue. We only have to eventually add to the
timer the difference of the EDT time with current time.

Signed-off-by: Eric Dumazet <edumazet@...gle.com>
---
 include/net/tcp.h     | 30 +++++++++++++++++++++++++++---
 net/ipv4/tcp_input.c  |  8 ++++----
 net/ipv4/tcp_output.c | 18 ++++++++++--------
 3 files changed, 41 insertions(+), 15 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 8a61c3e8c15dfaf1095490814288c41c13a629f3..a18914d204864e3016fe8121ec7065da6696f9ef 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1245,8 +1245,31 @@ static inline bool tcp_needs_internal_pacing(const struct sock *sk)
 	return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED;
 }
 
+/* Return in jiffies the delay before one skb is sent.
+ * If @skb is NULL, we look at EDT for next packet being sent on the socket.
+ */
+static inline unsigned long tcp_pacing_delay(const struct sock *sk,
+					     const struct sk_buff *skb)
+{
+	s64 pacing_delay = skb ? skb->tstamp : tcp_sk(sk)->tcp_wstamp_ns;
+
+	pacing_delay -= tcp_sk(sk)->tcp_clock_cache;
+
+	return pacing_delay > 0 ? nsecs_to_jiffies(pacing_delay) : 0;
+}
+
+static inline void tcp_reset_xmit_timer(struct sock *sk,
+					const int what,
+					unsigned long when,
+					const unsigned long max_when,
+					const struct sk_buff *skb)
+{
+	inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk, skb),
+				  max_when);
+}
+
 /* Something is really bad, we could not queue an additional packet,
- * because qdisc is full or receiver sent a 0 window.
+ * because qdisc is full or receiver sent a 0 window, or we are paced.
  * We do not want to add fuel to the fire, or abort too early,
  * so make sure the timer we arm now is at least 200ms in the future,
  * regardless of current icsk_rto value (as it could be ~2ms)
@@ -1268,8 +1291,9 @@ static inline unsigned long tcp_probe0_when(const struct sock *sk,
 static inline void tcp_check_probe_timer(struct sock *sk)
 {
 	if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending)
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  tcp_probe0_base(sk), TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+				     tcp_probe0_base(sk), TCP_RTO_MAX,
+				     NULL);
 }
 
 static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 188980c58f873ffdc81c018dcab0996f603cd7eb..2868ef28ce52179b3c5874e749b680ffbdc0521a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2979,8 +2979,8 @@ void tcp_rearm_rto(struct sock *sk)
 			 */
 			rto = usecs_to_jiffies(max_t(int, delta_us, 1));
 		}
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
-					  TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto,
+				     TCP_RTO_MAX, tcp_rtx_queue_head(sk));
 	}
 }
 
@@ -3255,8 +3255,8 @@ static void tcp_ack_probe(struct sock *sk)
 	} else {
 		unsigned long when = tcp_probe0_when(sk, TCP_RTO_MAX);
 
-		inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-					  when, TCP_RTO_MAX);
+		tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+				     when, TCP_RTO_MAX, NULL);
 	}
 }
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index c07990a35ff3bd9438d32c82863ef207c93bdb9e..9c34b97d365d719ff76250bc9fe7fa20495a3ed2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2455,8 +2455,8 @@ bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto)
 	if (rto_delta_us > 0)
 		timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us));
 
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
-				  TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout,
+			     TCP_RTO_MAX, NULL);
 	return true;
 }
 
@@ -3020,9 +3020,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 		if (skb == rtx_head &&
 		    icsk->icsk_pending != ICSK_TIME_REO_TIMEOUT)
-			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
-						  inet_csk(sk)->icsk_rto,
-						  TCP_RTO_MAX);
+			tcp_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+					     inet_csk(sk)->icsk_rto,
+					     TCP_RTO_MAX,
+					     skb);
 	}
 }
 
@@ -3752,9 +3753,10 @@ void tcp_send_probe0(struct sock *sk)
 			icsk->icsk_probes_out = 1;
 		probe_max = TCP_RESOURCE_PROBE_INTERVAL;
 	}
-	inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
-				  tcp_probe0_when(sk, probe_max),
-				  TCP_RTO_MAX);
+	tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+			     tcp_probe0_when(sk, probe_max),
+			     TCP_RTO_MAX,
+			     NULL);
 }
 
 int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
-- 
2.19.1.568.g152ad8e336-goog