lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20191209191959.100759-1-yyd@google.com>
Date:   Mon,  9 Dec 2019 14:19:59 -0500
From:   "Kevin(Yudong) Yang" <yyd@...gle.com>
To:     David Miller <davem@...emloft.net>
Cc:     netdev@...r.kernel.org, "Kevin(Yudong) Yang" <yyd@...gle.com>,
        Eric Dumazet <edumazet@...gle.com>,
        Neal Cardwell <ncardwell@...gle.com>,
        Yuchung Cheng <ycheng@...gle.com>
Subject: [PATCH net-next] net-tcp: Disable TCP ssthresh metrics cache by default

This patch introduces a sysctl knob "net.ipv4.tcp_no_ssthresh_metrics_save"
that disables TCP ssthresh metrics cache by default. Other parts of TCP
metrics cache, e.g. rtt, cwnd, remain unchanged.

As modern networks becoming more and more dynamic, TCP metrics cache
today often causes more harm than benefits. For example, the same IP
address is often shared by different subscribers behind NAT in residential
networks. Even if the IP address is not shared by different users,
caching the slow-start threshold of a previous short flow using loss-based
congestion control (e.g. cubic) often causes the future longer flows of
the same network path to exit slow-start prematurely with abysmal
throughput.

Caching ssthresh is very risky and can lead to terrible performance.
Therefore it makes sense to make disabling ssthresh caching by
default and opt-in for specific networks by the administrators.
This practice also has worked well for several years of deployment with
CUBIC congestion control at Google.

Acked-by: Eric Dumazet <edumazet@...gle.com>
Acked-by: Neal Cardwell <ncardwell@...gle.com>
Acked-by: Yuchung Cheng <ycheng@...gle.com>
Signed-off-by: Kevin(Yudong) Yang <yyd@...gle.com>
---
 Documentation/networking/ip-sysctl.txt |  4 ++++
 include/net/netns/ipv4.h               |  1 +
 net/ipv4/sysctl_net_ipv4.c             |  9 +++++++++
 net/ipv4/tcp_ipv4.c                    |  1 +
 net/ipv4/tcp_metrics.c                 | 13 +++++++++----
 5 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 099a55bd1432d..b865b3b2f81ae 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -479,6 +479,10 @@ tcp_no_metrics_save - BOOLEAN
 	degradation.  If set, TCP will not cache metrics on closing
 	connections.
 
+tcp_no_ssthresh_metrics_save - BOOLEAN
+	Controls whether TCP saves ssthresh metrics in the route cache.
+	Default is 1, which disables ssthresh metrics.
+
 tcp_orphan_retries - INTEGER
 	This value influences the timeout of a locally closed TCP connection,
 	when RTO retransmissions remain unacknowledged.
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index c0c0791b19123..08b98414d94e5 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -154,6 +154,7 @@ struct netns_ipv4 {
 	int sysctl_tcp_adv_win_scale;
 	int sysctl_tcp_frto;
 	int sysctl_tcp_nometrics_save;
+	int sysctl_tcp_no_ssthresh_metrics_save;
 	int sysctl_tcp_moderate_rcvbuf;
 	int sysctl_tcp_tso_win_divisor;
 	int sysctl_tcp_workaround_signed_windows;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index fcb2cd167f64a..9684af02e0a59 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -1192,6 +1192,15 @@ static struct ctl_table ipv4_net_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_no_ssthresh_metrics_save",
+		.data		= &init_net.ipv4.sysctl_tcp_no_ssthresh_metrics_save,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
 	{
 		.procname	= "tcp_moderate_rcvbuf",
 		.data		= &init_net.ipv4.sysctl_tcp_moderate_rcvbuf,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 92282f98dc822..26637fce324d8 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2674,6 +2674,7 @@ static int __net_init tcp_sk_init(struct net *net)
 	net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
 	net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
 	net->ipv4.sysctl_tcp_tw_reuse = 2;
+	net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
 
 	cnt = tcp_hashinfo.ehash_mask + 1;
 	net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c4848e7a0aad1..279db8822439d 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -385,7 +385,8 @@ void tcp_update_metrics(struct sock *sk)
 
 	if (tcp_in_initial_slowstart(tp)) {
 		/* Slow start still did not finish. */
-		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+		if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
 			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
 			if (val && (tp->snd_cwnd >> 1) > val)
 				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -400,7 +401,8 @@ void tcp_update_metrics(struct sock *sk)
 	} else if (!tcp_in_slow_start(tp) &&
 		   icsk->icsk_ca_state == TCP_CA_Open) {
 		/* Cong. avoidance phase, cwnd is reliable. */
-		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
+		if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH))
 			tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
 				       max(tp->snd_cwnd >> 1, tp->snd_ssthresh));
 		if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) {
@@ -416,7 +418,8 @@ void tcp_update_metrics(struct sock *sk)
 			tcp_metric_set(tm, TCP_METRIC_CWND,
 				       (val + tp->snd_ssthresh) >> 1);
 		}
-		if (!tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
+		if (!net->ipv4.sysctl_tcp_no_ssthresh_metrics_save &&
+		    !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) {
 			val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
 			if (val && tp->snd_ssthresh > val)
 				tcp_metric_set(tm, TCP_METRIC_SSTHRESH,
@@ -441,6 +444,7 @@ void tcp_init_metrics(struct sock *sk)
 {
 	struct dst_entry *dst = __sk_dst_get(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
 	struct tcp_metrics_block *tm;
 	u32 val, crtt = 0; /* cached RTT scaled by 8 */
 
@@ -458,7 +462,8 @@ void tcp_init_metrics(struct sock *sk)
 	if (tcp_metric_locked(tm, TCP_METRIC_CWND))
 		tp->snd_cwnd_clamp = tcp_metric_get(tm, TCP_METRIC_CWND);
 
-	val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
+	val = net->ipv4.sysctl_tcp_no_ssthresh_metrics_save ?
+	      0 : tcp_metric_get(tm, TCP_METRIC_SSTHRESH);
 	if (val) {
 		tp->snd_ssthresh = val;
 		if (tp->snd_ssthresh > tp->snd_cwnd_clamp)
-- 
2.24.0.432.g9d3f5f5b63-goog

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ