lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220729143935.2432743-2-marek@cloudflare.com>
Date:   Fri, 29 Jul 2022 16:39:34 +0200
From:   Marek Majkowski <marek@...udflare.com>
To:     netdev@...r.kernel.org
Cc:     bpf@...r.kernel.org, kernel-team@...udflare.com,
        ivan@...udflare.com, edumazet@...gle.com, davem@...emloft.net,
        kuba@...nel.org, pabeni@...hat.com, ast@...nel.org,
        daniel@...earbox.net, andrii@...nel.org, brakmo@...com,
        Marek Majkowski <marek@...udflare.com>
Subject: [PATCH net-next v2 1/2] RTAX_INITRWND should be able to set the rcv_ssthresh above 64KiB

There are three places where we initialize sockets:
 - tcp_output:tcp_connect_init
 - tcp_minisocks:tcp_openreq_init_rwin
 - syncookies

In the first two we already have a call to `tcp_rwnd_init_bpf` and
`dst_metric(RTAX_INITRWND)` which retrieve the bpf/path initrwnd
attribute. We use this value to bring `rcv_ssthresh` up, potentially
above the traditional 64KiB.

With higher initial `rcv_ssthresh` the receiver will open the receive
window more aggresively, which can improve large BDP flows - large
throughput and latency.

This patch does not cover the syncookies case.

Signed-off-by: Marek Majkowski <marek@...udflare.com>
---
 include/linux/tcp.h      | 1 +
 net/ipv4/tcp_minisocks.c | 9 +++++++--
 net/ipv4/tcp_output.c    | 7 +++++--
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index a9fbe22732c3..c7a8c71536f8 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -164,6 +164,7 @@ struct tcp_request_sock {
 						  * FastOpen it's the seq#
 						  * after data-in-SYN.
 						  */
+	u32				rcv_ssthresh;
 	u8				syn_tos;
 };
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index cb95d88497ae..8e5a3bd9a55b 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -355,11 +355,13 @@ void tcp_openreq_init_rwin(struct request_sock *req,
 			   const struct dst_entry *dst)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
+	struct tcp_request_sock *treq = tcp_rsk(req);
 	const struct tcp_sock *tp = tcp_sk(sk_listener);
 	int full_space = tcp_full_space(sk_listener);
 	u32 window_clamp;
 	__u8 rcv_wscale;
 	u32 rcv_wnd;
+	int adj_mss;
 	int mss;
 
 	mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -377,16 +379,19 @@ void tcp_openreq_init_rwin(struct request_sock *req,
 		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
 	else if (full_space < rcv_wnd * mss)
 		full_space = rcv_wnd * mss;
+	adj_mss = mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
 
 	/* tcp_full_space because it is guaranteed to be the first packet */
 	tcp_select_initial_window(sk_listener, full_space,
-		mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+		adj_mss,
 		&req->rsk_rcv_wnd,
 		&req->rsk_window_clamp,
 		ireq->wscale_ok,
 		&rcv_wscale,
 		rcv_wnd);
 	ireq->rcv_wscale = rcv_wscale;
+	treq->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * adj_mss);
 }
 EXPORT_SYMBOL(tcp_openreq_init_rwin);
 
@@ -502,7 +507,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
 	newtp->rx_opt.sack_ok = ireq->sack_ok;
 	newtp->window_clamp = req->rsk_window_clamp;
-	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+	newtp->rcv_ssthresh = treq->rcv_ssthresh;
 	newtp->rcv_wnd = req->rsk_rcv_wnd;
 	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
 	if (newtp->rx_opt.wscale_ok) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 78b654ff421b..56f22d5da3a7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3649,6 +3649,7 @@ static void tcp_connect_init(struct sock *sk)
 	struct tcp_sock *tp = tcp_sk(sk);
 	__u8 rcv_wscale;
 	u32 rcv_wnd;
+	u32 adj_mss;
 
 	/* We'll fix this up when we get a response from the other end.
 	 * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3686,8 +3687,10 @@ static void tcp_connect_init(struct sock *sk)
 	if (rcv_wnd == 0)
 		rcv_wnd = dst_metric(dst, RTAX_INITRWND);
 
+	adj_mss = tp->advmss - (tp->rx_opt.ts_recent_stamp ?
+			    tp->tcp_header_len - sizeof(struct tcphdr) : 0);
 	tcp_select_initial_window(sk, tcp_full_space(sk),
-				  tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+				  adj_mss,
 				  &tp->rcv_wnd,
 				  &tp->window_clamp,
 				  READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_window_scaling),
@@ -3695,7 +3698,7 @@ static void tcp_connect_init(struct sock *sk)
 				  rcv_wnd);
 
 	tp->rx_opt.rcv_wscale = rcv_wscale;
-	tp->rcv_ssthresh = tp->rcv_wnd;
+	tp->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * adj_mss);
 
 	sk->sk_err = 0;
 	sock_reset_flag(sk, SOCK_DONE);
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ