[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220721151041.1215017-2-marek@cloudflare.com>
Date: Thu, 21 Jul 2022 17:10:40 +0200
From: Marek Majkowski <marek@...udflare.com>
To: netdev@...r.kernel.org
Cc: bpf@...r.kernel.org, kernel-team@...udflare.com,
ivan@...udflare.com, edumazet@...gle.com, davem@...emloft.net,
kuba@...nel.org, pabeni@...hat.com, ast@...nel.org,
daniel@...earbox.net, andrii@...nel.org,
Marek Majkowski <marek@...udflare.com>
Subject: [PATCH net-next 1/2] RTAX_INITRWND should be able to bring the rcv_ssthresh above 64KiB
We already support the RTAX_INITRWND / initrwnd path attribute:
$ ip route change local 127.0.0.0/8 dev lo initrwnd 1024
Normally, however, the initial advertised receive window is limited to
64KiB by rcv_ssthresh, regardless of initrwnd. This patch changes
that by bumping rcv_ssthresh up to a value derived from initrwnd. This
allows for larger initial advertised receive windows, which is useful
for specific types of TCP flows: big BDP ones, where there is a lot of
data to send immediately after the flow is established.
There are three places where we initialize sockets:
- tcp_output:tcp_connect_init
- tcp_minisocks:tcp_openreq_init_rwin
- syncookies
In the first two we already have calls to `tcp_rwnd_init_bpf` and
`dst_metric(RTAX_INITRWND)`, which retrieve the bpf/path initrwnd
attribute. We use this value to bring `rcv_ssthresh` up, potentially
above the traditional 64KiB.
With a higher initial `rcv_ssthresh` the receiver will open the receive
window more aggressively, which can improve both throughput and latency
for large BDP flows.
This patch does not cover the syncookies case.
Signed-off-by: Marek Majkowski <marek@...udflare.com>
---
include/net/inet_sock.h | 1 +
net/ipv4/tcp_minisocks.c | 8 ++++++--
net/ipv4/tcp_output.c | 10 ++++++++--
3 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index daead5fb389a..bc68c9b70942 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -89,6 +89,7 @@ struct inet_request_sock {
no_srccheck: 1,
smc_ok : 1;
u32 ir_mark;
+ u32 rcv_ssthresh;
union {
struct ip_options_rcu __rcu *ireq_opt;
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 6854bb1fb32b..89ba2a30a012 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -360,6 +360,7 @@ void tcp_openreq_init_rwin(struct request_sock *req,
u32 window_clamp;
__u8 rcv_wscale;
u32 rcv_wnd;
+ int adj_mss;
int mss;
mss = tcp_mss_clamp(tp, dst_metric_advmss(dst));
@@ -378,15 +379,18 @@ void tcp_openreq_init_rwin(struct request_sock *req,
else if (full_space < rcv_wnd * mss)
full_space = rcv_wnd * mss;
+ adj_mss = mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0);
+
/* tcp_full_space because it is guaranteed to be the first packet */
tcp_select_initial_window(sk_listener, full_space,
- mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
+ adj_mss,
&req->rsk_rcv_wnd,
&req->rsk_window_clamp,
ireq->wscale_ok,
&rcv_wscale,
rcv_wnd);
ireq->rcv_wscale = rcv_wscale;
+ ireq->rcv_ssthresh = max(req->rsk_rcv_wnd, rcv_wnd * adj_mss);
}
EXPORT_SYMBOL(tcp_openreq_init_rwin);
@@ -502,7 +506,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
newtp->rx_opt.sack_ok = ireq->sack_ok;
newtp->window_clamp = req->rsk_window_clamp;
- newtp->rcv_ssthresh = req->rsk_rcv_wnd;
+ newtp->rcv_ssthresh = ireq->rcv_ssthresh;
newtp->rcv_wnd = req->rsk_rcv_wnd;
newtp->rx_opt.wscale_ok = ireq->wscale_ok;
if (newtp->rx_opt.wscale_ok) {
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 18c913a2347a..0f2d4174ea59 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3642,6 +3642,7 @@ static void tcp_connect_init(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
u32 rcv_wnd;
+ u32 mss;
/* We'll fix this up when we get a response from the other end.
* See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT.
@@ -3679,8 +3680,10 @@ static void tcp_connect_init(struct sock *sk)
if (rcv_wnd == 0)
rcv_wnd = dst_metric(dst, RTAX_INITRWND);
+ mss = tp->advmss - (tp->rx_opt.ts_recent_stamp ?
+ tp->tcp_header_len - sizeof(struct tcphdr) : 0);
tcp_select_initial_window(sk, tcp_full_space(sk),
- tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0),
+ mss,
&tp->rcv_wnd,
&tp->window_clamp,
sock_net(sk)->ipv4.sysctl_tcp_window_scaling,
@@ -3688,7 +3691,10 @@ static void tcp_connect_init(struct sock *sk)
rcv_wnd);
tp->rx_opt.rcv_wscale = rcv_wscale;
- tp->rcv_ssthresh = tp->rcv_wnd;
+ if (rcv_wnd)
+ tp->rcv_ssthresh = max(tp->rcv_wnd, rcv_wnd * mss);
+ else
+ tp->rcv_ssthresh = tp->rcv_wnd;
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
--
2.25.1
Powered by blists - more mailing lists