[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20240402215405.432863-1-hli@netflix.com>
Date: Tue, 2 Apr 2024 14:54:06 -0700
From: Hechao Li <hli@...flix.com>
To: Eric Dumazet <edumazet@...gle.com>,
"David S . Miller" <davem@...emloft.net>,
Jakub Kicinski <kuba@...nel.org>,
Paolo Abeni <pabeni@...hat.com>,
Soheil Hassas Yeganeh <soheil@...gle.com>
Cc: netdev@...r.kernel.org,
kernel-developers@...flix.com,
Hechao Li <hli@...flix.com>,
Tycho Andersen <tycho@...ho.pizza>
Subject: [PATCH net-next] tcp: update window_clamp together with scaling_ratio
After commit dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"),
we noticed an application-level timeout due to reduced throughput. This
can be reproduced by the following minimal client and server program.
server:
int main(int argc, char *argv[]) {
int sockfd;
char buffer[256];
struct sockaddr_in srv_addr;
// Create socket
sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
if (sockfd < 0) {
perror("server: socket()");
return -1;
}
bzero((char *) &srv_addr, sizeof(srv_addr));
srv_addr.sin_family = AF_INET;
srv_addr.sin_addr.s_addr = htonl(INADDR_ANY);
srv_addr.sin_port = htons(8080);
// Bind socket
if (bind(sockfd, (struct sockaddr *) &srv_addr,
sizeof(srv_addr)) < 0) {
perror("server: bind()");
close(sockfd);
return -1;
}
// Listen for connections
listen(sockfd,5);
while(1) {
int filefd = -1, newsockfd = -1;
struct sockaddr_in cli_addr;
socklen_t cli_len = sizeof(cli_addr);
// Accept connection
newsockfd = accept(sockfd, (struct sockaddr *)&cli_addr, &cli_len);
if (newsockfd < 0) {
perror("server: accept()");
goto end;
}
// Read filename from client
bzero(buffer, sizeof(buffer));
ssize_t n = read(newsockfd,buffer,sizeof(buffer)-1);
if (n < 0) {
perror("server: read()");
goto end;
}
// Open file
filefd = open(buffer, O_RDONLY);
if (filefd < 0) {
perror("server: read()");
goto end;
}
// Get file size
struct stat file_stat;
if(fstat(filefd, &file_stat) < 0) {
perror("server: fstat()");
goto end;
}
// Send file
off_t offset = 0;
ssize_t bytes_sent = 0, bytes_left = file_stat.st_size;
while ((bytes_sent = sendfile(newsockfd, filefd,
&offset, bytes_left)) > 0) {
bytes_left -= bytes_sent;
}
end:
// Close file and client socket
if (filefd > 0) {
close(filefd);
}
if (newsockfd > 0) {
close(newsockfd);
}
}
close(sockfd);
return 0;
}
client:
int main(int argc, char *argv[]) {
int sockfd, filefd;
char *server_addr = argv[1];
char *filename = argv[2];
struct sockaddr_in sockaddr;
char buffer[256];
ssize_t n;
if ((sockfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == -1) {
perror("client: socket()");
return -1;
}
sockaddr.sin_family = AF_INET;
inet_pton(AF_INET, server_addr, &sockaddr.sin_addr);
sockaddr.sin_port = htons(8080);
int val = 65536;
if (setsockopt(sockfd, SOL_SOCKET, SO_RCVBUF,
&val, sizeof(val)) < 0) {
perror("client: setockopt(SO_RCVBUF)");
return -1;
}
if (connect(sockfd, (struct sockaddr*)&sockaddr,
sizeof(sockaddr)) == -1) {
close(sockfd);
perror("client: connect()");
return -1;
}
// Send filename to server
n = write(sockfd, filename, strlen(filename));
if (n < 0) {
perror("client: write()");
return -1;
}
// Open file
filefd = open(filename, O_WRONLY | O_CREAT, 0666);
if(filefd < 0) {
perror("client: open()");
return -1;
}
// Read file from server
while((n = read(sockfd, buffer, sizeof(buffer))) > 0) {
write(filefd, buffer, n);
}
// Close file and socket
close(filefd);
close(sockfd);
return 0;
}
Before the commit, it takes around 22 seconds to transfer 10M data.
After the commit, it takes 40 seconds. Because our application has a
30-second timeout, this regression broke the application.
The reason that it takes longer to transfer data is that
tp->scaling_ratio is initialized to a value that results in ~0.25 of
rcvbuf. In our case, SO_RCVBUF is set to 65536 by the application, which
translates to 2 * 65536 = 131,072 bytes in rcvbuf and hence a ~28k
initial receive window.
Later, even though the scaling_ratio is updated to a more accurate
skb->len/skb->truesize, which is ~0.66 in our environment, the window
stays at ~0.25 * rcvbuf. This is because tp->window_clamp does not
change together with the tp->scaling_ratio update. As a result, the
window size is capped at the initial window_clamp, which is also ~0.25 *
rcvbuf, and never grows bigger.
This patch updates window_clamp along with scaling_ratio. It changes the
calculation of the initial rcv_wscale as well to make sure the scale
factor is also not capped by the initial window_clamp.
A comment from Tycho Andersen <tycho@...ho.pizza> is "What happens if
someone has done setsockopt(sk, TCP_WINDOW_CLAMP) explicitly; will this
and the above not violate userspace's desire to clamp the window size?".
This comment is not addressed in this patch because the existing code
also updates window_clamp at several places without checking if
TCP_WINDOW_CLAMP is set by user space. Adding this check now may break
certain user space assumption (similar to how the original patch broke
the assumption of buffer overhead being 50%). For example, if a user
space program sets TCP_WINDOW_CLAMP but the applicaiton behavior relies
on window_clamp adjusted by the kernel as of today.
Fixes: dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale")
Signed-off-by: Hechao Li <hli@...flix.com>
Reviewed-by: Tycho Andersen <tycho@...ho.pizza>
---
net/ipv4/tcp_input.c | 6 +++++-
net/ipv4/tcp_output.c | 2 +-
2 files changed, 6 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 5d874817a78d..a0cfa2b910d5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -237,9 +237,13 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb)
*/
if (unlikely(len != icsk->icsk_ack.rcv_mss)) {
u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE;
+ struct tcp_sock *tp = tcp_sk(sk);
do_div(val, skb->truesize);
- tcp_sk(sk)->scaling_ratio = val ? val : 1;
+ tp->scaling_ratio = val ? val : 1;
+
+ /* Make the window_clamp follow along. */
+ tp->window_clamp = tcp_full_space(sk);
}
icsk->icsk_ack.rcv_mss = min_t(unsigned int, len,
tcp_sk(sk)->advmss);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index e3167ad96567..2341e3f9db58 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -239,7 +239,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
/* Set window scaling on max possible window */
space = max_t(u32, space, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
space = max_t(u32, space, READ_ONCE(sysctl_rmem_max));
- space = min_t(u32, space, *window_clamp);
+ space = min_t(u32, space, sk->sk_rcvbuf);
*rcv_wscale = clamp_t(int, ilog2(space) - 15,
0, TCP_MAX_WSCALE);
}
--
2.34.1
Powered by blists - more mailing lists