[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1686327959-13478-1-git-send-email-haiyangz@microsoft.com>
Date: Fri, 9 Jun 2023 09:25:59 -0700
From: Haiyang Zhang <haiyangz@...rosoft.com>
To: linux-hyperv@...r.kernel.org, netdev@...r.kernel.org
Cc: haiyangz@...rosoft.com, kys@...rosoft.com, olaf@...fle.de,
vkuznets@...hat.com, davem@...emloft.net, weiwan@...gle.com,
tim.gardner@...onical.com, corbet@....net, edumazet@...gle.com,
kuba@...nel.org, pabeni@...hat.com, dsahern@...nel.org,
atenart@...nel.org, bagasdotme@...il.com, ykaliuta@...hat.com,
kuniyu@...zon.com, stephen@...workplumber.org,
simon.horman@...igine.com, maheshb@...gle.com,
liushixin2@...wei.com, linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org
Subject: [PATCH net-next] tcp: Make pingpong threshold tunable
TCP pingpong threshold is 1 by default, but some applications, such as SQL
databases, may prefer a higher pingpong threshold to activate delayed ACKs in
quick ack mode for better performance.
The pingpong threshold and related code were changed to 3 in the year
2019, and reverted to 1 in the year 2022. There is no single value that
fits all applications.
Add net.core.tcp_pingpong_thresh sysctl tunable, so it can be tuned for
optimal performance based on the application needs.
Signed-off-by: Haiyang Zhang <haiyangz@...rosoft.com>
---
Documentation/admin-guide/sysctl/net.rst | 8 ++++++++
include/net/inet_connection_sock.h | 14 +++++++++++---
net/core/sysctl_net_core.c | 9 +++++++++
net/ipv4/tcp.c | 2 ++
net/ipv4/tcp_output.c | 17 +++++++++++++++--
5 files changed, 45 insertions(+), 5 deletions(-)
diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index 4877563241f3..16f54be9461f 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -413,6 +413,14 @@ historical importance.
Default: 0
+tcp_pingpong_thresh
+-------------------
+
+TCP pingpong threshold is 1 by default, but some applications may need a
+higher threshold for optimal performance.
+
+Default: 1, min: 1, max: 3
+
2. /proc/sys/net/unix - Parameters for Unix domain sockets
----------------------------------------------------------
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index c2b15f7e5516..e84e33ddae49 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -324,11 +324,11 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
-#define TCP_PINGPONG_THRESH 1
+extern int tcp_pingpong_thresh;
static inline void inet_csk_enter_pingpong_mode(struct sock *sk)
{
- inet_csk(sk)->icsk_ack.pingpong = TCP_PINGPONG_THRESH;
+ inet_csk(sk)->icsk_ack.pingpong = tcp_pingpong_thresh;
}
static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
@@ -338,7 +338,15 @@ static inline void inet_csk_exit_pingpong_mode(struct sock *sk)
static inline bool inet_csk_in_pingpong_mode(struct sock *sk)
{
- return inet_csk(sk)->icsk_ack.pingpong >= TCP_PINGPONG_THRESH;
+ return inet_csk(sk)->icsk_ack.pingpong >= tcp_pingpong_thresh;
+}
+
+static inline void inet_csk_inc_pingpong_cnt(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+
+ if (icsk->icsk_ack.pingpong < U8_MAX)
+ icsk->icsk_ack.pingpong++;
}
static inline bool inet_csk_has_ulp(struct sock *sk)
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 782273bb93c2..b5253567f2bd 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -653,6 +653,15 @@ static struct ctl_table net_core_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
+ {
+ .procname = "tcp_pingpong_thresh",
+ .data = &tcp_pingpong_thresh,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ONE,
+ .extra2 = SYSCTL_THREE,
+ },
{ }
};
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 53b7751b68e1..dcd143193d41 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -308,6 +308,8 @@ EXPORT_SYMBOL(tcp_have_smc);
struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_SYMBOL(tcp_sockets_allocated);
+int tcp_pingpong_thresh __read_mostly = 1;
+
/*
* TCP splice context
*/
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cfe128b81a01..576d21621778 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -167,12 +167,25 @@ static void tcp_event_data_sent(struct tcp_sock *tp,
if (tcp_packets_in_flight(tp) == 0)
tcp_ca_event(sk, CA_EVENT_TX_START);
+ /* If tcp_pingpong_thresh > 1, and
+ * this is the first data packet sent in response to the
+ * previous received data,
+ * and it is sent within ato of the last received packet,
+ * increase pingpong count.
+ */
+ if (tcp_pingpong_thresh > 1 &&
+ before(tp->lsndtime, icsk->icsk_ack.lrcvtime) &&
+ (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ inet_csk_inc_pingpong_cnt(sk);
+
tp->lsndtime = now;
- /* If it is a reply for ato after last received
+ /* If tcp_pingpong_thresh == 1, and
+ * it is a reply for ato after last received
* packet, enter pingpong mode.
*/
- if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+ if (tcp_pingpong_thresh == 1 &&
+ (u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
inet_csk_enter_pingpong_mode(sk);
}
--
2.25.1
Powered by blists - more mailing lists