netdev - Re: [PATCH net-next] tcp: add net.ipv4.tcp_comp_sack_rtt

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite for Android: free password hash cracker in your pocket
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <0eecf17d-2606-4304-bc75-efe4c7ec73b9@kernel.org>
Date: Thu, 6 Nov 2025 15:56:59 +0100
From: Jesper Dangaard Brouer <hawk@...nel.org>
To: Eric Dumazet <edumazet@...gle.com>, "David S . Miller"
 <davem@...emloft.net>, Jakub Kicinski <kuba@...nel.org>,
 Paolo Abeni <pabeni@...hat.com>
Cc: Simon Horman <horms@...nel.org>, Neal Cardwell <ncardwell@...gle.com>,
 Kuniyuki Iwashima <kuniyu@...gle.com>, netdev@...r.kernel.org,
 eric.dumazet@...il.com
Subject: Re: [PATCH net-next] tcp: add net.ipv4.tcp_comp_sack_rtt_percent

On 05/11/2025 10.38, Eric Dumazet wrote:
> TCP SACK compression has been added in 2018 in commit
> 5d9f4262b7ea ("tcp: add SACK compression").
> 
> It is working great for WAN flows (with large RTT).
> Wifi in particular gets a significant boost _when_ ACK are suppressed.
> 
> Add a new sysctl so that we can tune the very conservative 5 % value
> that has been used so far in this formula, so that small RTT flows
> can benefit from this feature.
> 
> delay = min ( 5 % of RTT, 1 ms)
> 
> This patch adds new tcp_comp_sack_rtt_percent sysctl
> to ease experiments and tuning.
> 
> Given that we cap the delay to 1ms (tcp_comp_sack_delay_ns sysctl),
> set the default value to 100.
> 
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
> ---
>   Documentation/networking/ip-sysctl.rst | 13 +++++++++++--
>   include/net/netns/ipv4.h               |  1 +
>   net/ipv4/sysctl_net_ipv4.c             |  9 +++++++++
>   net/ipv4/tcp_input.c                   | 26 ++++++++++++++++++--------
>   net/ipv4/tcp_ipv4.c                    |  1 +
>   5 files changed, 40 insertions(+), 10 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index 7cd35bfd39e68c5b2650eb9d0fbb76e34aed3f2b..ebc11f593305bf87e7d4ad4d50ef085b22aef7da 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -854,9 +854,18 @@ tcp_sack - BOOLEAN
>   
>   	Default: 1 (enabled)
>   
> +tcp_comp_sack_rtt_percent - INTEGER
> +	Percentage of SRTT used for the compressed SACK feature.
> +	See tcp_comp_sack_nr, tcp_comp_sack_delay_ns, tcp_comp_sack_slack_ns.
> +
> +	Possible values : 1 - 1000

If this is a percentage, why does it allow 1000 as max?

> +	Default : 100 %
> +
>   tcp_comp_sack_delay_ns - LONG INTEGER
> -	TCP tries to reduce number of SACK sent, using a timer
> -	based on 5% of SRTT, capped by this sysctl, in nano seconds.
> +	TCP tries to reduce number of SACK sent, using a timer based
> +	on tcp_comp_sack_rtt_percent of SRTT, capped by this sysctl
> +	in nano seconds.
>   	The default is 1ms, based on TSO autosizing period.
>   
>   	Default : 1,000,000 ns (1 ms)
> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
> index 0e96c90e56c6d987a16598ef885c403d5c3eae52..de9d36acc8e22d3203120d8015b3d172e85de121 100644
> --- a/include/net/netns/ipv4.h
> +++ b/include/net/netns/ipv4.h
> @@ -221,6 +221,7 @@ struct netns_ipv4 {
>   	int sysctl_tcp_pacing_ss_ratio;
>   	int sysctl_tcp_pacing_ca_ratio;
>   	unsigned int sysctl_tcp_child_ehash_entries;
> +	int sysctl_tcp_comp_sack_rtt_percent;
>   	unsigned long sysctl_tcp_comp_sack_delay_ns;
>   	unsigned long sysctl_tcp_comp_sack_slack_ns;
>   	int sysctl_max_syn_backlog;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 0c7c8f9041cbf4aa4e51dcebd607aa5d8ac80dcd..35367f8e2da32f2c7de5a06164f5e47c8929c8f1 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -1451,6 +1451,15 @@ static struct ctl_table ipv4_net_table[] = {
>   		.mode		= 0644,
>   		.proc_handler	= proc_doulongvec_minmax,
>   	},
> +	{
> +		.procname	= "tcp_comp_sack_rtt_percent",
> +		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent,
> +		.maxlen		= sizeof(int),
> +		.mode		= 0644,
> +		.proc_handler	= proc_dointvec_minmax,
> +		.extra1		= SYSCTL_ONE,
> +		.extra2		= SYSCTL_ONE_THOUSAND,
> +	},
>   	{
>   		.procname	= "tcp_comp_sack_slack_ns",
>   		.data		= &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns,
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 6db1d4c36a88bfa64b48388ee95e4e9218d9a9fd..d4ee74da018ee97209bed3402688f5e18759866b 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -5893,7 +5893,9 @@ static inline void tcp_data_snd_check(struct sock *sk)
>   static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
>   {
>   	struct tcp_sock *tp = tcp_sk(sk);
> -	unsigned long rtt, delay;
> +	struct net *net = sock_net(sk);
> +	unsigned long rtt;
> +	u64 delay;
>   
>   	    /* More than one full frame received... */
>   	if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss &&
> @@ -5912,7 +5914,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
>   		 * Defer the ack until tcp_release_cb().
>   		 */
>   		if (sock_owned_by_user_nocheck(sk) &&
> -		    READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) {
> +		    READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) {
>   			set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags);
>   			return;
>   		}
> @@ -5927,7 +5929,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
>   	}
>   
>   	if (!tcp_is_sack(tp) ||
> -	    tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr))
> +	    tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr))
>   		goto send_now;
>   
>   	if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) {
> @@ -5942,18 +5944,26 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible)
>   	if (hrtimer_is_queued(&tp->compressed_ack_timer))
>   		return;
>   
> -	/* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */
> +	/* compress ack timer : comp_sack_rtt_percent of rtt,
> +	 * but no more than tcp_comp_sack_delay_ns.
> +	 */
>   
>   	rtt = tp->rcv_rtt_est.rtt_us;
>   	if (tp->srtt_us && tp->srtt_us < rtt)
>   		rtt = tp->srtt_us;
>   
> -	delay = min_t(unsigned long,
> -		      READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns),
> -		      rtt * (NSEC_PER_USEC >> 3)/20);
> +	/* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100
> +	 * ->
> +	 * delay = rtt * 1.25 * comp_sack_rtt_percent
> +	 */

Why explain this with shifts.  I have to use extra time to remember that
shift ">> 3" is the same as div "/" 8.  And that ">>" 2 is the same as
div "/4".  For the code, I think the compiler will convert /4 to >>2
anyway.  I don't feel strongly about this, so I'll let it be up to you
if you want to adjust this or not.


> +	delay = (u64)(rtt + (rtt >> 2)) *
> +		READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent);
> +
> +	delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns));
> +
>   	sock_hold(sk);
>   	hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay),
> -			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns),
> +			       READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns),
>   			       HRTIMER_MODE_REL_PINNED_SOFT);
>   }
>   
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index b7526a7888cbe296c0f4ba6350772741cfe1765b..a4411cd0229cb7fc5903d206e549d0889d177937 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -3596,6 +3596,7 @@ static int __net_init tcp_sk_init(struct net *net)
>   	net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
>   	net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
>   	net->ipv4.sysctl_tcp_comp_sack_nr = 44;
> +	net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 100;
>   	net->ipv4.sysctl_tcp_backlog_ack_defer = 1;
>   	net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
>   	net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;