[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANn89iLvMkeKG_VBrJi3vA5agDxc5+VLz_L3JNT0ALsRUF32xg@mail.gmail.com>
Date: Wed, 21 Jan 2026 15:53:36 +0100
From: Eric Dumazet <edumazet@...gle.com>
To: chia-yu.chang@...ia-bell-labs.com
Cc: pabeni@...hat.com, parav@...dia.com, linux-doc@...r.kernel.org,
corbet@....net, horms@...nel.org, dsahern@...nel.org, kuniyu@...gle.com,
bpf@...r.kernel.org, netdev@...r.kernel.org, dave.taht@...il.com,
jhs@...atatu.com, kuba@...nel.org, stephen@...workplumber.org,
xiyou.wangcong@...il.com, jiri@...nulli.us, davem@...emloft.net,
andrew+netdev@...n.ch, donald.hunter@...il.com, ast@...erby.net,
liuhangbin@...il.com, shuah@...nel.org, linux-kselftest@...r.kernel.org,
ij@...nel.org, ncardwell@...gle.com, koen.de_schepper@...ia-bell-labs.com,
g.white@...lelabs.com, ingemar.s.johansson@...csson.com,
mirja.kuehlewind@...csson.com, cheshire@...le.com, rs.ietf@....at,
Jason_Livingood@...cast.com, vidhi_goel@...le.com
Subject: Re: [PATCH v10 net-next 12/15] tcp: accecn: detect loss ACK w/ AccECN
option and add TCP_ACCECN_OPTION_PERSIST
On Wed, Jan 21, 2026 at 3:32 PM <chia-yu.chang@...ia-bell-labs.com> wrote:
>
> From: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>
>
> Detect spurious retransmission of a previously sent ACK carrying the
> AccECN option after the second retransmission. Since this might be caused
> by the middlebox dropping ACK with options it does not recognize, disable
> the sending of the AccECN option in all subsequent ACKs. This patch
> follows Section 3.2.3.2.2 of AccECN spec (RFC9768).
>
> Also, a new AccECN option sending mode is added to tcp_ecn_option sysctl:
> (TCP_ECN_OPTION_PERSIST), which ignores the AccECN fallback policy and
> persistently sends AccECN option once it fits into TCP option space.
>
> Signed-off-by: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>
> Acked-by: Paolo Abeni <pabeni@...hat.com>
>
> ---
> v5:
> - Add empty line between variable declarations and code
> ---
> Documentation/networking/ip-sysctl.rst | 4 +++-
> include/linux/tcp.h | 3 ++-
> include/net/tcp_ecn.h | 2 ++
> net/ipv4/sysctl_net_ipv4.c | 2 +-
> net/ipv4/tcp_input.c | 10 ++++++++++
> net/ipv4/tcp_output.c | 7 ++++++-
> 6 files changed, 24 insertions(+), 4 deletions(-)
>
> diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
> index bc9a01606daf..28c7e4f5ecf9 100644
> --- a/Documentation/networking/ip-sysctl.rst
> +++ b/Documentation/networking/ip-sysctl.rst
> @@ -482,7 +482,9 @@ tcp_ecn_option - INTEGER
> 1 Send AccECN option sparingly according to the minimum option
> rules outlined in draft-ietf-tcpm-accurate-ecn.
> 2 Send AccECN option on every packet whenever it fits into TCP
> - option space.
> + option space except when AccECN fallback is triggered.
> + 3 Send AccECN option on every packet whenever it fits into TCP
> + option space even when AccECN fallback is triggered.
> = ============================================================
>
> Default: 2
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 683f38362977..32b031d09294 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -294,7 +294,8 @@ struct tcp_sock {
> u8 nonagle : 4,/* Disable Nagle algorithm? */
> rate_app_limited:1; /* rate_{delivered,interval_us} limited? */
> u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */
> - unused2:4;
> + accecn_opt_sent:1,/* Sent AccECN option in previous ACK */
> + unused2:3;
> u8 accecn_minlen:2,/* Minimum length of AccECN option sent */
> est_ecnfield:2,/* ECN field for AccECN delivered estimates */
> accecn_opt_demand:2,/* Demand AccECN option for n next ACKs */
> diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h
> index bf7d3f9f22c7..41b593ece1dd 100644
> --- a/include/net/tcp_ecn.h
> +++ b/include/net/tcp_ecn.h
> @@ -29,6 +29,7 @@ enum tcp_accecn_option {
> TCP_ACCECN_OPTION_DISABLED = 0,
> TCP_ACCECN_OPTION_MINIMUM = 1,
> TCP_ACCECN_OPTION_FULL = 2,
> + TCP_ACCECN_OPTION_PERSIST = 3,
> };
>
> /* Apply either ECT(0) or ECT(1) based on TCP_CONG_ECT_1_NEGOTIATION flag */
> @@ -413,6 +414,7 @@ static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
> tp->received_ce_pending = 0;
> __tcp_accecn_init_bytes_counters(tp->received_ecn_bytes);
> __tcp_accecn_init_bytes_counters(tp->delivered_ecn_bytes);
> + tp->accecn_opt_sent = 0;
> tp->accecn_minlen = 0;
> tp->accecn_opt_demand = 0;
> tp->est_ecnfield = 0;
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index a1a50a5c80dc..385b5b986d23 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -749,7 +749,7 @@ static struct ctl_table ipv4_net_table[] = {
> .mode = 0644,
> .proc_handler = proc_dou8vec_minmax,
> .extra1 = SYSCTL_ZERO,
> - .extra2 = SYSCTL_TWO,
> + .extra2 = SYSCTL_THREE,
> },
> {
> .procname = "tcp_ecn_option_beacon",
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 8b774019a3a6..472bd57913ae 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -4863,6 +4863,8 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
>
> static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
> {
> + struct tcp_sock *tp = tcp_sk(sk);
> +
> /* When the ACK path fails or drops most ACKs, the sender would
> * timeout and spuriously retransmit the same segment repeatedly.
> * If it seems our ACKs are not reaching the other side,
> @@ -4882,6 +4884,14 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
> /* Save last flowlabel after a spurious retrans. */
> tcp_save_lrcv_flowlabel(sk, skb);
> #endif
> + /* Check DSACK info to detect that the previous ACK carrying the
> + * AccECN option was lost after the second retransmision, and then
> + * stop sending AccECN option in all subsequent ACKs.
> + */
> + if (tcp_ecn_mode_accecn(tp) &&
> + TCP_SKB_CB(skb)->seq == tp->duplicate_sack[0].start_seq &&
> + tp->accecn_opt_sent)
> + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_SEND);
> }
tcp_rcv_spurious_retrans() has two callers.
tcp_send_dupack() checked dsack is enabled.
tcp_data_queue() : No such check.
So I wonder if tp->duplicate_sack[0].start_seq could contain garbage ?
Perhaps test tp->rx_opt.dsack ?
>
> static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 042e7e9b13cc..0cbba38ea87a 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -713,9 +713,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
> if (tp) {
> tp->accecn_minlen = 0;
> tp->accecn_opt_tstamp = tp->tcp_mstamp;
> + tp->accecn_opt_sent = 1;
> if (tp->accecn_opt_demand)
> tp->accecn_opt_demand--;
> }
> + } else if (tp) {
> + tp->accecn_opt_sent = 0;
> }
>
> if (unlikely(OPTION_SACK_ADVERTISE & options)) {
> @@ -1187,7 +1190,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
> if (tcp_ecn_mode_accecn(tp)) {
> int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
>
> - if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
> + if (ecn_opt && tp->saw_accecn_opt &&
> + (ecn_opt >= TCP_ACCECN_OPTION_PERSIST ||
> + !tcp_accecn_opt_fail_send(tp)) &&
> (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
> tcp_accecn_option_beacon_check(sk))) {
> opts->use_synack_ecn_bytes = 0;
> --
> 2.34.1
>
Powered by blists - more mailing lists