[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CANn89iKX+DxvcM6t8KjanRofdz8ksMkcHj_V0w_LAoredq2gZw@mail.gmail.com>
Date: Wed, 25 Jun 2025 00:57:45 -0700
From: Eric Dumazet <edumazet@...gle.com>
To: chia-yu.chang@...ia-bell-labs.com
Cc: pabeni@...hat.com, linux-doc@...r.kernel.org, corbet@....net,
horms@...nel.org, dsahern@...nel.org, kuniyu@...zon.com, bpf@...r.kernel.org,
netdev@...r.kernel.org, jhs@...atatu.com, kuba@...nel.org,
stephen@...workplumber.org, xiyou.wangcong@...il.com, jiri@...nulli.us,
davem@...emloft.net, andrew+netdev@...n.ch, donald.hunter@...il.com,
ast@...erby.net, liuhangbin@...il.com, shuah@...nel.org,
linux-kselftest@...r.kernel.org, ij@...nel.org, ncardwell@...gle.com,
koen.de_schepper@...ia-bell-labs.com, g.white@...lelabs.com,
ingemar.s.johansson@...csson.com, mirja.kuehlewind@...csson.com,
cheshire@...le.com, rs.ietf@....at, Jason_Livingood@...cast.com,
vidhi_goel@...le.com, Olivier Tilmans <olivier.tilmans@...ia.com>
Subject: Re: [PATCH v9 net-next 05/15] tcp: accecn: AccECN negotiation
On Sat, Jun 21, 2025 at 12:37 PM <chia-yu.chang@...ia-bell-labs.com> wrote:
>
> From: Ilpo Järvinen <ij@...nel.org>
>
> Accurate ECN negotiation parts based on the specification:
> https://tools.ietf.org/id/draft-ietf-tcpm-accurate-ecn-28.txt
>
> Accurate ECN is negotiated using ECE, CWR and AE flags in the
> TCP header. TCP falls back into using RFC3168 ECN if one of the
> ends supports only RFC3168-style ECN.
>
> The AccECN negotiation includes reflecting IP ECN field value
> seen in SYN and SYNACK back using the same bits as negotiation
> to allow responding to SYN CE marks and to detect ECN field
> mangling. CE marks should not occur currently because SYN=1
> segments are sent with Non-ECT in IP ECN field (but proposal
> exists to remove this restriction).
>
> Reflecting SYN IP ECN field in SYNACK is relatively simple.
> Reflecting SYNACK IP ECN field in the final/third ACK of
> the handshake is more challenging. Linux TCP code is not well
> prepared for using the final/third ACK a signalling channel
> which makes things somewhat complicated here.
>
> tcp_ecn sysctl can be used to select the highest ECN variant
> (Accurate ECN, ECN, No ECN) that is attemped to be negotiated and
> requested for incoming connection and outgoing connection:
> TCP_ECN_IN_NOECN_OUT_NOECN, TCP_ECN_IN_ECN_OUT_ECN,
> TCP_ECN_IN_ECN_OUT_NOECN, TCP_ECN_IN_ACCECN_OUT_ACCECN,
> TCP_ECN_IN_ACCECN_OUT_ECN, and TCP_ECN_IN_ACCECN_OUT_NOECN.
>
> After this patch, the size of tcp_request_sock remains unchanged
> and no new holes are added. Below are the pahole outcomes before
> and after this patch:
>
>
> }
>
> Signed-off-by: Ilpo Järvinen <ij@...nel.org>
> Co-developed-by: Olivier Tilmans <olivier.tilmans@...ia.com>
> Signed-off-by: Olivier Tilmans <olivier.tilmans@...ia.com>
> Co-developed-by: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>
> Signed-off-by: Chia-Yu Chang <chia-yu.chang@...ia-bell-labs.com>
> ---
> .../networking/net_cachelines/tcp_sock.rst | 4 +
> include/linux/tcp.h | 9 +-
> include/net/tcp.h | 93 ++++++++-
> net/ipv4/syncookies.c | 3 +
> net/ipv4/sysctl_net_ipv4.c | 3 +-
> net/ipv4/tcp.c | 2 +
> net/ipv4/tcp_input.c | 178 ++++++++++++++++--
> net/ipv4/tcp_ipv4.c | 5 +-
> net/ipv4/tcp_minisocks.c | 51 ++++-
> net/ipv4/tcp_output.c | 78 ++++++--
> net/ipv6/syncookies.c | 1 +
> net/ipv6/tcp_ipv6.c | 1 +
> 12 files changed, 384 insertions(+), 44 deletions(-)
>
> diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
> index 31313a9adccc..22ac668fe6c7 100644
> --- a/Documentation/networking/net_cachelines/tcp_sock.rst
> +++ b/Documentation/networking/net_cachelines/tcp_sock.rst
> @@ -103,6 +103,10 @@ u32 delivered read_mostly read_w
> u32 delivered_ce read_mostly read_write tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
> u32 received_ce read_mostly read_write
> u8:4 received_ce_pending read_mostly read_write
> +u8:2 syn_ect_snt write_mostly read_write
> +u8:2 syn_ect_rcv read_mostly read_write
> +u8:1 wait_third_ack read_write
> +u8:4 accecn_fail_mode
> u32 lost read_mostly tcp_ack
> u32 app_limited read_write read_mostly tcp_rate_check_app_limited,tcp_rate_skb_sent(tx);tcp_rate_gen(rx)
> u64 first_tx_mstamp read_write tcp_rate_skb_sent
> diff --git a/include/linux/tcp.h b/include/linux/tcp.h
> index 874e0e45dfad..1d8301f2883c 100644
> --- a/include/linux/tcp.h
> +++ b/include/linux/tcp.h
> @@ -168,6 +168,10 @@ struct tcp_request_sock {
> * after data-in-SYN.
> */
> u8 syn_tos;
> + bool accecn_ok;
> + u8 syn_ect_snt: 2,
> + syn_ect_rcv: 2,
> + accecn_fail_mode:4;
> #ifdef CONFIG_TCP_AO
> u8 ao_keyid;
> u8 ao_rcv_next;
> @@ -375,7 +379,9 @@ struct tcp_sock {
> u8 compressed_ack;
> u8 dup_ack_counter:2,
> tlp_retrans:1, /* TLP is a retransmission */
> - unused:5;
> + syn_ect_snt:2, /* AccECN ECT memory, only */
> + syn_ect_rcv:2, /* ... needed durign 3WHS + first seqno */
> + wait_third_ack:1; /* Wait 3rd ACK in simultaneous open */
> u8 thin_lto : 1,/* Use linear timeouts for thin streams */
> fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
> fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */
> @@ -391,6 +397,7 @@ struct tcp_sock {
> syn_fastopen_child:1; /* created TFO passive child socket */
>
> u8 keepalive_probes; /* num of allowed keep alive probes */
> + u8 accecn_fail_mode:4; /* AccECN failure handling */
> u32 tcp_tx_delay; /* delay (in usec) added to TX packets */
>
> /* RTT measurement */
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index 6cf5cea992e3..4d6325fa3f67 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -27,6 +27,7 @@
> #include <linux/ktime.h>
> #include <linux/indirect_call_wrapper.h>
> #include <linux/bits.h>
> +#include <linux/bitfield.h>
>
> #include <net/inet_connection_sock.h>
> #include <net/inet_timewait_sock.h>
> @@ -234,6 +235,37 @@ static_assert((1 << ATO_BITS) > TCP_DELACK_MAX);
> #define TCPOLEN_MSS_ALIGNED 4
> #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
>
> +/* tp->accecn_fail_mode */
> +#define TCP_ACCECN_ACE_FAIL_SEND BIT(0)
> +#define TCP_ACCECN_ACE_FAIL_RECV BIT(1)
> +#define TCP_ACCECN_OPT_FAIL_SEND BIT(2)
> +#define TCP_ACCECN_OPT_FAIL_RECV BIT(3)
> +
> +static inline bool tcp_accecn_ace_fail_send(const struct tcp_sock *tp)
> +{
> + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND;
> +}
> +
> +static inline bool tcp_accecn_ace_fail_recv(const struct tcp_sock *tp)
> +{
> + return tp->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV;
> +}
> +
> +static inline bool tcp_accecn_opt_fail_send(const struct tcp_sock *tp)
> +{
> + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_SEND;
> +}
> +
> +static inline bool tcp_accecn_opt_fail_recv(const struct tcp_sock *tp)
> +{
> + return tp->accecn_fail_mode & TCP_ACCECN_OPT_FAIL_RECV;
> +}
> +
> +static inline void tcp_accecn_fail_mode_set(struct tcp_sock *tp, u8 mode)
> +{
> + tp->accecn_fail_mode |= mode;
> +}
> +
> /* Flags in tp->nonagle */
> #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
> #define TCP_NAGLE_CORK 2 /* Socket is corked */
> @@ -420,6 +452,23 @@ static inline u8 tcp_accecn_ace(const struct tcphdr *th)
> return (th->ae << 2) | (th->cwr << 1) | th->ece;
> }
>
> +/* Infer the ECT value our SYN arrived with from the echoed ACE field */
> +static inline int tcp_accecn_extract_syn_ect(u8 ace)
> +{
> + if (ace & 0x1)
> + return INET_ECN_ECT_1;
> + if (!(ace & 0x2))
> + return INET_ECN_ECT_0;
> + if (ace & 0x4)
> + return INET_ECN_CE;
> + return INET_ECN_NOT_ECT;
> +}
> +
> +bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect);
> +void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb,
> + u8 syn_ect_snt);
> +void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb);
> +
> enum tcp_tw_status {
> TCP_TW_SUCCESS = 0,
> TCP_TW_RST = 1,
> @@ -657,6 +706,15 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *
> dst_feature(dst, RTAX_FEATURE_ECN);
> }
>
> +/* AccECN specification, 5.1: [...] a server can determine that it
> + * negotiated AccECN as [...] if the ACK contains an ACE field with
> + * the value 0b010 to 0b111 (decimal 2 to 7).
> + */
> +static inline bool cookie_accecn_ok(const struct tcphdr *th)
> +{
> + return tcp_accecn_ace(th) > 0x1;
> +}
> +
> #if IS_ENABLED(CONFIG_BPF)
> static inline bool cookie_bpf_ok(struct sk_buff *skb)
> {
> @@ -968,6 +1026,7 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
>
> #define TCPHDR_ACE (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)
> #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
> +#define TCPHDR_SYNACK_ACCECN (TCPHDR_SYN | TCPHDR_ACK | TCPHDR_CWR)
>
> #define TCP_ACCECN_CEP_ACE_MASK 0x7
> #define TCP_ACCECN_ACE_MAX_DELTA 6
> @@ -977,6 +1036,19 @@ static inline u32 tcp_rsk_tsval(const struct tcp_request_sock *treq)
> */
> #define TCP_ACCECN_CEP_INIT_OFFSET 5
>
> +/* The highest ECN variant (Accurate ECN, ECN, or no ECN) that is
> + * attemped to be negotiated and requested for incoming connection
> + * and outgoing connection, respectively.
> + */
> +enum tcp_ecn_mode {
> + TCP_ECN_IN_NOECN_OUT_NOECN = 0,
> + TCP_ECN_IN_ECN_OUT_ECN = 1,
> + TCP_ECN_IN_ECN_OUT_NOECN = 2,
> + TCP_ECN_IN_ACCECN_OUT_ACCECN = 3,
> + TCP_ECN_IN_ACCECN_OUT_ECN = 4,
> + TCP_ECN_IN_ACCECN_OUT_NOECN = 5,
> +};
> +
> static inline void tcp_accecn_init_counters(struct tcp_sock *tp)
> {
> tp->received_ce = 0;
> @@ -1051,6 +1123,15 @@ struct tcp_skb_cb {
>
> #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
>
> +static inline u16 tcp_accecn_reflector_flags(u8 ect)
> +{
> + u32 flags = ect + 2;
> +
> + if (ect == 3)
> + flags++;
A comment might help, I have no idea of what is going on here.
> + return FIELD_PREP(TCPHDR_ACE, flags);
> +}
> +
> extern const struct inet_connection_sock_af_ops ipv4_specific;
>
> #if IS_ENABLED(CONFIG_IPV6)
> @@ -1173,7 +1254,10 @@ enum tcp_ca_ack_event_flags {
> #define TCP_CONG_NON_RESTRICTED BIT(0)
> /* Requires ECN/ECT set on all packets */
> #define TCP_CONG_NEEDS_ECN BIT(1)
> -#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN)
> +/* Require successfully negotiated AccECN capability */
> +#define TCP_CONG_NEEDS_ACCECN BIT(2)
> +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN | \
> + TCP_CONG_NEEDS_ACCECN)
>
> union tcp_cc_info;
>
> @@ -1305,6 +1389,13 @@ static inline bool tcp_ca_needs_ecn(const struct sock *sk)
> return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN;
> }
>
> +static inline bool tcp_ca_needs_accecn(const struct sock *sk)
> +{
> + const struct inet_connection_sock *icsk = inet_csk(sk);
> +
> + return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ACCECN;
> +}
> +
> static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event)
> {
> const struct inet_connection_sock *icsk = inet_csk(sk);
> diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
> index 5459a78b9809..3a44eb9c1d1a 100644
> --- a/net/ipv4/syncookies.c
> +++ b/net/ipv4/syncookies.c
> @@ -403,6 +403,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
> struct tcp_sock *tp = tcp_sk(sk);
> struct inet_request_sock *ireq;
> struct net *net = sock_net(sk);
> + struct tcp_request_sock *treq;
> struct request_sock *req;
> struct sock *ret = sk;
> struct flowi4 fl4;
> @@ -428,6 +429,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
> }
>
> ireq = inet_rsk(req);
> + treq = tcp_rsk(req);
>
> sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
> sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
> @@ -482,6 +484,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
> if (!req->syncookie)
> ireq->rcv_wscale = rcv_wscale;
> ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
> + treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
>
> ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
> /* ip_queue_xmit() depends on our flow being setup
> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
> index 3a43010d726f..75ec1a599b52 100644
> --- a/net/ipv4/sysctl_net_ipv4.c
> +++ b/net/ipv4/sysctl_net_ipv4.c
> @@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
> static int tcp_plb_max_rounds = 31;
> static int tcp_plb_max_cong_thresh = 256;
> static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
> +static int tcp_ecn_mode_max = 5;
>
> /* obsolete */
> static int sysctl_tcp_low_latency __read_mostly;
> @@ -728,7 +729,7 @@ static struct ctl_table ipv4_net_table[] = {
> .mode = 0644,
> .proc_handler = proc_dou8vec_minmax,
> .extra1 = SYSCTL_ZERO,
> - .extra2 = SYSCTL_TWO,
> + .extra2 = &tcp_ecn_mode_max,
Please change Documentation/networking/ip-sysctl.rst tcp_ecn accordingly ?
> },
> {
> .procname = "tcp_ecn_fallback",
> diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
> index 8e0e8d784b1c..e6d7b5420c88 100644
> --- a/net/ipv4/tcp.c
> +++ b/net/ipv4/tcp.c
> @@ -3392,6 +3392,8 @@ int tcp_disconnect(struct sock *sk, int flags)
> tp->window_clamp = 0;
> tp->delivered = 0;
> tp->delivered_ce = 0;
> + tp->wait_third_ack = 0;
> + tp->accecn_fail_mode = 0;
> tcp_accecn_init_counters(tp);
> if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
> icsk->icsk_ca_ops->release(sk);
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 0fa3803b353d..c986452302cb 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -411,14 +411,114 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
> }
> }
I do think this patch is too big and should be split.
Powered by blists - more mailing lists