lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CANn89iJe=CjG8Lt+XWDOtgd7MdHzCszSd-j-jn0KXtEzWeXJjw@mail.gmail.com>
Date: Mon, 9 Jun 2025 09:26:26 -0700
From: Eric Dumazet <edumazet@...gle.com>
To: Jeremy Harris <jgh@...m.org>
Cc: netdev@...r.kernel.org, linux-api@...r.kernel.org, ncardwell@...gle.com
Subject: Re: [PATCH net-next v3 2/6] tcp: copy write-data from listen socket
 to accept child socket

On Mon, Jun 9, 2025 at 9:05 AM Jeremy Harris <jgh@...m.org> wrote:
>
> Set the request_sock flag for fastopen earlier, making it available
> to the af_ops SYN-handler function.
>
> In that function copy data from the listen socket write queue into an
> sk_buff, allocating if needed and adding to the write queue of the
> newly-created child socket.
> Set sequence number values depending on the fastopen status.
>
> Signed-off-by: Jeremy Harris <jgh@...m.org>
> ---
>  net/ipv4/tcp_fastopen.c  |  3 ++-
>  net/ipv4/tcp_ipv4.c      |  4 +--
>  net/ipv4/tcp_minisocks.c | 58 ++++++++++++++++++++++++++++++++++++----
>  3 files changed, 57 insertions(+), 8 deletions(-)
>
> diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
> index 9b83d639b5ac..03a86d0b87ba 100644
> --- a/net/ipv4/tcp_fastopen.c
> +++ b/net/ipv4/tcp_fastopen.c
> @@ -245,6 +245,8 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
>         struct sock *child;
>         bool own_req;
>
> +       tcp_rsk(req)->tfo_listener = true;
> +
>         child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
>                                                          NULL, &own_req);
>         if (!child)
> @@ -261,7 +263,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
>         tp = tcp_sk(child);
>
>         rcu_assign_pointer(tp->fastopen_rsk, req);
> -       tcp_rsk(req)->tfo_listener = true;
>
>         /* RFC1323: The window in SYN & SYN/ACK segments is never
>          * scaled. So correct it appropriately.
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index 6a14f9e6fef6..e488effdbdb2 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1747,8 +1747,8 @@ EXPORT_IPV6_MOD(tcp_v4_conn_request);
>
>
>  /*
> - * The three way handshake has completed - we got a valid synack -
> - * now create the new socket.
> + * The three way handshake has completed - we got a valid synack
> + * (or a FASTOPEN syn) - now create the new socket.
>   */
>  struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
>                                   struct request_sock *req,
> diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
> index 43d7852ce07e..d471531b4a78 100644
> --- a/net/ipv4/tcp_minisocks.c
> +++ b/net/ipv4/tcp_minisocks.c
> @@ -529,7 +529,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
>         struct inet_connection_sock *newicsk;
>         const struct tcp_sock *oldtp;
>         struct tcp_sock *newtp;
> -       u32 seq;
> +       u32 seq, a_seq, n_seq;
>
>         if (!newsk)
>                 return NULL;
> @@ -550,9 +550,55 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
>         newtp->segs_in = 1;
>
>         seq = treq->snt_isn + 1;
> -       newtp->snd_sml = newtp->snd_una = seq;
> -       WRITE_ONCE(newtp->snd_nxt, seq);
> -       newtp->snd_up = seq;
> +       n_seq = seq;
> +       a_seq = seq;
> +       newtp->write_seq = seq;
> +       newtp->snd_una = seq;
> +
> +       /* If there is write-data sitting on the listen socket, copy it to
> +        * the accept socket. If FASTOPEN we will send it on the synack,
> +        * otherwise it sits there until 3rd-ack arrives.
> +        */
> +
> +       if (unlikely(!skb_queue_empty(&sk->sk_write_queue))) {
> +               struct sk_buff *l_skb = tcp_send_head(sk),
> +                               *a_skb = tcp_write_queue_tail(newsk);
> +               ssize_t copy = 0;
> +
> +               if (a_skb)
> +                       copy = l_skb->len - a_skb->len;
> +
> +               if (copy <= 0 || !tcp_skb_can_collapse_to(a_skb)) {
> +                       bool first_skb = tcp_rtx_and_write_queues_empty(newsk);
> +
> +                       a_skb = tcp_stream_alloc_skb(newsk,
> +                                                    newsk->sk_allocation,
> +                                                    first_skb);
> +                       if (!a_skb) {
> +                               /* is this the correct free? */
> +                               bh_unlock_sock(newsk);
> +                               sk_free(newsk);
> +                               return NULL;
> +                       }
> +
> +                       tcp_skb_entail(newsk, a_skb);
> +               }
> +               copy = min_t(int, l_skb->len, skb_tailroom(a_skb));
> +               skb_put_data(a_skb, l_skb->data, copy);

TCP stack no longer puts payload in skb->head. This caused many
issues, trust me.
Please do not try to bring this back.

Also I see no locking there, this is racy with another thread trying
to sendmsg() data to the listener.

Honestly, I do not like this series at all adding cost in TCP fast
path for a very narrow use-case.

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ