lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Wed, 19 Nov 2014 14:10:42 +0800
From:	Yuchung Cheng <ycheng@...gle.com>
To:	Eric Dumazet <eric.dumazet@...il.com>
Cc:	Denys Fedoryshchenko <nuclearcat@...learcat.com>,
	David Miller <davem@...emloft.net>,
	netdev <netdev@...r.kernel.org>,
	Neal Cardwell <ncardwell@...gle.com>
Subject: Re: [PATCH v2 net-next] tcp: make connect() mem charging friendly

On Tue, Nov 18, 2014 at 3:06 PM, Eric Dumazet <eric.dumazet@...il.com> wrote:
> From: Eric Dumazet <edumazet@...gle.com>
>
> While working on sk_forward_alloc problems reported by Denys
> Fedoryshchenko, we found that tcp connect() (and fastopen) do not call
> sk_wmem_schedule() for SYN packet (and/or SYN/DATA packet), so
> sk_forward_alloc is negative while connect is in progress.
>
> We can fix this by calling regular sk_stream_alloc_skb() both for the
> SYN packet (in tcp_connect()) and the syn_data packet in
> tcp_send_syn_data()
>
> Then, tcp_send_syn_data() can avoid copying syn_data as we simply
> can manipulate syn_data->cb[] to remove SYN flag (and increment seq)
>
> Instead of open coding memcpy_fromiovecend(), simply use this helper.
>
> This leaves in socket write queue clean fast clone skbs.
>
> This was tested against our fastopen packetdrill tests.
>
> Reported-by: Denys Fedoryshchenko <nuclearcat@...learcat.com>
> Signed-off-by: Eric Dumazet <edumazet@...gle.com>
Acked-by: Yuchung Cheng <ycheng@...gle.com>

Thanks! This simplifies the code a lot.

> ---
> v2: added a kfree_skb(syn_data) if memcpy_fromiovecend() fails,
>     as spotted by Yuchung.
>
>  net/ipv4/tcp_output.c |   68 ++++++++++++++++------------------------
>  1 file changed, 28 insertions(+), 40 deletions(-)
>
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index eb73a1dccf56b823a45c0ca034e40dc50fc48068..f5bd4bd3f7e669b3fd48a843d55e7313a30a3409 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3011,9 +3011,9 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
>  {
>         struct tcp_sock *tp = tcp_sk(sk);
>         struct tcp_fastopen_request *fo = tp->fastopen_req;
> -       int syn_loss = 0, space, i, err = 0, iovlen = fo->data->msg_iovlen;
> -       struct sk_buff *syn_data = NULL, *data;
> +       int syn_loss = 0, space, err = 0;
>         unsigned long last_syn_loss = 0;
> +       struct sk_buff *syn_data;
>
>         tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
>         tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
> @@ -3044,48 +3044,40 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
>         /* limit to order-0 allocations */
>         space = min_t(size_t, space, SKB_MAX_HEAD(MAX_TCP_HEADER));
>
> -       syn_data = skb_copy_expand(syn, MAX_TCP_HEADER, space,
> -                                  sk->sk_allocation);
> -       if (syn_data == NULL)
> +       syn_data = sk_stream_alloc_skb(sk, space, sk->sk_allocation);
> +       if (!syn_data)
>                 goto fallback;
> +       syn_data->ip_summed = CHECKSUM_PARTIAL;
> +       memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
> +       if (unlikely(memcpy_fromiovecend(skb_put(syn_data, space),
> +                                        fo->data->msg_iov, 0, space))) {
> +               kfree_skb(syn_data);
> +               goto fallback;
> +       }
>
> -       for (i = 0; i < iovlen && syn_data->len < space; ++i) {
> -               struct iovec *iov = &fo->data->msg_iov[i];
> -               unsigned char __user *from = iov->iov_base;
> -               int len = iov->iov_len;
> +       /* No more data pending in inet_wait_for_connect() */
> +       if (space == fo->size)
> +               fo->data = NULL;
> +       fo->copied = space;
>
> -               if (syn_data->len + len > space)
> -                       len = space - syn_data->len;
> -               else if (i + 1 == iovlen)
> -                       /* No more data pending in inet_wait_for_connect() */
> -                       fo->data = NULL;
> +       tcp_connect_queue_skb(sk, syn_data);
>
> -               if (skb_add_data(syn_data, from, len))
> -                       goto fallback;
> -       }
> +       err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
>
> -       /* Queue a data-only packet after the regular SYN for retransmission */
> -       data = pskb_copy(syn_data, sk->sk_allocation);
> -       if (data == NULL)
> -               goto fallback;
> -       TCP_SKB_CB(data)->seq++;
> -       TCP_SKB_CB(data)->tcp_flags &= ~TCPHDR_SYN;
> -       TCP_SKB_CB(data)->tcp_flags = (TCPHDR_ACK|TCPHDR_PSH);
> -       tcp_connect_queue_skb(sk, data);
> -       fo->copied = data->len;
> -
> -       /* syn_data is about to be sent, we need to take current time stamps
> -        * for the packets that are in write queue : SYN packet and DATA
> -        */
> -       skb_mstamp_get(&syn->skb_mstamp);
> -       data->skb_mstamp = syn->skb_mstamp;
> +       syn->skb_mstamp = syn_data->skb_mstamp;
>
> -       if (tcp_transmit_skb(sk, syn_data, 0, sk->sk_allocation) == 0) {
> +       /* Now full SYN+DATA was cloned and sent (or not),
> +        * remove the SYN from the original skb (syn_data)
> +        * we keep in write queue in case of a retransmit, as we
> +        * also have the SYN packet (with no data) in the same queue.
> +        */
> +       TCP_SKB_CB(syn_data)->seq++;
> +       TCP_SKB_CB(syn_data)->tcp_flags = TCPHDR_ACK | TCPHDR_PSH;
> +       if (!err) {
>                 tp->syn_data = (fo->copied > 0);
>                 NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPORIGDATASENT);
>                 goto done;
>         }
> -       syn_data = NULL;
>
>  fallback:
>         /* Send a regular SYN with Fast Open cookie request option */
> @@ -3094,7 +3086,6 @@ fallback:
>         err = tcp_transmit_skb(sk, syn, 1, sk->sk_allocation);
>         if (err)
>                 tp->syn_fastopen = 0;
> -       kfree_skb(syn_data);
>  done:
>         fo->cookie.len = -1;  /* Exclude Fast Open option for SYN retries */
>         return err;
> @@ -3114,13 +3105,10 @@ int tcp_connect(struct sock *sk)
>                 return 0;
>         }
>
> -       buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
> -       if (unlikely(buff == NULL))
> +       buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation);
> +       if (unlikely(!buff))
>                 return -ENOBUFS;
>
> -       /* Reserve space for headers. */
> -       skb_reserve(buff, MAX_TCP_HEADER);
> -
>         tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
>         tp->retrans_stamp = tcp_time_stamp;
>         tcp_connect_queue_skb(sk, buff);
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ