[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CANn89iL+o-SeOXOajTqcNnyajK2PRAMTMEk1b_A1JC-dFTMrNA@mail.gmail.com>
Date: Fri, 27 Jan 2023 19:44:43 +0100
From: Eric Dumazet <edumazet@...gle.com>
To: Xin Long <lucien.xin@...il.com>
Cc: network dev <netdev@...r.kernel.org>, davem@...emloft.net,
kuba@...nel.org, Paolo Abeni <pabeni@...hat.com>,
David Ahern <dsahern@...il.com>,
Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
Pravin B Shelar <pshelar@....org>,
Jamal Hadi Salim <jhs@...atatu.com>,
Cong Wang <xiyou.wangcong@...il.com>,
Jiri Pirko <jiri@...nulli.us>,
Pablo Neira Ayuso <pablo@...filter.org>,
Florian Westphal <fw@...len.de>,
Marcelo Ricardo Leitner <marcelo.leitner@...il.com>,
Ilya Maximets <i.maximets@....org>,
Aaron Conole <aconole@...hat.com>,
Roopa Prabhu <roopa@...dia.com>,
Nikolay Aleksandrov <razor@...ckwall.org>,
Mahesh Bandewar <maheshb@...gle.com>,
Paul Moore <paul@...l-moore.com>,
Guillaume Nault <gnault@...hat.com>
Subject: Re: [PATCHv3 net-next 10/10] net: add support for ipv4 big tcp
On Fri, Jan 27, 2023 at 7:37 PM Xin Long <lucien.xin@...il.com> wrote:
>
> On Fri, Jan 27, 2023 at 12:41 PM Eric Dumazet <edumazet@...gle.com> wrote:
> >
> > On Fri, Jan 27, 2023 at 5:00 PM Xin Long <lucien.xin@...il.com> wrote:
> > >
> > > Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP.
> > >
> > > Firstly, allow sk->sk_gso_max_size to be set to a value greater than
> > > GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size()
> > > for IPv4 TCP sockets.
> > >
> > > Then on the TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU
> > > in __ip_local_out() to allow sending BIG TCP packets; this implies
> > > that skb->len is the length of an IPv4 packet. On the RX path, use skb->len
> > > as the length of the IPv4 packet when the IP header tot_len is 0 and
> > > skb->len > IP_MAX_MTU in ip_rcv_core(). As the APIs iph_set_totlen() and
> > > skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only
> > > need to update these APIs.
> > >
> > > Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allow
> > > the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In
> > > GRO complete, set IP header tot_len to 0 when the merged packet size is
> > > greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed
> > > on the RX path.
> > >
> > > Note that checking skb_is_gso_tcp() in the API iph_totlen() makes it
> > > safe for this implementation to use iph->tot_len == 0 to indicate IPv4
> > > BIG TCP packets.
> > >
> > > Signed-off-by: Xin Long <lucien.xin@...il.com>
> > > ---
> > > net/core/gro.c | 12 +++++++-----
> > > net/core/sock.c | 8 ++++++--
> > > net/ipv4/af_inet.c | 7 ++++---
> > > net/ipv4/ip_input.c | 2 +-
> > > net/ipv4/ip_output.c | 2 +-
> > > 5 files changed, 19 insertions(+), 12 deletions(-)
> > >
> > > diff --git a/net/core/gro.c b/net/core/gro.c
> > > index 506f83d715f8..b15f85546bdd 100644
> > > --- a/net/core/gro.c
> > > +++ b/net/core/gro.c
> > > @@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
> > > struct sk_buff *lp;
> > > int segs;
> > >
> > > - /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
> > > - gro_max_size = READ_ONCE(p->dev->gro_max_size);
> > > + /* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
> > > + gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
> > > + READ_ONCE(p->dev->gro_max_size) :
> > > + READ_ONCE(p->dev->gro_ipv4_max_size);
> > >
> > > if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
> > > return -E2BIG;
> > >
> > > if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
> > > - if (p->protocol != htons(ETH_P_IPV6) ||
> > > - skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
> > > - ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
> > > + if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
> > > + (p->protocol == htons(ETH_P_IPV6) &&
> > > + skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
> > > p->encapsulation)
> > > return -E2BIG;
> > > }
> > > diff --git a/net/core/sock.c b/net/core/sock.c
> > > index 7ba4891460ad..c98f9a4eeff9 100644
> > > --- a/net/core/sock.c
> > > +++ b/net/core/sock.c
> > > @@ -2383,6 +2383,8 @@ static void sk_trim_gso_size(struct sock *sk)
> > > !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
> > > return;
> > > #endif
> > > + if (sk->sk_family == AF_INET && sk_is_tcp(sk))
> > > + return;
> >
> > Or simply
> >
> > diff --git a/net/core/sock.c b/net/core/sock.c
> > index 7ba4891460adbd6c13c0ce1dcdd7f23c8c1f0f5d..dcb8fff91fd9a9472267a2cf2fdc98114a7d2b7d 100644
> > --- a/net/core/sock.c
> > +++ b/net/core/sock.c
> > @@ -2375,14 +2375,9 @@ EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
> >
> > static void sk_trim_gso_size(struct sock *sk)
> > {
> > - if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
> > + if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE ||
> > + sk_is_tcp(sk))
> > return;
> > -#if IS_ENABLED(CONFIG_IPV6)
> > - if (sk->sk_family == AF_INET6 &&
> > - sk_is_tcp(sk) &&
> > - !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
> > - return;
> > -#endif
> > sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
> > }
> There's a difference: an AF_INET6 TCP socket may send IPv4 packets when
> its address is v4-mapped (ipv6_addr_v4mapped). If we don't check
> ipv6_addr_v4mapped(), IPv4 GSO packets might go out with the
> "gso_max_size" meant for IPv6.
>
But the change you wrote in sk_setup_caps() only checked sk_family.
> I think we could use the change you wrote above, but we also need to
> use dst->ops->family instead of sk->sk_family in sk_setup_caps():
>
> + sk->sk_gso_max_size = dst->ops->family == AF_INET6 ?
> + READ_ONCE(dst->dev->gso_max_size) :
> + READ_ONCE(dst->dev->gso_ipv4_max_size);
>
> >
> >
> >
> > > sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
> > > }
> > >
> > > @@ -2403,8 +2405,10 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
> > > sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
> > > } else {
> > > sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
> > > - /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
> > > - sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
> > > + /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
> > > + sk->sk_gso_max_size = sk->sk_family == AF_INET6 ?
> > > + READ_ONCE(dst->dev->gso_max_size) :
> > > + READ_ONCE(dst->dev->gso_ipv4_max_size);
Here...
So if you need ipv6_addr_v4mapped() this should be done here anyway.
> > > sk_trim_gso_size(sk);
> > > sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
> > > /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
> > > diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> > > index 6c0ec2789943..2f992a323b95 100644
> > > --- a/net/ipv4/af_inet.c
> > > +++ b/net/ipv4/af_inet.c
> > > @@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
> > > if (unlikely(ip_fast_csum((u8 *)iph, 5)))
> > > goto out;
> > >
> > > + NAPI_GRO_CB(skb)->proto = proto;
> > > id = ntohl(*(__be32 *)&iph->id);
> > > flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
> > > id >>= 16;
> > > @@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
> > >
> > > int inet_gro_complete(struct sk_buff *skb, int nhoff)
> > > {
> > > - __be16 newlen = htons(skb->len - nhoff);
> > > struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
> > > const struct net_offload *ops;
> > > + __be16 totlen = iph->tot_len;
> > > int proto = iph->protocol;
> > > int err = -ENOSYS;
> > >
> > > @@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
> > > skb_set_inner_network_header(skb, nhoff);
> > > }
> > >
> > > - csum_replace2(&iph->check, iph->tot_len, newlen);
> > > - iph->tot_len = newlen;
> > > + iph_set_totlen(iph, skb->len - nhoff);
> > > + csum_replace2(&iph->check, totlen, iph->tot_len);
> > >
> > > ops = rcu_dereference(inet_offloads[proto]);
> > > if (WARN_ON(!ops || !ops->callbacks.gro_complete))
> > > diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
> > > index e880ce77322a..0aa8c49b4e1b 100644
> > > --- a/net/ipv4/ip_input.c
> > > +++ b/net/ipv4/ip_input.c
> > > @@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
> > > if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
> > > goto csum_error;
> > >
> > > - len = ntohs(iph->tot_len);
> > > + len = skb_ip_totlen(skb);
> >
> > len = iph_totlen(skb, iph);
> OK, thanks.
Powered by blists - more mailing lists