netdev - [PATCHv4 net-next 10/10] net: add support for ipv4 big tcp

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <637aa55b8dbf0c85c2ee8892df26a8bbbf9f2ef5.1674921359.git.lucien.xin@gmail.com>
Date:   Sat, 28 Jan 2023 10:58:39 -0500
From:   Xin Long <lucien.xin@...il.com>
To:     network dev <netdev@...r.kernel.org>
Cc:     davem@...emloft.net, kuba@...nel.org,
        Eric Dumazet <edumazet@...gle.com>,
        Paolo Abeni <pabeni@...hat.com>,
        David Ahern <dsahern@...il.com>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        Pravin B Shelar <pshelar@....org>,
        Jamal Hadi Salim <jhs@...atatu.com>,
        Cong Wang <xiyou.wangcong@...il.com>,
        Jiri Pirko <jiri@...nulli.us>,
        Pablo Neira Ayuso <pablo@...filter.org>,
        Florian Westphal <fw@...len.de>,
        Marcelo Ricardo Leitner <marcelo.leitner@...il.com>,
        Ilya Maximets <i.maximets@....org>,
        Aaron Conole <aconole@...hat.com>,
        Roopa Prabhu <roopa@...dia.com>,
        Nikolay Aleksandrov <razor@...ckwall.org>,
        Mahesh Bandewar <maheshb@...gle.com>,
        Paul Moore <paul@...l-moore.com>,
        Guillaume Nault <gnault@...hat.com>
Subject: [PATCHv4 net-next 10/10] net: add support for ipv4 big tcp

Similar to Eric's IPv6 BIG TCP, this patch is to enable IPv4 BIG TCP.

Firstly, allow sk->sk_gso_max_size to be set to a value greater than
GSO_LEGACY_MAX_SIZE by not trimming gso_max_size in sk_trim_gso_size()
for IPv4 TCP sockets.

Then on TX path, set IP header tot_len to 0 when skb->len > IP_MAX_MTU
in __ip_local_out() to allow to send BIG TCP packets, and this implies
that skb->len is the length of a IPv4 packet; On RX path, use skb->len
as the length of the IPv4 packet when the IP header tot_len is 0 and
skb->len > IP_MAX_MTU in ip_rcv_core(). As the API iph_set_totlen() and
skb_ip_totlen() are used in __ip_local_out() and ip_rcv_core(), we only
need to update these APIs.

Also in GRO receive, add the check for ETH_P_IP/IPPROTO_TCP, and allows
the merged packet size >= GRO_LEGACY_MAX_SIZE in skb_gro_receive(). In
GRO complete, set IP header tot_len to 0 when the merged packet size
greater than IP_MAX_MTU in iph_set_totlen() so that it can be processed
on RX path.

Note that by checking skb_is_gso_tcp() in API iph_totlen(), it makes
this implementation safe to use iph->len == 0 indicates IPv4 BIG TCP
packets.

Signed-off-by: Xin Long <lucien.xin@...il.com>
---
 net/core/gro.c       | 12 +++++++-----
 net/core/sock.c      | 26 ++++++++++++++------------
 net/ipv4/af_inet.c   |  7 ++++---
 net/ipv4/ip_input.c  |  2 +-
 net/ipv4/ip_output.c |  2 +-
 5 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/net/core/gro.c b/net/core/gro.c
index 506f83d715f8..b15f85546bdd 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -162,16 +162,18 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
 	struct sk_buff *lp;
 	int segs;
 
-	/* pairs with WRITE_ONCE() in netif_set_gro_max_size() */
-	gro_max_size = READ_ONCE(p->dev->gro_max_size);
+	/* pairs with WRITE_ONCE() in netif_set_gro(_ipv4)_max_size() */
+	gro_max_size = p->protocol == htons(ETH_P_IPV6) ?
+			READ_ONCE(p->dev->gro_max_size) :
+				READ_ONCE(p->dev->gro_ipv4_max_size);
 
 	if (unlikely(p->len + len >= gro_max_size || NAPI_GRO_CB(skb)->flush))
 		return -E2BIG;
 
 	if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
-		if (p->protocol != htons(ETH_P_IPV6) ||
-		    skb_headroom(p) < sizeof(struct hop_jumbo_hdr) ||
-		    ipv6_hdr(p)->nexthdr != IPPROTO_TCP ||
+		if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
+		    (p->protocol == htons(ETH_P_IPV6) &&
+		     skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
 		    p->encapsulation)
 			return -E2BIG;
 	}
diff --git a/net/core/sock.c b/net/core/sock.c
index 7ba4891460ad..f08b76acde9b 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2373,17 +2373,22 @@ void sk_free_unlock_clone(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
 
-static void sk_trim_gso_size(struct sock *sk)
+static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
 {
-	if (sk->sk_gso_max_size <= GSO_LEGACY_MAX_SIZE)
-		return;
+	bool is_ipv6 = false;
+	u32 max_size;
+
 #if IS_ENABLED(CONFIG_IPV6)
-	if (sk->sk_family == AF_INET6 &&
-	    sk_is_tcp(sk) &&
-	    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
-		return;
+	is_ipv6 = (sk->sk_family == AF_INET6 &&
+		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
 #endif
-	sk->sk_gso_max_size = GSO_LEGACY_MAX_SIZE;
+	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
+	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
+			READ_ONCE(dst->dev->gso_ipv4_max_size);
+	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
+		max_size = GSO_LEGACY_MAX_SIZE;
+
+	return max_size - (MAX_TCP_HEADER + 1);
 }
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
@@ -2403,10 +2408,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 		} else {
 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
-			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
-			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
-			sk_trim_gso_size(sk);
-			sk->sk_gso_max_size -= (MAX_TCP_HEADER + 1);
+			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
 		}
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 6c0ec2789943..2f992a323b95 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1485,6 +1485,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
 	if (unlikely(ip_fast_csum((u8 *)iph, 5)))
 		goto out;
 
+	NAPI_GRO_CB(skb)->proto = proto;
 	id = ntohl(*(__be32 *)&iph->id);
 	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF));
 	id >>= 16;
@@ -1618,9 +1619,9 @@ int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 
 int inet_gro_complete(struct sk_buff *skb, int nhoff)
 {
-	__be16 newlen = htons(skb->len - nhoff);
 	struct iphdr *iph = (struct iphdr *)(skb->data + nhoff);
 	const struct net_offload *ops;
+	__be16 totlen = iph->tot_len;
 	int proto = iph->protocol;
 	int err = -ENOSYS;
 
@@ -1629,8 +1630,8 @@ int inet_gro_complete(struct sk_buff *skb, int nhoff)
 		skb_set_inner_network_header(skb, nhoff);
 	}
 
-	csum_replace2(&iph->check, iph->tot_len, newlen);
-	iph->tot_len = newlen;
+	iph_set_totlen(iph, skb->len - nhoff);
+	csum_replace2(&iph->check, totlen, iph->tot_len);
 
 	ops = rcu_dereference(inet_offloads[proto]);
 	if (WARN_ON(!ops || !ops->callbacks.gro_complete))
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e880ce77322a..fe9ead9ee863 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -511,7 +511,7 @@ static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
 	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
 		goto csum_error;
 
-	len = ntohs(iph->tot_len);
+	len = iph_totlen(skb, iph);
 	if (skb->len < len) {
 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
 		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 922c87ef1ab5..4e4e308c3230 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -100,7 +100,7 @@ int __ip_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct iphdr *iph = ip_hdr(skb);
 
-	iph->tot_len = htons(skb->len);
+	iph_set_totlen(iph, skb->len);
 	ip_send_check(iph);
 
 	/* if egress device is enslaved to an L3 master device pass the
-- 
2.31.1