[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <b6978999-4f05-5e87-2964-bec444221cf5@gmail.com>
Date: Mon, 3 Aug 2020 17:44:16 -0600
From: David Ahern <dsahern@...il.com>
To: Stefano Brivio <sbrivio@...hat.com>,
"David S. Miller" <davem@...emloft.net>
Cc: Florian Westphal <fw@...len.de>, Aaron Conole <aconole@...hat.com>,
Numan Siddique <nusiddiq@...hat.com>,
Jakub Kicinski <kuba@...nel.org>,
Pravin B Shelar <pshelar@....org>,
Roopa Prabhu <roopa@...ulusnetworks.com>,
Nikolay Aleksandrov <nikolay@...ulusnetworks.com>,
Lourdes Pedrajas <lu@...o.net>, netdev@...r.kernel.org
Subject: Re: [PATCH net-next 2/6] tunnels: PMTU discovery support for directly
bridged IP packets
On 8/3/20 2:52 PM, Stefano Brivio wrote:
> @@ -461,6 +464,91 @@ static inline void iptunnel_xmit_stats(struct net_device *dev, int pkt_len)
> }
> }
>
> +/**
> + * skb_tunnel_check_pmtu() - Check, update PMTU and trigger ICMP reply as needed
> + * @skb: Buffer being sent by encapsulation, L2 headers expected
> + * @encap_dst: Destination for tunnel encapsulation (outer IP)
> + * @headroom: Encapsulation header size, bytes
> + * @reply: Build matching ICMP or ICMPv6 message as a result
> + *
> + * L2 tunnel implementations that can carry IP and can be directly bridged
> + * (currently UDP tunnels) can't always rely on IP forwarding paths to handle
> + * PMTU discovery. In the bridged case, ICMP or ICMPv6 messages need to be built
> + * based on payload and sent back by the encapsulation itself.
> + *
> + * For routable interfaces, we just need to update the PMTU for the destination.
> + *
> + * Return: 0 if ICMP error not needed, length if built, negative value on error
> + */
> +static inline int skb_tunnel_check_pmtu(struct sk_buff *skb,
> + struct dst_entry *encap_dst,
> + int headroom, bool reply)
Given its size, this is probably better as a function. I believe it can
go into net/ipv4/ip_tunnel_core.c like you have iptunnel_pmtud_build_icmp.
> +{
> + u32 mtu = dst_mtu(encap_dst) - headroom;
> +
> + if ((skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) ||
> + (!skb_is_gso(skb) && (skb->len - skb_mac_header_len(skb)) <= mtu))
> + return 0;
> +
> + skb_dst_update_pmtu_no_confirm(skb, mtu);
> +
> + if (!reply || skb->pkt_type == PACKET_HOST)
> + return 0;
> +
> + if (skb->protocol == htons(ETH_P_IP) && mtu > 576) {
I am surprised the 576 does not have an existing macro.
> + const struct icmphdr *icmph = icmp_hdr(skb);
> + const struct iphdr *iph = ip_hdr(skb);
> +
> + if (iph->frag_off != htons(IP_DF) ||
> + ipv4_is_lbcast(iph->daddr) ||
> + ipv4_is_multicast(iph->daddr) ||
> + ipv4_is_zeronet(iph->saddr) ||
> + ipv4_is_loopback(iph->saddr) ||
> + ipv4_is_lbcast(iph->saddr) ||
> + ipv4_is_multicast(iph->saddr) ||
> + (iph->protocol == IPPROTO_ICMP && icmp_is_err(icmph->type)))
> + return 0;
> +
> + return iptunnel_pmtud_build_icmp(skb, mtu);
> + }
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> + if (skb->protocol == htons(ETH_P_IPV6) && mtu > IPV6_MIN_MTU) {
> + const struct ipv6hdr *ip6h = ipv6_hdr(skb);
> + int stype = ipv6_addr_type(&ip6h->saddr);
> + u8 proto = ip6h->nexthdr;
> + __be16 frag_off;
> + int offset;
> +
> + if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST ||
> + stype == IPV6_ADDR_LOOPBACK)
> + return 0;
> +
> + offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &proto,
> + &frag_off);
> + if (offset < 0 || (frag_off & htons(~0x7)))
> + return 0;
> +
> + if (proto == IPPROTO_ICMPV6) {
> + struct icmp6hdr *icmp6h;
> +
> + if (!pskb_may_pull(skb, (skb_network_header(skb) +
> + offset + 1 - skb->data)))
> + return 0;
> +
> + icmp6h = (struct icmp6hdr *)(skb_network_header(skb) +
> + offset);
> + if (icmpv6_is_err(icmp6h->icmp6_type) ||
> + icmp6h->icmp6_type == NDISC_REDIRECT)
> + return 0;
> + }
> +
> + return iptunnel_pmtud_build_icmp(skb, mtu);
> + }
> +#endif
separate v4 and v6 code into helpers based on skb->protocol; the mtu
check then becomes part of the version specific helpers.
> + return 0;
> +}
> +
> static inline void *ip_tunnel_info_opts(struct ip_tunnel_info *info)
> {
> return info + 1;
> diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
> index f8b419e2475c..54a3dbf7f512 100644
> --- a/net/ipv4/ip_tunnel_core.c
> +++ b/net/ipv4/ip_tunnel_core.c
> @@ -184,6 +184,128 @@ int iptunnel_handle_offloads(struct sk_buff *skb,
> }
> EXPORT_SYMBOL_GPL(iptunnel_handle_offloads);
>
> +/**
> + * iptunnel_pmtud_build_icmp() - Build ICMP or ICMPv6 error message for PMTUD
> + * @skb: Original packet with L2 header
> + * @mtu: MTU value for ICMP or ICMPv6 error
> + *
> + * Return: length on success, negative error code if message couldn't be built.
> + */
> +int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu)
> +{
> + struct ethhdr eh;
> + int len, err;
> +
> + if (skb->protocol == htons(ETH_P_IP))
> + len = ETH_HLEN + sizeof(struct iphdr);
> + else if (IS_ENABLED(CONFIG_IPV6) && skb->protocol == htons(ETH_P_IPV6))
> + len = ETH_HLEN + sizeof(struct ipv6hdr);
> + else
> + return 0;
> +
> + if (!pskb_may_pull(skb, len))
> + return -EINVAL;
> +
> + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN);
> + pskb_pull(skb, ETH_HLEN);
> + skb_reset_network_header(skb);
> +
> + if (skb->protocol == htons(ETH_P_IP)) {
> + const struct iphdr *iph = ip_hdr(skb);
> + struct icmphdr *icmph;
> + struct iphdr *niph;
> +
> + err = pskb_trim(skb, 576 - sizeof(*niph) - sizeof(*icmph));
> + if (err)
> + return err;
> +
> + len = skb->len + sizeof(*icmph);
> + err = skb_cow(skb, sizeof(*niph) + sizeof(*icmph) + ETH_HLEN);
> + if (err)
> + return err;
> +
> + icmph = skb_push(skb, sizeof(*icmph));
> + *icmph = (struct icmphdr) {
> + .type = ICMP_DEST_UNREACH,
> + .code = ICMP_FRAG_NEEDED,
> + .checksum = 0,
> + .un.frag.__unused = 0,
> + .un.frag.mtu = ntohs(mtu),
> + };
> + icmph->checksum = ip_compute_csum(icmph, len);
> + skb_reset_transport_header(skb);
> +
> + niph = skb_push(skb, sizeof(*niph));
> + *niph = (struct iphdr) {
> + .ihl = sizeof(*niph) / 4u,
> + .version = 4,
> + .tos = 0,
> + .tot_len = htons(len + sizeof(*niph)),
> + .id = 0,
> + .frag_off = htons(IP_DF),
> + .ttl = iph->ttl,
> + .protocol = IPPROTO_ICMP,
> + .saddr = iph->daddr,
> + .daddr = iph->saddr,
> + };
> + ip_send_check(niph);
> + }
> +
> +#if IS_ENABLED(CONFIG_IPV6)
> + else if (skb->protocol == htons(ETH_P_IPV6)) {
> + const struct ipv6hdr *ip6h = ipv6_hdr(skb);
> + struct icmp6hdr *icmp6h;
> + struct ipv6hdr *nip6h;
> + __wsum csum;
> +
> + err = pskb_trim(skb, IPV6_MIN_MTU -
> + sizeof(*nip6h) - sizeof(*icmp6h));
> + if (err)
> + return err;
> +
> + len = skb->len + sizeof(*icmp6h);
> + err = skb_cow(skb, sizeof(*nip6h) + sizeof(*icmp6h) + ETH_HLEN);
> + if (err)
> + return err;
> +
> + icmp6h = skb_push(skb, sizeof(*icmp6h));
> + *icmp6h = (struct icmp6hdr) {
> + .icmp6_type = ICMPV6_PKT_TOOBIG,
> + .icmp6_code = 0,
> + .icmp6_cksum = 0,
> + .icmp6_mtu = htonl(mtu),
> + };
> + skb_reset_transport_header(skb);
> +
> + nip6h = skb_push(skb, sizeof(*nip6h));
> + *nip6h = (struct ipv6hdr) {
> + .priority = 0,
> + .version = 6,
> + .flow_lbl = { 0 },
> + .payload_len = htons(len),
> + .nexthdr = IPPROTO_ICMPV6,
> + .hop_limit = ip6h->hop_limit,
> + .saddr = ip6h->daddr,
> + .daddr = ip6h->saddr,
> + };
> +
> + csum = csum_partial(icmp6h, len, 0);
> + icmp6h->icmp6_cksum = csum_ipv6_magic(&nip6h->saddr,
> + &nip6h->daddr, len,
> + IPPROTO_ICMPV6, csum);
> + }
> +#endif
> +
> + skb_reset_network_header(skb);
> + skb->ip_summed = CHECKSUM_NONE;
> +
> + eth_header(skb, skb->dev, htons(eh.h_proto), eh.h_source, eh.h_dest, 0);
> + skb_reset_mac_header(skb);
> +
> + return skb->len;
> +}
> +EXPORT_SYMBOL(iptunnel_pmtud_build_icmp);
I think separate v4 and v6 versions would be more readable; the
duplication is mostly skb manipulation.
Powered by blists - more mailing lists