linux-kernel - Re: [net-next,v2,4/5] seg6: add support for the SRv6 End.DT4 behavior

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20201110151255.3a86afcc@kicinski-fedora-pc1c0hjn.dhcp.thefacebook.com>
Date:   Tue, 10 Nov 2020 15:12:55 -0800
From:   Jakub Kicinski <kuba@...nel.org>
To:     Andrea Mayer <andrea.mayer@...roma2.it>
Cc:     "David S. Miller" <davem@...emloft.net>,
        David Ahern <dsahern@...nel.org>,
        Alexey Kuznetsov <kuznet@....inr.ac.ru>,
        Hideaki YOSHIFUJI <yoshfuji@...ux-ipv6.org>,
        Shuah Khan <shuah@...nel.org>,
        Shrijeet Mukherjee <shrijeet@...il.com>,
        Alexei Starovoitov <ast@...nel.org>,
        Daniel Borkmann <daniel@...earbox.net>,
        Andrii Nakryiko <andrii@...nel.org>,
        Martin KaFai Lau <kafai@...com>,
        Song Liu <songliubraving@...com>, Yonghong Song <yhs@...com>,
        John Fastabend <john.fastabend@...il.com>,
        KP Singh <kpsingh@...omium.org>, netdev@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-kselftest@...r.kernel.org,
        Stefano Salsano <stefano.salsano@...roma2.it>,
        Paolo Lungaroni <paolo.lungaroni@...t.it>,
        Ahmed Abdelsalam <ahabdels.dev@...il.com>
Subject: Re: [net-next,v2,4/5] seg6: add support for the SRv6 End.DT4
 behavior

On Sat,  7 Nov 2020 16:31:38 +0100 Andrea Mayer wrote:
> SRv6 End.DT4 is defined in the SRv6 Network Programming [1].
> 
> The SRv6 End.DT4 is used to implement IPv4 L3VPN use-cases in
> multi-tenants environments. It decapsulates the received packets and it
> performs IPv4 routing lookup in the routing table of the tenant.
> 
> The SRv6 End.DT4 Linux implementation leverages a VRF device in order to
> force the routing lookup into the associated routing table.

How does the behavior of DT4 compare to DT6?

The implementation looks quite different.

> To make the End.DT4 work properly, it must be guaranteed that the routing
> table used for routing lookup operations is bound to one and only one
> VRF during the tunnel creation. Such constraint has to be enforced by
> enabling the VRF strict_mode sysctl parameter, i.e:
>  $ sysctl -wq net.vrf.strict_mode=1.
> 
> At JANOG44, LINE corporation presented their multi-tenant DC architecture
> using SRv6 [2]. In the slides, they reported that the Linux kernel is
> missing the support of SRv6 End.DT4 behavior.
> 
> The iproute2 counterpart required for configuring the SRv6 End.DT4
> behavior is already implemented along with the other supported SRv6
> behaviors [3].
> 
> [1] https://tools.ietf.org/html/draft-ietf-spring-srv6-network-programming
> [2] https://speakerdeck.com/line_developers/line-data-center-networking-with-srv6
> [3] https://patchwork.ozlabs.org/patch/799837/
> 
> Signed-off-by: Andrea Mayer <andrea.mayer@...roma2.it>
> ---
>  net/ipv6/seg6_local.c | 205 ++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 205 insertions(+)
> 
> diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c
> index 4b0f155d641d..a41074acd43e 100644
> --- a/net/ipv6/seg6_local.c
> +++ b/net/ipv6/seg6_local.c
> @@ -57,6 +57,14 @@ struct bpf_lwt_prog {
>  	char *name;
>  };
>  
> +struct seg6_end_dt4_info {
> +	struct net *net;
> +	/* VRF device associated to the routing table used by the SRv6 End.DT4
> +	 * behavior for routing IPv4 packets.
> +	 */
> +	int vrf_ifindex;
> +};
> +
>  struct seg6_local_lwt {
>  	int action;
>  	struct ipv6_sr_hdr *srh;
> @@ -66,6 +74,7 @@ struct seg6_local_lwt {
>  	int iif;
>  	int oif;
>  	struct bpf_lwt_prog bpf;
> +	struct seg6_end_dt4_info dt4_info;
>  
>  	int headroom;
>  	struct seg6_action_desc *desc;
> @@ -413,6 +422,194 @@ static int input_action_end_dx4(struct sk_buff *skb,
>  	return -EINVAL;
>  }
>  
> +#ifdef CONFIG_NET_L3_MASTER_DEV
> +

no need for this empty line.

> +static struct net *fib6_config_get_net(const struct fib6_config *fib6_cfg)
> +{
> +	const struct nl_info *nli = &fib6_cfg->fc_nlinfo;
> +
> +	return nli->nl_net;
> +}
> +
> +static int seg6_end_dt4_build(struct seg6_local_lwt *slwt, const void *cfg,
> +			      struct netlink_ext_ack *extack)
> +{
> +	struct seg6_end_dt4_info *info = &slwt->dt4_info;
> +	int vrf_ifindex;
> +	struct net *net;
> +
> +	net = fib6_config_get_net(cfg);
> +
> +	vrf_ifindex = l3mdev_ifindex_lookup_by_table_id(L3MDEV_TYPE_VRF, net,
> +							slwt->table);
> +	if (vrf_ifindex < 0) {
> +		if (vrf_ifindex == -EPERM) {
> +			NL_SET_ERR_MSG(extack,
> +				       "Strict mode for VRF is disabled");
> +		} else if (vrf_ifindex == -ENODEV) {
> +			NL_SET_ERR_MSG(extack, "No such device");

That's what -ENODEV already says.

> +		} else {
> +			NL_SET_ERR_MSG(extack, "Unknown error");

Useless error.

> +			pr_debug("seg6local: SRv6 End.DT4 creation error=%d\n",
> +				 vrf_ifindex);
> +		}
> +
> +		return vrf_ifindex;
> +	}
> +
> +	info->net = net;
> +	info->vrf_ifindex = vrf_ifindex;
> +
> +	return 0;
> +}
> +
> +/* The SRv6 End.DT4 behavior extracts the inner (IPv4) packet and routes the
> + * IPv4 packet by looking at the configured routing table.
> + *
> + * In the SRv6 End.DT4 use case, we can receive traffic (IPv6+Segment Routing
> + * Header packets) from several interfaces and the IPv6 destination address (DA)
> + * is used for retrieving the specific instance of the End.DT4 behavior that
> + * should process the packets.
> + *
> + * However, the inner IPv4 packet is not really bound to any receiving
> + * interface and thus the End.DT4 sets the VRF (associated with the
> + * corresponding routing table) as the *receiving* interface.
> + * In other words, the End.DT4 processes a packet as if it has been received
> + * directly by the VRF (and not by one of its slave devices, if any).
> + * In this way, the VRF interface is used for routing the IPv4 packet in
> + * according to the routing table configured by the End.DT4 instance.
> + *
> + * This design allows you to get some interesting features like:
> + *  1) the statistics on rx packets;
> + *  2) the possibility to install a packet sniffer on the receiving interface
> + *     (the VRF one) for looking at the incoming packets;
> + *  3) the possibility to leverage the netfilter prerouting hook for the inner
> + *     IPv4 packet.
> + *
> + * This function returns:
> + *  - the sk_buff* when the VRF rcv handler has processed the packet correctly;
> + *  - NULL when the skb is consumed by the VRF rcv handler;
> + *  - a pointer which encodes a negative error number in case of error.
> + *    Note that in this case, the function takes care of freeing the skb.
> + */
> +static struct sk_buff *end_dt4_vrf_rcv(struct sk_buff *skb,
> +				       struct net_device *dev)
> +{
> +	/* based on l3mdev_ip_rcv; we are only interested in the master */
> +	if (unlikely(!netif_is_l3_master(dev) && !netif_has_l3_rx_handler(dev)))
> +		goto drop;
> +
> +	if (unlikely(!dev->l3mdev_ops->l3mdev_l3_rcv))
> +		goto drop;
> +
> +	/* the decap packet (IPv4) does not come with any mac header info.
> +	 * We must unset the mac header to allow the VRF device to rebuild it,
> +	 * just in case there is a sniffer attached on the device.
> +	 */
> +	skb_unset_mac_header(skb);
> +
> +	skb = dev->l3mdev_ops->l3mdev_l3_rcv(dev, skb, AF_INET);
> +	if (!skb)
> +		/* the skb buffer was consumed by the handler */
> +		return NULL;
> +
> +	/* when a packet is received by a VRF or by one of its slaves, the
> +	 * master device reference is set into the skb.
> +	 */
> +	if (unlikely(skb->dev != dev || skb->skb_iif != dev->ifindex))
> +		goto drop;
> +
> +	return skb;
> +
> +drop:
> +	kfree_skb(skb);
> +	return ERR_PTR(-EINVAL);
> +}
> +
> +static struct net_device *end_dt4_get_vrf_rcu(struct sk_buff *skb,
> +					      struct seg6_end_dt4_info *info)
> +{
> +	int vrf_ifindex = info->vrf_ifindex;
> +	struct net *net = info->net;
> +
> +	if (unlikely(vrf_ifindex < 0))
> +		goto error;
> +
> +	if (unlikely(!net_eq(dev_net(skb->dev), net)))
> +		goto error;
> +
> +	return dev_get_by_index_rcu(net, vrf_ifindex);
> +
> +error:
> +	return NULL;
> +}
> +
> +static int input_action_end_dt4(struct sk_buff *skb,
> +				struct seg6_local_lwt *slwt)
> +{
> +	struct net_device *vrf;
> +	struct iphdr *iph;
> +	int err;
> +
> +	if (!decap_and_validate(skb, IPPROTO_IPIP))
> +		goto drop;
> +
> +	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
> +		goto drop;
> +
> +	vrf = end_dt4_get_vrf_rcu(skb, &slwt->dt4_info);
> +	if (unlikely(!vrf))
> +		goto drop;
> +
> +	skb->protocol = htons(ETH_P_IP);
> +
> +	skb_dst_drop(skb);
> +
> +	skb_set_transport_header(skb, sizeof(struct iphdr));
> +
> +	skb = end_dt4_vrf_rcv(skb, vrf);
> +	if (!skb)
> +		/* packet has been processed and consumed by the VRF */
> +		return 0;
> +
> +	if (IS_ERR(skb)) {
> +		err = PTR_ERR(skb);
> +		return err;

return PTR_ERR(skb)

> +	}
> +
> +	iph = ip_hdr(skb);
> +
> +	err = ip_route_input(skb, iph->daddr, iph->saddr, 0, skb->dev);
> +	if (err)
> +		goto drop;
> +
> +	return dst_input(skb);
> +
> +drop:
> +	kfree_skb(skb);
> +	return -EINVAL;
> +}
> +
> +#else
> +

new line not needed

> +static int seg6_end_dt4_build(struct seg6_local_lwt *slwt, const void *cfg,
> +			      struct netlink_ext_ack *extack)
> +{
> +	NL_SET_ERR_MSG(extack, "Operation is not supported");

This extack message probably could be more helpful. As it stands it's
basically 

> +
> +	return -EOPNOTSUPP;
> +}
> +
> +static int input_action_end_dt4(struct sk_buff *skb,
> +				struct seg6_local_lwt *slwt)

Maybe just ifdef out the part of the action table instead of creating
those stubs?

> +{
> +	kfree_skb(skb);
> +	return -EOPNOTSUPP;
> +}
> +
> +#endif
> +
>  static int input_action_end_dt6(struct sk_buff *skb,
>  				struct seg6_local_lwt *slwt)
>  {
> @@ -601,6 +798,14 @@ static struct seg6_action_desc seg6_action_table[] = {

BTW any idea why the action table is not marked as const?

Would you mind sending a patch to fix that?

>  		.attrs		= (1 << SEG6_LOCAL_NH4),
>  		.input		= input_action_end_dx4,
>  	},
> +	{
> +		.action		= SEG6_LOCAL_ACTION_END_DT4,
> +		.attrs		= (1 << SEG6_LOCAL_TABLE),
> +		.input		= input_action_end_dt4,
> +		.slwt_ops	= {
> +					.build_state = seg6_end_dt4_build,
> +				  },
> +	},
>  	{
>  		.action		= SEG6_LOCAL_ACTION_END_DT6,
>  		.attrs		= (1 << SEG6_LOCAL_TABLE),