netdev - Re: [RFC net-next 3/3] mpls: new ipmpls device for encapsulating IP packets as mpls

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <87a8wivue4.fsf@x220.int.ebiederm.org>
Date:	Tue, 02 Jun 2015 13:26:43 -0500
From:	ebiederm@...ssion.com (Eric W. Biederman)
To:	Robert Shearman <rshearma@...cade.com>
Cc:	<netdev@...r.kernel.org>, roopa <roopa@...ulusnetworks.com>,
	Thomas Graf <tgraf@...g.ch>
Subject: Re: [RFC net-next 3/3] mpls: new ipmpls device for encapsulating IP packets as mpls

Robert Shearman <rshearma@...cade.com> writes:

> Allow creating an mpls device for the purposes of encapsulating IP
> packets with:
>
>   ip link add type ipmpls
>
> This device defines its per-nexthop encapsulation data as a stack of
> labels, in the same format as for RTA_NEWDST. It uses the encap data
> which will have been stored in the IP route to encapsulate the packet
> with that stack of labels, with the last label corresponding to a
> local label that defines how the packet will be sent out. The device
> sends packets over loopback to the local MPLS forwarding logic which
> performs all of the work.
>
> Stats are implemented, although any error in the sending via the real
> interface will be handled by the main mpls forwarding code and so not
> accounted by the interface.

Eeek stats!  Lots of unnecessary overhead.  If stats were ok we could
have simply reduced the cost of struct net_device to the point where it
would not matter.

This is really a bad hack for not getting in and being able to set
dst_output the way the xfrm infrastructure does.

What we really want here is xfrm-lite.  By lite I mean the tunnel
selection criteria are simple enough that they fit into the normal
routing table instead of having to do weird flow based magic that
is rarely needed.

I believe what we want is the xfrm stacking of dst entries.

Eric


> This implementation is based on an alternative earlier implementation
> by Eric W. Biederman.
>
> Signed-off-by: Robert Shearman <rshearma@...cade.com>
> ---
>  include/uapi/linux/if_arp.h |   1 +
>  net/mpls/Kconfig            |   5 +
>  net/mpls/Makefile           |   1 +
>  net/mpls/af_mpls.c          |   2 +
>  net/mpls/ipmpls.c           | 284 ++++++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 293 insertions(+)
>  create mode 100644 net/mpls/ipmpls.c
>
> diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
> index 4d024d75d64b..17d669fd1781 100644
> --- a/include/uapi/linux/if_arp.h
> +++ b/include/uapi/linux/if_arp.h
> @@ -88,6 +88,7 @@
>  #define ARPHRD_IEEE80211_RADIOTAP 803	/* IEEE 802.11 + radiotap header */
>  #define ARPHRD_IEEE802154	  804
>  #define ARPHRD_IEEE802154_MONITOR 805	/* IEEE 802.15.4 network monitor */
> +#define ARPHRD_MPLS	806		/* IP and IPv6 over MPLS tunnels */
>  
>  #define ARPHRD_PHONET	820		/* PhoNet media type		*/
>  #define ARPHRD_PHONET_PIPE 821		/* PhoNet pipe header		*/
> diff --git a/net/mpls/Kconfig b/net/mpls/Kconfig
> index 17bde799c854..5264da94733a 100644
> --- a/net/mpls/Kconfig
> +++ b/net/mpls/Kconfig
> @@ -27,4 +27,9 @@ config MPLS_ROUTING
>  	help
>  	 Add support for forwarding of mpls packets.
>  
> +config MPLS_IPTUNNEL
> +	tristate "MPLS: IP over MPLS tunnel support"
> +	help
> +	 A network device that encapsulates ip packets as mpls
> +
>  endif # MPLS
> diff --git a/net/mpls/Makefile b/net/mpls/Makefile
> index 65bbe68c72e6..3a93c14b23c5 100644
> --- a/net/mpls/Makefile
> +++ b/net/mpls/Makefile
> @@ -3,5 +3,6 @@
>  #
>  obj-$(CONFIG_NET_MPLS_GSO) += mpls_gso.o
>  obj-$(CONFIG_MPLS_ROUTING) += mpls_router.o
> +obj-$(CONFIG_MPLS_IPTUNNEL) += ipmpls.o
>  
>  mpls_router-y := af_mpls.o
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index 7b3f732269e4..68bdfbdddfaf 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -615,6 +615,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
>  
>  	return 0;
>  }
> +EXPORT_SYMBOL(nla_put_labels);
>  
>  int nla_get_labels(const struct nlattr *nla,
>  		   u32 max_labels, u32 *labels, u32 label[])
> @@ -660,6 +661,7 @@ int nla_get_labels(const struct nlattr *nla,
>  	*labels = nla_labels;
>  	return 0;
>  }
> +EXPORT_SYMBOL(nla_get_labels);
>  
>  static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
>  			       struct mpls_route_config *cfg)
> diff --git a/net/mpls/ipmpls.c b/net/mpls/ipmpls.c
> new file mode 100644
> index 000000000000..cf6894ae0c61
> --- /dev/null
> +++ b/net/mpls/ipmpls.c
> @@ -0,0 +1,284 @@
> +#include <linux/types.h>
> +#include <linux/netdevice.h>
> +#include <linux/if_vlan.h>
> +#include <linux/if_arp.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <linux/module.h>
> +#include <linux/mpls.h>
> +#include "internal.h"
> +
> +static LIST_HEAD(ipmpls_dev_list);
> +
> +#define MAX_NEW_LABELS 2
> +
> +struct ipmpls_dev_priv {
> +	struct net_device *out_dev;
> +	struct list_head list;
> +	struct net_device *dev;
> +};
> +
> +static netdev_tx_t ipmpls_dev_xmit(struct sk_buff *skb, struct net_device *dev)
> +{
> +	struct ipmpls_dev_priv *priv = netdev_priv(dev);
> +	struct net_device *out_dev = priv->out_dev;
> +	struct mpls_shim_hdr *hdr;
> +	bool bottom_of_stack = true;
> +	int len = skb->len;
> +	const void *encap;
> +	int num_labels;
> +	unsigned ttl;
> +	const u32 *labels;
> +	int ret;
> +	int i;
> +
> +	num_labels = dst_get_encap(skb, &encap) / 4;
> +	if (!num_labels)
> +		goto drop;
> +
> +	labels = encap;
> +
> +	/* Obtain the ttl */
> +	if (skb->protocol == htons(ETH_P_IP)) {
> +		ttl = ip_hdr(skb)->ttl;
> +	} else if (skb->protocol == htons(ETH_P_IPV6)) {
> +		ttl = ipv6_hdr(skb)->hop_limit;
> +	} else if (skb->protocol == htons(ETH_P_MPLS_UC)) {
> +		ttl = mpls_entry_decode(mpls_hdr(skb)).ttl;
> +		bottom_of_stack = false;
> +	} else {
> +		goto drop;
> +	}
> +
> +	/* Now that the encap has been retrieved, there's no longer
> +	 * any need to keep the dst around so clear it out.
> +	 */
> +	skb_dst_drop(skb);
> +	skb_orphan(skb);
> +
> +	skb->inner_protocol = skb->protocol;
> +	skb->inner_network_header = skb->network_header;
> +
> +	skb_push(skb, num_labels * sizeof(*hdr));
> +	skb_reset_network_header(skb);
> +	hdr = mpls_hdr(skb);
> +
> +	for (i = num_labels - 1; i >= 0; i--) {
> +		hdr[i] = mpls_entry_encode(labels[i], ttl, 0, bottom_of_stack);
> +		bottom_of_stack = false;
> +	}
> +
> +	skb->dev = out_dev;
> +	skb->protocol = htons(ETH_P_MPLS_UC);
> +
> +	ret = dev_hard_header(skb, out_dev, ETH_P_MPLS_UC,
> +			      out_dev->dev_addr, NULL, len);
> +	if (ret >= 0)
> +		ret = dev_queue_xmit(skb);
> +	if (ret)
> +		goto drop;
> +
> +	dev->stats.tx_packets++;
> +	dev->stats.tx_bytes += len;
> +
> +	return 0;
> +
> +drop:
> +	dev->stats.tx_dropped++;
> +	kfree_skb(skb);
> +	return NETDEV_TX_OK;
> +}
> +
> +static int ipmpls_dev_init(struct net_device *dev)
> +{
> +	struct ipmpls_dev_priv *priv = netdev_priv(dev);
> +
> +	list_add_tail(&priv->list, &ipmpls_dev_list);
> +
> +	return 0;
> +}
> +
> +static void ipmpls_dev_uninit(struct net_device *dev)
> +{
> +	struct ipmpls_dev_priv *priv = netdev_priv(dev);
> +
> +	list_del_init(&priv->list);
> +}
> +
> +static void ipmpls_dev_free(struct net_device *dev)
> +{
> +	free_netdev(dev);
> +}
> +
> +static const struct net_device_ops ipmpls_netdev_ops = {
> +	.ndo_init		= ipmpls_dev_init,
> +	.ndo_start_xmit		= ipmpls_dev_xmit,
> +	.ndo_uninit		= ipmpls_dev_uninit,
> +};
> +
> +#define IPMPLS_FEATURES (NETIF_F_SG |			\
> +			 NETIF_F_FRAGLIST |		\
> +			 NETIF_F_HIGHDMA |		\
> +			 NETIF_F_VLAN_CHALLENGED)
> +
> +static void ipmpls_dev_setup(struct net_device *dev)
> +{
> +	dev->netdev_ops		= &ipmpls_netdev_ops;
> +
> +	dev->type		= ARPHRD_MPLS;
> +	dev->flags		= IFF_NOARP;
> +	netif_keep_dst(dev);
> +	dev->addr_len		= 0;
> +	dev->features		|= NETIF_F_LLTX;
> +	dev->features		|= IPMPLS_FEATURES;
> +	dev->hw_features	|= IPMPLS_FEATURES;
> +	dev->vlan_features	= 0;
> +
> +	dev->destructor = ipmpls_dev_free;
> +}
> +
> +static int ipmpls_dev_validate(struct nlattr *tb[], struct nlattr *data[])
> +{
> +	return 0;
> +}
> +
> +static int ipmpls_dev_newlink(struct net *src_net, struct net_device *dev,
> +			      struct nlattr *tb[], struct nlattr *data[])
> +{
> +	struct ipmpls_dev_priv *priv = netdev_priv(dev);
> +
> +	priv->out_dev = src_net->loopback_dev;
> +	priv->dev = dev;
> +
> +	dev->hard_header_len =
> +		priv->out_dev->hard_header_len +
> +		sizeof(struct mpls_shim_hdr) * MAX_NEW_LABELS;
> +
> +	return register_netdevice(dev);
> +}
> +
> +static void ipmpls_dev_dellink(struct net_device *dev, struct list_head *head)
> +{
> +	unregister_netdevice_queue(dev, head);
> +}
> +
> +static int ipmpls_dev_parse_encap(const struct net_device *dev,
> +				  const struct nlattr *nla,
> +				  void *encap)
> +{
> +	u32 labels;
> +
> +	if (nla_len(nla) / 4 > MAX_NEW_LABELS)
> +		return -EINVAL;
> +
> +	if (encap && nla_get_labels(nla, MAX_NEW_LABELS, &labels, encap))
> +		return -EINVAL;
> +
> +	/* Stored encap size is the same as the rtnl encap len */
> +	return nla_len(nla);
> +}
> +
> +static int ipmpls_dev_fill_encap(const struct net_device *dev,
> +				 struct sk_buff *skb, int encap_len,
> +				 const void *encap)
> +{
> +	return nla_put_labels(skb, RTA_ENCAP, encap_len / 4, encap);
> +}
> +
> +static int ipmpls_dev_match_encap(const struct net_device *dev,
> +				  const struct nlattr *nla, int encap_len,
> +				  const void *encap)
> +{
> +	unsigned nla_labels;
> +	struct mpls_shim_hdr *nla_label;
> +	const u32 *stored_labels = encap;
> +	int i;
> +
> +	/* Stored encap size is the same as the rtnl encap len */
> +	if (nla_len(nla) != encap_len)
> +		return 1;
> +
> +	nla_labels = nla_len(nla) / 4;
> +	nla_label = nla_data(nla);
> +
> +	for (i = 0; i < nla_labels; i++) {
> +		struct mpls_entry_decoded dec;
> +
> +		dec = mpls_entry_decode(nla_label + i);
> +
> +		if (stored_labels[i] != dec.label)
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +static struct rtnl_link_ops ipmpls_ops = {
> +	.kind		= "ipmpls",
> +	.priv_size	= sizeof(struct ipmpls_dev_priv),
> +	.setup		= ipmpls_dev_setup,
> +	.validate	= ipmpls_dev_validate,
> +	.newlink	= ipmpls_dev_newlink,
> +	.dellink	= ipmpls_dev_dellink,
> +	.parse_encap	= ipmpls_dev_parse_encap,
> +	.fill_encap	= ipmpls_dev_fill_encap,
> +	.match_encap	= ipmpls_dev_match_encap,
> +};
> +
> +static int ipmpls_dev_notify(struct notifier_block *this, unsigned long event,
> +			     void *ptr)
> +{
> +	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> +
> +	if (event == NETDEV_UNREGISTER) {
> +		struct ipmpls_dev_priv *priv, *priv2;
> +		LIST_HEAD(list_kill);
> +
> +		/* Ignore netns device moves */
> +		if (dev->reg_state != NETREG_UNREGISTERING)
> +			goto done;
> +
> +		list_for_each_entry_safe(priv, priv2, &ipmpls_dev_list, list) {
> +			if (priv->out_dev != dev)
> +				continue;
> +
> +			ipmpls_dev_dellink(priv->dev, &list_kill);
> +		}
> +		unregister_netdevice_many(&list_kill);
> +	}
> +done:
> +	return NOTIFY_OK;
> +}
> +
> +static struct notifier_block ipmpls_dev_notifier = {
> +	.notifier_call = ipmpls_dev_notify,
> +};
> +
> +static int __init ipmpls_init(void)
> +{
> +	int err;
> +
> +	err = register_netdevice_notifier(&ipmpls_dev_notifier);
> +	if (err)
> +		goto out;
> +
> +	err = rtnl_link_register(&ipmpls_ops);
> +	if (err)
> +		goto out_unregister_notifier;
> +out:
> +	return err;
> +out_unregister_notifier:
> +	unregister_netdevice_notifier(&ipmpls_dev_notifier);
> +	goto out;
> +}
> +module_init(ipmpls_init);
> +
> +static void __exit ipmpls_exit(void)
> +{
> +	rtnl_link_unregister(&ipmpls_ops);
> +	unregister_netdevice_notifier(&ipmpls_dev_notifier);
> +}
> +module_exit(ipmpls_exit);
> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_ALIAS_RTNL_LINK("ipmpls");
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html