netdev - Re: [PATCH net-next v2] mpls: support for dead routes

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5638CDE4.2080501@brocade.com>
Date:	Tue, 3 Nov 2015 15:08:20 +0000
From:	Robert Shearman <rshearma@...cade.com>
To:	Roopa Prabhu <roopa@...ulusnetworks.com>, <ebiederm@...ssion.com>
CC:	<davem@...emloft.net>, <netdev@...r.kernel.org>
Subject: Re: [PATCH net-next v2] mpls: support for dead routes

On 03/11/15 05:13, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@...ulusnetworks.com>
>
> Adds support for RTNH_F_DEAD and RTNH_F_LINKDOWN flags on mpls
> routes due to link events. Also adds code to ignore dead
> routes during route selection
>
> Signed-off-by: Roopa Prabhu <roopa@...ulusnetworks.com>
> ---
> RFC to v1:
>          Addressed a few comments from Eric and Robert:
>          - remove support for weighted nexthops
>          - Use rt_nhn_alive in the rt structure to keep count of alive
>          routes.
>          What i have not done is: sort nexthops on link events.
>          I am not comfortable recreating or sorting nexthops on
>          every carrier change. This leaves scope for optimizing in the future
>
> v1 to v2:
> 	Fix dead nexthop checks as suggested by dave
>
>   net/mpls/af_mpls.c  | 191 ++++++++++++++++++++++++++++++++++++++++++++--------
>   net/mpls/internal.h |   3 +
>   2 files changed, 166 insertions(+), 28 deletions(-)
>
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index c70d750..5e88118 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -96,22 +96,15 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
>   }
>   EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
>
> -static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> -					     struct sk_buff *skb, bool bos)
> +static u32 mpls_multipath_hash(struct mpls_route *rt,
> +			       struct sk_buff *skb, bool bos)
>   {
>   	struct mpls_entry_decoded dec;
>   	struct mpls_shim_hdr *hdr;
>   	bool eli_seen = false;
>   	int label_index;
> -	int nh_index = 0;
>   	u32 hash = 0;
>
> -	/* No need to look further into packet if there's only
> -	 * one path
> -	 */
> -	if (rt->rt_nhn == 1)
> -		goto out;
> -
>   	for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
>   	     label_index++) {
>   		if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
> @@ -165,9 +158,37 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
>   		}
>   	}
>
> -	nh_index = hash % rt->rt_nhn;
> +	return hash;
> +}
> +
> +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> +					     struct sk_buff *skb, bool bos)
> +{
> +	u32 hash = 0;
> +	int nh_index;
> +	int n = 0;
> +
> +	/* No need to look further into packet if there's only
> +	 * one path
> +	 */
> +	if (rt->rt_nhn == 1)
> +		goto out;
> +
> +	if (rt->rt_nhn_alive <= 0)
> +		return NULL;
> +
> +	hash = mpls_multipath_hash(rt, skb, bos);
> +	nh_index = hash % rt->rt_nhn_alive;
> +	for_nexthops(rt) {

This should be optimised in the common rt->rt_nhn_alive == rt->rt_nhn 
case by doing the direct array index and avoid the nh walk.

> +		if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
> +			continue;
> +		if (n == nh_index)
> +			return nh;
> +		n++;
> +	} endfor_nexthops(rt);
> +
>   out:
> -	return &rt->rt_nh[nh_index];
> +	return &rt->rt_nh[0];
>   }
>
>   static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
> @@ -365,6 +386,7 @@ static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
>   		     GFP_KERNEL);
>   	if (rt) {
>   		rt->rt_nhn = num_nh;
> +		rt->rt_nhn_alive = num_nh;
>   		rt->rt_max_alen = max_alen_aligned;
>   	}
>
> @@ -536,6 +558,15 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt,
>
>   	RCU_INIT_POINTER(nh->nh_dev, dev);
>
> +	if (!netif_carrier_ok(dev))
> +		nh->nh_flags |= RTNH_F_LINKDOWN;
> +
> +	if (!(dev->flags & IFF_UP))
> +		nh->nh_flags |= RTNH_F_DEAD;

Below you're testing IFF_RUNNING | IFF_LOWER_UP. Is the difference 
intended here?

> +
> +	if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
> +		rt->rt_nhn_alive--;

You don't update your new rt->rt_flags field here. This highlights the 
issue with duplicating data, which you're doing with the rt_flags field.

> +
>   	return 0;
>
>   errout:
> @@ -577,7 +608,7 @@ errout:
>   }
>
>   static int mpls_nh_build(struct net *net, struct mpls_route *rt,
> -			 struct mpls_nh *nh, int oif,
> +			 struct mpls_nh *nh, int oif, int hops,

This change isn't required.

>   			 struct nlattr *via, struct nlattr *newdst)
>   {
>   	int err = -ENOMEM;
> @@ -666,7 +697,7 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
>   		/* neither weighted multipath nor any flags
>   		 * are supported
>   		 */
> -		if (rtnh->rtnh_hops || rtnh->rtnh_flags)
> +		if (rtnh->rtnh_flags || rtnh->rtnh_flags)

As the build bot has pointed out, this change is not entirely correct.

>   			goto errout;
>
>   		attrlen = rtnh_attrlen(rtnh);
> @@ -681,8 +712,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
>   			goto errout;
>
>   		err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
> -				    rtnh->rtnh_ifindex, nla_via,
> -				    nla_newdst);
> +				    rtnh->rtnh_ifindex, rtnh->rtnh_hops,
> +				    nla_via, nla_newdst);

This change isn't required.

>   		if (err)
>   			goto errout;
>
> @@ -875,34 +906,100 @@ free:
>   	return ERR_PTR(err);
>   }
>
> -static void mpls_ifdown(struct net_device *dev)
> +static void mpls_ifdown(struct net_device *dev, int event)
>   {
>   	struct mpls_route __rcu **platform_label;
>   	struct net *net = dev_net(dev);
> -	struct mpls_dev *mdev;
>   	unsigned index;
> +	int dead;
>
>   	platform_label = rtnl_dereference(net->mpls.platform_label);
>   	for (index = 0; index < net->mpls.platform_labels; index++) {
>   		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
> +
>   		if (!rt)
>   			continue;
> +		dead = 0;
>   		for_nexthops(rt) {
> +			if ((event == NETDEV_DOWN &&
> +			     (nh->nh_flags & RTNH_F_DEAD)) ||
> +			     (event == NETDEV_CHANGE &&
> +			     (nh->nh_flags & RTNH_F_LINKDOWN))) {
> +				dead++;
> +				continue;
> +			}
> +
>   			if (rtnl_dereference(nh->nh_dev) != dev)
>   				continue;
> -			nh->nh_dev = NULL;
> +			switch (event) {
> +			case NETDEV_DOWN:
> +			case NETDEV_UNREGISTER:
> +				nh->nh_flags |= RTNH_F_DEAD;
> +				/* fall through */
> +			case NETDEV_CHANGE:
> +				nh->nh_flags |= RTNH_F_LINKDOWN;
> +				rt->rt_nhn_alive--;

Are these operations atomic on all architectures?

Even if they are, I'm a bit worried about the RCU-correctness of this. I 
think the fallthrough case in mpls_select_multipath saves you, causing 
traffic to get via nh0 instead of the one selected by the hash (which is 
fine transiently).

> +				break;
> +			}
> +			if (event == NETDEV_UNREGISTER) {
> +				nh->nh_dev = NULL;

I realise this was just moved from the original code, but I think this 
should be at least RCU_INIT_POINTER(nh->nh_dev, NULL), if not 
rcu_assign_pointer.

> +				dead = rt->rt_nhn;
> +				break;
> +			}
> +			dead++;
>   		} endfor_nexthops(rt);
> +
> +		if (dead == rt->rt_nhn) {
> +			switch (event) {
> +			case NETDEV_DOWN:
> +			case NETDEV_UNREGISTER:
> +				rt->rt_flags |= RTNH_F_DEAD;
> +				/* fall through */
> +			case NETDEV_CHANGE:
> +				rt->rt_flags |= RTNH_F_LINKDOWN;
> +				rt->rt_nhn_alive = 0;

Won't rt_nhn_alive be zero already at this point?

There's also the problem that depending on the type of the last event, 
rt->rt_flags will get set differently.

E.g.

- NETDEV_DOWN for nh0 followed by NETDEV_CHANGE[down] for nh1 then 
rt->rt_flags will be RTNH_F_LINKDOWN.
- NETDEV_CHANGE[down] for nh1 followed by NETDEV_DOWN for nh0 then 
rt->rt_flags will be RTNH_F_LINKDOWN|RTNH_F_DEAD.

That doesn't seem like desirable behaviour. What are expected semantics?

> +				break;
> +			}
> +		}
>   	}
>
> -	mdev = mpls_dev_get(dev);
> -	if (!mdev)
> -		return;
> +	return;
> +}
> +
> +static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
> +{
> +	struct mpls_route __rcu **platform_label;
> +	struct net *net = dev_net(dev);
> +	unsigned index;
> +	int alive;
> +
> +	platform_label = rtnl_dereference(net->mpls.platform_label);
> +	for (index = 0; index < net->mpls.platform_labels; index++) {
> +		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
> +
> +		if (!rt)
> +			continue;
> +		alive = 0;
> +		for_nexthops(rt) {
> +			struct net_device *nh_dev =
> +				rtnl_dereference(nh->nh_dev);
>
> -	mpls_dev_sysctl_unregister(mdev);
> +			if (!(nh->nh_flags & nh_flags)) {
> +				alive++;
> +				continue;
> +			}
> +			if (nh_dev != dev)
> +				continue;
> +			alive++;
> +			nh->nh_flags &= ~nh_flags;
> +		} endfor_nexthops(rt);
>
> -	RCU_INIT_POINTER(dev->mpls_ptr, NULL);
> +		if (alive > 0)
> +			rt->rt_flags &= ~nh_flags;

Again, this creates a dependence on the ordering of events.

> +		rt->rt_nhn_alive = alive;
> +	}
>
> -	kfree_rcu(mdev, rcu);
> +	return;
>   }
>
>   static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
> @@ -910,9 +1007,9 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
>   {
>   	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>   	struct mpls_dev *mdev;
> +	unsigned int flags;
>
> -	switch(event) {
> -	case NETDEV_REGISTER:
> +	if (event == NETDEV_REGISTER) {
>   		/* For now just support ethernet devices */
>   		if ((dev->type == ARPHRD_ETHER) ||
>   		    (dev->type == ARPHRD_LOOPBACK)) {
> @@ -920,10 +1017,39 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
>   			if (IS_ERR(mdev))
>   				return notifier_from_errno(PTR_ERR(mdev));
>   		}
> -		break;
> +		return NOTIFY_OK;
> +	}
>
> +	mdev = mpls_dev_get(dev);
> +	if (!mdev)
> +		return NOTIFY_OK;
> +
> +	switch (event) {
> +	case NETDEV_DOWN:
> +		mpls_ifdown(dev, event);
> +		break;
> +	case NETDEV_UP:
> +		flags = dev_get_flags(dev);
> +		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
> +			mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
> +		else
> +			mpls_ifup(dev, RTNH_F_DEAD);
> +		break;
> +	case NETDEV_CHANGE:
> +		flags = dev_get_flags(dev);
> +		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
> +			mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
> +		else
> +			mpls_ifdown(dev, event);
> +		break;
>   	case NETDEV_UNREGISTER:
> -		mpls_ifdown(dev);
> +		mpls_ifdown(dev, event);
> +		mdev = mpls_dev_get(dev);
> +		if (mdev) {
> +			mpls_dev_sysctl_unregister(mdev);
> +			RCU_INIT_POINTER(dev->mpls_ptr, NULL);
> +			kfree_rcu(mdev, rcu);
> +		}
>   		break;
>   	case NETDEV_CHANGENAME:
>   		mdev = mpls_dev_get(dev);
> @@ -1237,6 +1363,10 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   		dev = rtnl_dereference(nh->nh_dev);
>   		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
>   			goto nla_put_failure;
> +		if (nh->nh_flags & RTNH_F_LINKDOWN)
> +			rtm->rtm_flags |= RTNH_F_LINKDOWN;
> +		if (nh->nh_flags & RTNH_F_DEAD)
> +			rtm->rtm_flags |= RTNH_F_DEAD;
>   	} else {
>   		struct rtnexthop *rtnh;
>   		struct nlattr *mp;
> @@ -1253,6 +1383,11 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   			dev = rtnl_dereference(nh->nh_dev);
>   			if (dev)
>   				rtnh->rtnh_ifindex = dev->ifindex;
> +			if (nh->nh_flags & RTNH_F_LINKDOWN)
> +				rtnh->rtnh_flags |= RTNH_F_LINKDOWN;
> +			if (nh->nh_flags & RTNH_F_DEAD)
> +				rtnh->rtnh_flags |= RTNH_F_DEAD;
> +

You never read from rt->rt_flags. Was your intention to set 
rtm->rtm_flags using that field in this function?

>   			if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
>   							    nh->nh_labels,
>   							    nh->nh_label))
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index bde52ce..4f9bf2b 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -41,6 +41,7 @@ enum mpls_payload_type {
>
>   struct mpls_nh { /* next hop label forwarding entry */
>   	struct net_device __rcu *nh_dev;
> +	unsigned int		nh_flags;

This could be implemented as a u8 (or even two 1-bit fields) after 
nh_via_table (making use of the 1-byte hole already there) without 
increasing the size of the structure by a fifth.

>   	u32			nh_label[MAX_NEW_LABELS];
>   	u8			nh_labels;
>   	u8			nh_via_alen;
> @@ -70,10 +71,12 @@ struct mpls_nh { /* next hop label forwarding entry */
>    */
>   struct mpls_route { /* next hop label forwarding entry */
>   	struct rcu_head		rt_rcu;
> +	unsigned int		rt_flags;

Storing this is unnecessary - it can be derived from rt_nhn_alive == 0 
and looking at the nexthop flags, which also avoids the event ordering 
problems described above.

Thanks,
Rob

>   	u8			rt_protocol;
>   	u8			rt_payload_type;
>   	u8			rt_max_alen;
>   	unsigned int		rt_nhn;
> +	unsigned int		rt_nhn_alive;
>   	struct mpls_nh		rt_nh[0];
>   };
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html