[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5638CDE4.2080501@brocade.com>
Date: Tue, 3 Nov 2015 15:08:20 +0000
From: Robert Shearman <rshearma@...cade.com>
To: Roopa Prabhu <roopa@...ulusnetworks.com>, <ebiederm@...ssion.com>
CC: <davem@...emloft.net>, <netdev@...r.kernel.org>
Subject: Re: [PATCH net-next v2] mpls: support for dead routes
On 03/11/15 05:13, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@...ulusnetworks.com>
>
> Adds support for RTNH_F_DEAD and RTNH_F_LINKDOWN flags on mpls
> routes due to link events. Also adds code to ignore dead
> routes during route selection
>
> Signed-off-by: Roopa Prabhu <roopa@...ulusnetworks.com>
> ---
> RFC to v1:
> Addressed a few comments from Eric and Robert:
> - remove support for weighted nexthops
> - Use rt_nhn_alive in the rt structure to keep count of alive
> routes.
> What i have not done is: sort nexthops on link events.
> I am not comfortable recreating or sorting nexthops on
> every carrier change. This leaves scope for optimizing in the future
>
> v1 to v2:
> Fix dead nexthop checks as suggested by dave
>
> net/mpls/af_mpls.c | 191 ++++++++++++++++++++++++++++++++++++++++++++--------
> net/mpls/internal.h | 3 +
> 2 files changed, 166 insertions(+), 28 deletions(-)
>
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index c70d750..5e88118 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -96,22 +96,15 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
> }
> EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
>
> -static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> - struct sk_buff *skb, bool bos)
> +static u32 mpls_multipath_hash(struct mpls_route *rt,
> + struct sk_buff *skb, bool bos)
> {
> struct mpls_entry_decoded dec;
> struct mpls_shim_hdr *hdr;
> bool eli_seen = false;
> int label_index;
> - int nh_index = 0;
> u32 hash = 0;
>
> - /* No need to look further into packet if there's only
> - * one path
> - */
> - if (rt->rt_nhn == 1)
> - goto out;
> -
> for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
> label_index++) {
> if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
> @@ -165,9 +158,37 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> }
> }
>
> - nh_index = hash % rt->rt_nhn;
> + return hash;
> +}
> +
> +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> + struct sk_buff *skb, bool bos)
> +{
> + u32 hash = 0;
> + int nh_index;
> + int n = 0;
> +
> + /* No need to look further into packet if there's only
> + * one path
> + */
> + if (rt->rt_nhn == 1)
> + goto out;
> +
> + if (rt->rt_nhn_alive <= 0)
> + return NULL;
> +
> + hash = mpls_multipath_hash(rt, skb, bos);
> + nh_index = hash % rt->rt_nhn_alive;
> + for_nexthops(rt) {
This should be optimised in the common rt->rt_nhn_alive == rt->rt_nhn
case by doing the direct array index and avoid the nh walk.
> + if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
> + continue;
> + if (n == nh_index)
> + return nh;
> + n++;
> + } endfor_nexthops(rt);
> +
> out:
> - return &rt->rt_nh[nh_index];
> + return &rt->rt_nh[0];
> }
>
> static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
> @@ -365,6 +386,7 @@ static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
> GFP_KERNEL);
> if (rt) {
> rt->rt_nhn = num_nh;
> + rt->rt_nhn_alive = num_nh;
> rt->rt_max_alen = max_alen_aligned;
> }
>
> @@ -536,6 +558,15 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt,
>
> RCU_INIT_POINTER(nh->nh_dev, dev);
>
> + if (!netif_carrier_ok(dev))
> + nh->nh_flags |= RTNH_F_LINKDOWN;
> +
> + if (!(dev->flags & IFF_UP))
> + nh->nh_flags |= RTNH_F_DEAD;
Below you're testing IFF_RUNNING | IFF_LOWER_UP. Is the difference
intended here?
> +
> + if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
> + rt->rt_nhn_alive--;
You don't update your new rt->rt_flags field here. This highlights the
issue with duplicating data, which you're doing with the rt_flags field.
> +
> return 0;
>
> errout:
> @@ -577,7 +608,7 @@ errout:
> }
>
> static int mpls_nh_build(struct net *net, struct mpls_route *rt,
> - struct mpls_nh *nh, int oif,
> + struct mpls_nh *nh, int oif, int hops,
This change isn't required.
> struct nlattr *via, struct nlattr *newdst)
> {
> int err = -ENOMEM;
> @@ -666,7 +697,7 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
> /* neither weighted multipath nor any flags
> * are supported
> */
> - if (rtnh->rtnh_hops || rtnh->rtnh_flags)
> + if (rtnh->rtnh_flags || rtnh->rtnh_flags)
As the build bot has pointed out, this change is not entirely correct.
> goto errout;
>
> attrlen = rtnh_attrlen(rtnh);
> @@ -681,8 +712,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
> goto errout;
>
> err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
> - rtnh->rtnh_ifindex, nla_via,
> - nla_newdst);
> + rtnh->rtnh_ifindex, rtnh->rtnh_hops,
> + nla_via, nla_newdst);
This change isn't required.
> if (err)
> goto errout;
>
> @@ -875,34 +906,100 @@ free:
> return ERR_PTR(err);
> }
>
> -static void mpls_ifdown(struct net_device *dev)
> +static void mpls_ifdown(struct net_device *dev, int event)
> {
> struct mpls_route __rcu **platform_label;
> struct net *net = dev_net(dev);
> - struct mpls_dev *mdev;
> unsigned index;
> + int dead;
>
> platform_label = rtnl_dereference(net->mpls.platform_label);
> for (index = 0; index < net->mpls.platform_labels; index++) {
> struct mpls_route *rt = rtnl_dereference(platform_label[index]);
> +
> if (!rt)
> continue;
> + dead = 0;
> for_nexthops(rt) {
> + if ((event == NETDEV_DOWN &&
> + (nh->nh_flags & RTNH_F_DEAD)) ||
> + (event == NETDEV_CHANGE &&
> + (nh->nh_flags & RTNH_F_LINKDOWN))) {
> + dead++;
> + continue;
> + }
> +
> if (rtnl_dereference(nh->nh_dev) != dev)
> continue;
> - nh->nh_dev = NULL;
> + switch (event) {
> + case NETDEV_DOWN:
> + case NETDEV_UNREGISTER:
> + nh->nh_flags |= RTNH_F_DEAD;
> + /* fall through */
> + case NETDEV_CHANGE:
> + nh->nh_flags |= RTNH_F_LINKDOWN;
> + rt->rt_nhn_alive--;
Are these operations atomic on all architectures?
Even if they are, I'm a bit worried about the RCU-correctness of this. I
think the fallthrough case in mpls_select_multipath saves you, causing
traffic to get via nh0 instead of the one selected by the hash (which is
fine transiently).
> + break;
> + }
> + if (event == NETDEV_UNREGISTER) {
> + nh->nh_dev = NULL;
I realise this was just moved from the original code, but I think this
should be at least RCU_INIT_POINTER(nh->nh_dev, NULL), if not
rcu_assign_pointer.
> + dead = rt->rt_nhn;
> + break;
> + }
> + dead++;
> } endfor_nexthops(rt);
> +
> + if (dead == rt->rt_nhn) {
> + switch (event) {
> + case NETDEV_DOWN:
> + case NETDEV_UNREGISTER:
> + rt->rt_flags |= RTNH_F_DEAD;
> + /* fall through */
> + case NETDEV_CHANGE:
> + rt->rt_flags |= RTNH_F_LINKDOWN;
> + rt->rt_nhn_alive = 0;
Won't rt_nhn_alive be zero already at this point?
There's also the problem that depending on the type of the last event,
rt->rt_flags will get set differently.
E.g.
- NETDEV_DOWN for nh0 followed by NETDEV_CHANGE[down] for nh1 then
rt->rt_flags will be RTNH_F_LINKDOWN.
- NETDEV_CHANGE[down] for nh1 followed by NETDEV_DOWN for nh0 then
rt->rt_flags will be RTNH_F_LINKDOWN|RTNH_F_DEAD.
That doesn't seem like desirable behaviour. What are expected semantics?
> + break;
> + }
> + }
> }
>
> - mdev = mpls_dev_get(dev);
> - if (!mdev)
> - return;
> + return;
> +}
> +
> +static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
> +{
> + struct mpls_route __rcu **platform_label;
> + struct net *net = dev_net(dev);
> + unsigned index;
> + int alive;
> +
> + platform_label = rtnl_dereference(net->mpls.platform_label);
> + for (index = 0; index < net->mpls.platform_labels; index++) {
> + struct mpls_route *rt = rtnl_dereference(platform_label[index]);
> +
> + if (!rt)
> + continue;
> + alive = 0;
> + for_nexthops(rt) {
> + struct net_device *nh_dev =
> + rtnl_dereference(nh->nh_dev);
>
> - mpls_dev_sysctl_unregister(mdev);
> + if (!(nh->nh_flags & nh_flags)) {
> + alive++;
> + continue;
> + }
> + if (nh_dev != dev)
> + continue;
> + alive++;
> + nh->nh_flags &= ~nh_flags;
> + } endfor_nexthops(rt);
>
> - RCU_INIT_POINTER(dev->mpls_ptr, NULL);
> + if (alive > 0)
> + rt->rt_flags &= ~nh_flags;
Again, this creates a dependence on the ordering of events.
> + rt->rt_nhn_alive = alive;
> + }
>
> - kfree_rcu(mdev, rcu);
> + return;
> }
>
> static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
> @@ -910,9 +1007,9 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
> {
> struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> struct mpls_dev *mdev;
> + unsigned int flags;
>
> - switch(event) {
> - case NETDEV_REGISTER:
> + if (event == NETDEV_REGISTER) {
> /* For now just support ethernet devices */
> if ((dev->type == ARPHRD_ETHER) ||
> (dev->type == ARPHRD_LOOPBACK)) {
> @@ -920,10 +1017,39 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
> if (IS_ERR(mdev))
> return notifier_from_errno(PTR_ERR(mdev));
> }
> - break;
> + return NOTIFY_OK;
> + }
>
> + mdev = mpls_dev_get(dev);
> + if (!mdev)
> + return NOTIFY_OK;
> +
> + switch (event) {
> + case NETDEV_DOWN:
> + mpls_ifdown(dev, event);
> + break;
> + case NETDEV_UP:
> + flags = dev_get_flags(dev);
> + if (flags & (IFF_RUNNING | IFF_LOWER_UP))
> + mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
> + else
> + mpls_ifup(dev, RTNH_F_DEAD);
> + break;
> + case NETDEV_CHANGE:
> + flags = dev_get_flags(dev);
> + if (flags & (IFF_RUNNING | IFF_LOWER_UP))
> + mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
> + else
> + mpls_ifdown(dev, event);
> + break;
> case NETDEV_UNREGISTER:
> - mpls_ifdown(dev);
> + mpls_ifdown(dev, event);
> + mdev = mpls_dev_get(dev);
> + if (mdev) {
> + mpls_dev_sysctl_unregister(mdev);
> + RCU_INIT_POINTER(dev->mpls_ptr, NULL);
> + kfree_rcu(mdev, rcu);
> + }
> break;
> case NETDEV_CHANGENAME:
> mdev = mpls_dev_get(dev);
> @@ -1237,6 +1363,10 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
> dev = rtnl_dereference(nh->nh_dev);
> if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
> goto nla_put_failure;
> + if (nh->nh_flags & RTNH_F_LINKDOWN)
> + rtm->rtm_flags |= RTNH_F_LINKDOWN;
> + if (nh->nh_flags & RTNH_F_DEAD)
> + rtm->rtm_flags |= RTNH_F_DEAD;
> } else {
> struct rtnexthop *rtnh;
> struct nlattr *mp;
> @@ -1253,6 +1383,11 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
> dev = rtnl_dereference(nh->nh_dev);
> if (dev)
> rtnh->rtnh_ifindex = dev->ifindex;
> + if (nh->nh_flags & RTNH_F_LINKDOWN)
> + rtnh->rtnh_flags |= RTNH_F_LINKDOWN;
> + if (nh->nh_flags & RTNH_F_DEAD)
> + rtnh->rtnh_flags |= RTNH_F_DEAD;
> +
You never read from rt->rt_flags. Was your intention to set
rtm->rtm_flags using that field in this function?
> if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
> nh->nh_labels,
> nh->nh_label))
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index bde52ce..4f9bf2b 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -41,6 +41,7 @@ enum mpls_payload_type {
>
> struct mpls_nh { /* next hop label forwarding entry */
> struct net_device __rcu *nh_dev;
> + unsigned int nh_flags;
This could be implemented as a u8 (or even two 1-bit fields) after
nh_via_table (making use of the 1-byte hole already there) without
increasing the size of the structure by a fifth.
> u32 nh_label[MAX_NEW_LABELS];
> u8 nh_labels;
> u8 nh_via_alen;
> @@ -70,10 +71,12 @@ struct mpls_nh { /* next hop label forwarding entry */
> */
> struct mpls_route { /* next hop label forwarding entry */
> struct rcu_head rt_rcu;
> + unsigned int rt_flags;
Storing this is unnecessary - it can be derived from rt_nhn_alive == 0
and looking at the nexthop flags, which also avoids the event ordering
problems described above.
Thanks,
Rob
> u8 rt_protocol;
> u8 rt_payload_type;
> u8 rt_max_alen;
> unsigned int rt_nhn;
> + unsigned int rt_nhn_alive;
> struct mpls_nh rt_nh[0];
> };
>
>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Powered by blists - more mailing lists