lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180901004954.7145-18-dsahern@kernel.org>
Date:   Fri, 31 Aug 2018 17:49:52 -0700
From:   dsahern@...nel.org
To:     netdev@...r.kernel.org
Cc:     roopa@...ulusnetworks.com, sharpd@...ulusnetworks.com,
        idosch@...lanox.com, davem@...emloft.net,
        David Ahern <dsahern@...il.com>
Subject: [PATCH RFC net-next 17/18] net: Add support for nexthop groups

From: David Ahern <dsahern@...il.com>

Allow the creation of nexthop groups which reference other nexthop
objects to create multipath routes.

TO-DO: Add mpath support to IPv6

Signed-off-by: David Ahern <dsahern@...il.com>
---
 include/net/nexthop.h    |  77 +++++--
 net/ipv4/fib_semantics.c |   5 +-
 net/ipv4/nexthop.c       | 511 ++++++++++++++++++++++++++++++++++++++++++-----
 net/ipv4/route.c         |  16 +-
 4 files changed, 540 insertions(+), 69 deletions(-)

diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 759bb39e4ea7..654b67192337 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -28,6 +28,23 @@
 
 struct nexthop;
 
+struct nh_grp_entry {
+	struct nexthop	 *nh;	      /* member nexthop; NULL once removed */
+	u32		 weight;      /* relative weight used by rebalance */
+	atomic_t	 upper_bound; /* hash bound; -1 when nh is NULL */
+
+	struct list_head nh_list;     /* linked on nh->grp_list */
+	struct nexthop	 *nh_parent;  /* nexthop of group with this entry */
+};
+
+struct nh_group {
+	u16			num_nh_set; /* entries with nh != NULL */
+	u16			num_nh;     /* slots in nh_entries[] */
+	u8			mpath:1,    /* NEXTHOP_GRP_TYPE_MPATH group */
+				unused:7;
+	struct nh_grp_entry	nh_entries[0];
+};
+
 struct nh_info {
 	struct hlist_node	dev_hash;
 	struct net		*net;
@@ -47,6 +64,7 @@ struct nh_info {
 
 struct nexthop {
 	struct rb_node		rb_node;
+	struct list_head	grp_list;  /* nh group entries using this nh */
 	struct list_head	fi_list;    /* v4 entries using nh */
 	struct list_head	f6i_list;   /* v6 entries using nh */
 
@@ -54,12 +72,15 @@ struct nexthop {
 
 	u8			protocol;
 	u8			nh_flags;
+	u8			is_group:1,
+				unused:7;
 
 	refcount_t		refcnt;
 	struct rcu_head		rcu;
 
 	union {
 		struct nh_info	__rcu *nh_info;
+		struct nh_group	__rcu *nh_grp;
 	};
 };
 
@@ -81,6 +102,9 @@ struct nh_config {
 		struct in6_addr	ipv6;
 	} gw;
 
+	struct nlattr	*nh_grp;
+	u16		nh_grp_type;
+
 	u32		nlflags;
 	struct nl_info	nlinfo;
 };
@@ -88,42 +112,61 @@ struct nh_config {
 void nexthop_get(struct nexthop *nh);
 void nexthop_put(struct nexthop *nh);
 
+static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+{
+	return nh1 == nh2;	/* pointer identity, not a field comparison */
+}
+
 /* caller is holding rtnl; no reference taken to nexthop */
 struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
 
-static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+/* called with rcu lock */
+static inline bool nexthop_is_multipath(const struct nexthop *nh)
 {
-	return nh1 == nh2;
+	if (nh->is_group) {
+		struct nh_group *nh_grp;
+
+		nh_grp = rcu_dereference(nh->nh_grp);
+		return !!nh_grp->mpath;
+	}
+	return false;
 }
 
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel);
+
+/* called with rcu lock */
 static inline int nexthop_num_path(struct nexthop *nh)
 {
+	if (nexthop_is_multipath(nh)) {
+		struct nh_group *nh_grp;
+
+		nh_grp = rcu_dereference(nh->nh_grp);
+		return nh_grp->num_nh_set;
+	}
+
 	return 1;
 }
 
-/* called with rcu lock */
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash);
+
 static inline bool nexthop_has_gw(struct nexthop *nh)
 {
-	struct nh_info *nhi;
-
-	nhi = rcu_dereference(nh->nh_info);
-	return !!nhi->has_gw;
+	return !!nh->nh_info->has_gw;
 }
 
-/* called with rcu lock */
 static inline bool nexthop_is_blackhole(struct nexthop *nh)
 {
-	struct nh_info *nhi;
-
-	nhi = rcu_dereference(nh->nh_info);
-	return !!nhi->reject_nh;
+	return !nexthop_is_multipath(nh) && !!nh->nh_info->reject_nh;
 }
 
 static inline struct fib_nh *nexthop_fib_nh(struct nexthop *nh, int nhsel)
 {
 	struct nh_info *nhi;
 
-	nhi = rcu_dereference(nh->nh_info);
+	if (nexthop_is_multipath(nh))
+		nh = nexthop_mpath_select(nh, nhsel);
+
+	nhi = nh->nh_info;
 	if (nhi->family == AF_INET ||
 	    nhi->family == AF_UNSPEC)  /* dev only re-uses IPv4 struct */
 		return &nhi->fib_nh;
@@ -164,11 +207,11 @@ static inline __be32 fib_info_nh_gw(struct fib_info *fi)
  */
 static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
 {
-	struct nh_info *nhi;
+	if (nexthop_is_multipath(nh))
+		nh = nexthop_mpath_select(nh, 0);
 
-	nhi = rcu_dereference(nh->nh_info);
-	if (nhi->family == AF_INET6)
-		return &nhi->fib6_nh;
+	if (nh->nh_info->family == AF_INET6)
+		return &nh->nh_info->fib6_nh;
 
 	return NULL;
 }
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c91cdafd40ec..0ddf14512bb3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1821,7 +1821,10 @@ void fib_select_path(struct net *net, struct fib_result *res,
 		goto check_saddr;
 
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi->fib_nhs > 1) {
+	if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+		h = fib_multipath_hash(net, fl4, skb, NULL);
+		nexthop_select_path(net, res, h);
+	} else if (res->fi->fib_nhs > 1) {
 		h = fib_multipath_hash(net, fl4, skb, NULL);
 		fib_select_multipath(res, h);
 	}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 1e77fa94e562..f0b4151c661a 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -35,6 +35,8 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
 	[NHA_TABLE_ID]		= { .type = NLA_U32 },
 	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
 	[NHA_MASTER]		= { .type = NLA_U32 },
+	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
+	[NHA_GROUPS]		= { .type = NLA_FLAG },
 };
 
 static unsigned int nh_dev_hashfn(unsigned int val)
@@ -67,19 +69,35 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
 static void nexthop_free_rcu(struct rcu_head *head)
 {
 	struct nexthop *nh = container_of(head, struct nexthop, rcu);
-	struct nh_info *nhi;
 
-	nhi = rcu_dereference_raw(nh->nh_info);
-	switch (nhi->family) {
-	case AF_INET:
-	case AF_UNSPEC:
-		fib_nh_release(nhi->net, &nhi->fib_nh);
-		break;
-	case AF_INET6:
-		fib6_nh_release(&nhi->fib6_nh);
-		break;
+	if (nh->is_group) {
+		struct nh_group *nh_grp;
+		int i;
+
+		nh_grp = rcu_dereference_raw(nh->nh_grp);
+		for (i = 0; i < nh_grp->num_nh; ++i) {
+			if (!nh_grp->nh_entries[i].nh)
+				continue;
+
+			list_del(&nh_grp->nh_entries[i].nh_list);
+			nexthop_put(nh_grp->nh_entries[i].nh);
+		}
+		kfree(nh_grp);
+	} else {
+		struct nh_info *nhi;
+
+		nhi = rcu_dereference_raw(nh->nh_info);
+		switch (nhi->family) {
+		case AF_INET:
+		case AF_UNSPEC:
+			fib_nh_release(nhi->net, &nhi->fib_nh);
+			break;
+		case AF_INET6:
+			fib6_nh_release(&nhi->fib6_nh);
+			break;
+		}
+		kfree(nhi);
 	}
-	kfree(nhi);
 
 	kfree(nh);
 }
@@ -89,6 +107,33 @@ static struct nexthop *nexthop_alloc(void)
 	return kzalloc(sizeof(struct nexthop), GFP_KERNEL);
 }
 
+/* alloc a nexthop plus its separately allocated nh_group; ERR_PTR on error */
+static struct nexthop *nexthop_grp_alloc(u16 num_nh)
+{
+	/* nh_group is a separate allocation; do not add struct nexthop */
+	size_t sz = sizeof(struct nh_group) +
+		    sizeof(struct nh_grp_entry) * num_nh;
+	struct nh_group *nh_grp;
+	struct nexthop *nh;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	nh_grp = kzalloc(sz, GFP_KERNEL);
+	if (!nh_grp) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nh->is_group = 1;
+	nh_grp->num_nh = num_nh;
+	nh_grp->num_nh_set = num_nh;
+	rcu_assign_pointer(nh->nh_grp, nh_grp);
+
+	return nh;
+}
+
 static void nh_base_seq_inc(struct net *net)
 {
 	while (++net->nexthop.seq == 0)
@@ -173,23 +218,166 @@ static size_t nh_nlmsg_size_ipv4(struct nh_info *nhi)
 
 static size_t nh_nlmsg_size(struct nexthop *nh)
 {
-	struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 	size_t sz = nla_total_size(4);    /* NHA_ID */
 
-	/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
-	 * are mutually exclusive
-	 */
-	sz += nla_total_size(4);  /* NHA_OIF */
+	if (nh->is_group) {
+		struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+		size_t sz2 = sizeof(struct nexthop_grp) * nh_grp->num_nh_set;
 
-	if (nhi->family == AF_INET)
-		sz += nh_nlmsg_size_ipv4(nhi);
+		sz += nla_total_size(sz2)   /* NHA_GROUP: one entry per set nh */
+		      + nla_total_size(2);  /* NHA_GROUP_TYPE */
+	} else {
+		struct nh_info *nhi = rtnl_dereference(nh->nh_info);
 
-	else if (nhi->family == AF_INET6)
-		sz += nh_nlmsg_size_ipv6(nhi);
+		/* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+		 * are mutually exclusive
+		 */
+		sz += nla_total_size(4);  /* NHA_OIF */
+
+		if (nhi->family == AF_INET)
+			sz += nh_nlmsg_size_ipv4(nhi);
+		else if (nhi->family == AF_INET6)
+			sz += nh_nlmsg_size_ipv6(nhi);
+	}
 
 	return sz;
 }
 
+static bool valid_group_nh(struct nexthop *nh, struct netlink_ext_ack *extack)
+{
+	if (nh->is_group) {
+		struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+
+		/* nested multipath (group within a group) is not
+		 * supported; NOTE(review): non-mpath groups still pass
+		 */
+		if (nh_grp->mpath) {
+			NL_SET_ERR_MSG(extack,
+				       "Multipath group can not be a nexthop within a group");
+			return false;
+		}
+	} else {
+		struct nh_info *nhi = rtnl_dereference(nh->nh_info);
+
+		if (nhi->reject_nh) {
+			NL_SET_ERR_MSG(extack,
+				       "Blackhole nexthop can not be used in a group");
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+			       struct netlink_ext_ack *extack)
+{
+	unsigned int len = nla_len(tb[NHA_GROUP]);
+	struct nexthop_grp *nhg;
+	unsigned int i;
+
+	if (len & (sizeof(*nhg) - 1)) {	/* payload is an array of *nhg */
+		NL_SET_ERR_MSG(extack,
+			       "Invalid length for nexthop group attribute");
+		return -EINVAL;
+	}
+
+	/* convert len to number of nexthop ids (same unit as the check) */
+	len /= sizeof(*nhg);
+
+	nhg = nla_data(tb[NHA_GROUP]);
+	for (i = 0; i < len; ++i) {
+		struct nexthop *nh;
+
+		nh = nexthop_find_by_id(net, nhg->id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+			return -EINVAL;
+		}
+		if (!valid_group_nh(nh, extack))
+			return -EINVAL;
+
+		nhg += 1;
+	}
+
+	for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+		if (!tb[i])
+			continue;
+
+		NL_SET_ERR_MSG(extack,
+			       "No other attributes can be set in nexthop groups");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nh_grp)
+{
+	size_t len = nh_grp->num_nh_set * sizeof(struct nexthop_grp);
+	struct nexthop_grp *p;
+	struct nlattr *nla;
+	u16 group_type = 0;
+	int i;
+
+	if (nh_grp->mpath)
+		group_type = NEXTHOP_GRP_TYPE_MPATH;
+
+	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+		goto nla_put_failure;
+
+	nla = nla_reserve(skb, NHA_GROUP, len);	/* one *p per set entry */
+	if (!nla)
+		goto nla_put_failure;
+
+	p = nla_data(nla);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		if (!nh_grp->nh_entries[i].nh)	/* removed entry */
+			continue;
+
+		p->id = nh_grp->nh_entries[i].nh->id;
+		p->weight = nh_grp->nh_entries[i].weight;
+		p += 1;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+static void nh_group_rebalance(struct nh_group *nhg)
+{
+	struct nh_grp_entry *nhge;
+	int total = 0;	/* sum of weights of entries with nh set */
+	int w = 0;	/* running sum of weights seen so far */
+	int i;
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		nhge = &nhg->nh_entries[i];
+
+		if (!nhge->nh)
+			continue;
+
+		total += nhge->weight;	/* NOTE(review): u32 added to int */
+	}
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		int upper_bound;
+
+		nhge = &nhg->nh_entries[i];
+		if (!nhge->nh) {
+			upper_bound = -1;	/* below any hash >= 0 */
+		} else {
+			w += nhge->weight;
+			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+							    total) - 1;
+		}
+
+		atomic_set(&nhge->upper_bound, upper_bound);
+	}
+}
+
 static const struct net_device *nh_info_dev(const struct nh_info *nhi)
 {
 	switch (nhi->family) {
@@ -219,8 +407,25 @@ bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev)
 	const struct nh_info *nhi;
 	bool dev_match = false;
 
-	nhi = rcu_dereference(nh->nh_info);
-	dev_match = nh_info_uses_dev(nhi, dev);
+	if (nh->is_group) {
+		const struct nh_group *nh_grp;
+		int i;
+
+		nh_grp = rcu_dereference(nh->nh_grp);
+		for (i = 0; i < nh_grp->num_nh; ++i) {
+			const struct nh_grp_entry *nhge;
+
+			nhge = &nh_grp->nh_entries[i];
+			nhi = rcu_dereference(nhge->nh->nh_info);
+			dev_match = nh_info_uses_dev(nhi, dev);
+			if (dev_match)
+				break;
+		}
+
+	} else {
+		nhi = rcu_dereference(nh->nh_info);
+		dev_match = nh_info_uses_dev(nhi, dev);
+	}
 
 	return dev_match;
 }
@@ -249,6 +454,14 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 	if (nla_put_u32(skb, NHA_ID, nh->id))
 		goto nla_put_failure;
 
+	if (nh->is_group) {
+		struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+
+		if (nla_put_nh_group(skb, nh_grp))
+			goto nla_put_failure;
+		goto end;
+	}
+
 	nhi = rtnl_dereference(nh->nh_info);
 	if (nhi->reject_nh && nla_put_flag(skb, NHA_BLACKHOLE))
 		goto nla_put_failure;
@@ -281,6 +494,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
 		break;
 	}
 
+end:
 	nlmsg_end(skb, nlh);
 	return 0;
 
@@ -315,6 +529,50 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
 		rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
 }
 
+static void remove_nh_grp_entry(struct nh_grp_entry *nhge, bool rebalance)
+{
+	struct nh_group *nh_grp;
+
+	list_del(&nhge->nh_list);	/* unlink from nh->grp_list */
+	nexthop_put(nhge->nh);		/* drop the group's reference */
+	nhge->nh = NULL;		/* slot stays, but is now unset */
+
+	nh_grp = rtnl_dereference(nhge->nh_parent->nh_grp);
+	nh_grp->num_nh_set--;
+	if (rebalance)
+		nh_group_rebalance(nh_grp);
+}
+
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+				       bool skip_fib, struct nl_info *nlinfo)
+{
+	struct nh_grp_entry *nhge, *tmp;
+
+	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
+		struct nh_group *nh_grp;
+
+		remove_nh_grp_entry(nhge, true);	/* true: rebalance */
+
+		/* if this group has no more entries then remove it */
+		nh_grp = rtnl_dereference(nhge->nh_parent->nh_grp);
+		if (!nh_grp->num_nh_set)
+			remove_nexthop(net, nhge->nh_parent, skip_fib,
+				       nlinfo);
+	}
+}
+
+static void remove_nexthop_group(struct nexthop *nh)
+{
+	struct nh_group *nh_grp;
+	int i;
+
+	nh_grp = rtnl_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {	/* every set entry */
+		if (nh_grp->nh_entries[i].nh)
+			remove_nh_grp_entry(&nh_grp->nh_entries[i], false);
+	}
+}
+
 static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 {
 	struct fib6_info *f6i, *tmp;
@@ -339,13 +597,19 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 static void __remove_nexthop(struct net *net, struct nexthop *nh,
 			     bool skip_fib, struct nl_info *nlinfo)
 {
-	const struct net_device *dev;
-	struct nh_info *nhi;
+	if (nh->is_group) {
+		remove_nexthop_group(nh);
+	} else {
+		const struct net_device *dev;
+		struct nh_info *nhi;
 
-	nhi = rtnl_dereference(nh->nh_info);
-	dev = nh_info_dev(nhi);
-	if (dev)
-		hlist_del(&nhi->dev_hash);
+		nhi = rtnl_dereference(nh->nh_info);
+		dev = nh_info_dev(nhi);
+		if (dev)
+			hlist_del(&nhi->dev_hash);
+
+		remove_nexthop_from_groups(net, nh, skip_fib, nlinfo);
+	}
 	if (!skip_fib)
 		__remove_nexthop_fib(net, nh);
 }
@@ -362,21 +626,46 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 
 	nexthop_put(nh);
 
-	nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+	if (nlinfo)
+		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 }
 
 static int replace_nexthop(struct net *net, struct nexthop *old,
 			   struct nexthop *new, struct netlink_ext_ack *extack)
 {
-	struct nh_info *oldi, *newi;
+	if (old->is_group) {
+		struct nh_group *oldg, *newg;
+		int i;
 
-	oldi = rtnl_dereference(old->nh_info);
-	newi = rtnl_dereference(new->nh_info);
-	rcu_assign_pointer(old->nh_info, newi);
-	rcu_assign_pointer(new->nh_info, oldi);
+		if (!new->is_group) {
+			NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+			return -EINVAL;
+		}
+		oldg = rtnl_dereference(old->nh_grp);
+		newg = rtnl_dereference(new->nh_grp);
+		rcu_assign_pointer(old->nh_grp, newg);
+		rcu_assign_pointer(new->nh_grp, oldg);
+
+		/* update parents - used by nexthop code for cleanup */
+		for (i = 0; i < newg->num_nh; ++i)
+			newg->nh_entries[i].nh_parent = old;
+		for (i = 0; i < oldg->num_nh; ++i)
+			oldg->nh_entries[i].nh_parent = new;
+	} else {
+		struct nh_info *oldi, *newi;
 
-	newi->nh_parent = old;
-	oldi->nh_parent = new;
+		if (new->is_group) {
+			NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+			return -EINVAL;
+		}
+		oldi = rtnl_dereference(old->nh_info);
+		newi = rtnl_dereference(new->nh_info);
+		rcu_assign_pointer(old->nh_info, newi);
+		rcu_assign_pointer(new->nh_info, oldi);
+
+		newi->nh_parent = old;
+		oldi->nh_parent = new;
+	}
 
 	old->protocol = new->protocol;
 	old->nh_flags = new->nh_flags;
@@ -491,10 +780,16 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
 		      struct netlink_ext_ack *extack)
 {
 	struct nexthop *nh = fi->nh;
-	struct nh_info *nhi;
 
-	nhi = rtnl_dereference(nh->nh_info);
-	if (nhi->family != AF_UNSPEC) {
+	if (nh->is_group) {
+		if (cfg->fc_scope == RT_SCOPE_HOST) {
+			NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+			return -EINVAL;
+		}
+		return 0;
+	}
+
+	if (nh->nh_info->family != AF_UNSPEC) {
 		if (nh->nh_flags & RTNH_F_ONLINK &&
 		    cfg->fc_scope >= RT_SCOPE_LINK) {
 			NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
@@ -505,6 +800,57 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
 	return 0;
 }
 
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash)
+{
+	struct fib_info *fi = res->fi;
+	struct nexthop *nh = fi->nh;
+	struct nh_group *nh_grp;
+	bool first = false;	/* set once a fallback nexthop is stored */
+	int i;
+
+	WARN_ON(!nh->is_group);	/* warns only; execution continues */
+
+	nh_grp = rcu_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nh_grp->nh_entries[i];
+		struct fib_nh *fib_nh;
+
+		if (hash > atomic_read(&nhge->upper_bound))
+			continue;	/* hash is above this entry's bound */
+
+		fib_nh = &nhge->nh->nh_info->fib_nh; /* NOTE(review): nh unchecked */
+
+		/* nexthop groups always check that the nexthop is good;
+		 * this does not rely on a sysctl to enable the behavior
+		 */
+		if (fib_good_nh(fib_nh)) {
+			res->nh = fib_nh;
+			return;
+		}
+		if (!first) {
+			res->nh = fib_nh;
+			first = true;
+		}
+	}
+}
+
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel)
+{
+	struct nh_group *nh_grp;
+	int i, j = 0;	/* j counts the non-NULL entries seen so far */
+
+	nh_grp = rcu_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		if (nh_grp->nh_entries[i].nh) {
+			if (nhsel == j)	/* nhsel indexes set entries only */
+				return nh_grp->nh_entries[i].nh;
+			++j;
+		}
+	}
+	return NULL;	/* no set entry at index nhsel */
+}
+EXPORT_SYMBOL_GPL(nexthop_mpath_select);
+
 static int nh_check_attr(struct nhmsg *nhm, struct nlattr *tb[],
 			 struct net *net, struct netlink_ext_ack *extack)
 {
@@ -557,6 +903,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 	if (tb[NHA_ID])
 		cfg->nh_id = nla_get_u32(tb[NHA_ID]);
 
+	if (tb[NHA_GROUP]) {
+		cfg->nh_grp = tb[NHA_GROUP];
+
+		cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+		if (tb[NHA_GROUP_TYPE])
+			cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+		if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+			NL_SET_ERR_MSG(extack, "Invalid group type");
+			goto out;
+		}
+	}
+
 	if (tb[NHA_OIF]) {
 		cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
 
@@ -644,6 +1003,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
 		goto out;
 	}
 
+	if (tb[NHA_GROUP]) {
+		err = nh_check_attr_group(net, tb, extack);
+		if (err)
+			goto out;
+
+		return 0;
+	}
+
 	err = 0;
 out:
 	return err;
@@ -791,7 +1158,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
 	return err;
 }
 
-static int nh_create_ipv6(struct net *net,  struct nexthop *nh,
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
 			  struct nh_info *nhi, struct nh_config *cfg,
 			  struct netlink_ext_ack *extack)
 {
@@ -856,10 +1223,47 @@ static int nh_create_unspec(struct net *net, struct nexthop *nh,
 
 static void nexthop_init_common(struct nexthop *nh)
 {
+	INIT_LIST_HEAD(&nh->grp_list);
 	INIT_LIST_HEAD(&nh->fi_list);
 	INIT_LIST_HEAD(&nh->f6i_list);
 }
 
+static struct nexthop *nexthop_create_group(struct net *net,
+					    struct nh_config *cfg)
+{
+	struct nlattr *grps_attr = cfg->nh_grp;
+	struct nexthop_grp *entry = nla_data(grps_attr);
+	struct nh_group *nh_grp;
+	struct nexthop *nh;
+	int i;
+
+	nh = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+	if (IS_ERR(nh))	/* allocator returns ERR_PTR(), never NULL */
+		return nh;
+
+	nexthop_init_common(nh);
+
+	nh_grp = rtnl_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		struct nexthop *nhe;
+
+		nhe = nexthop_find_by_id(net, entry[i].id);
+		nexthop_get(nhe);	/* reference held by the group entry */
+
+		nh_grp->nh_entries[i].nh = nhe;
+		nh_grp->nh_entries[i].weight = entry[i].weight ? : 1;
+		list_add(&nh_grp->nh_entries[i].nh_list, &nhe->grp_list);
+		nh_grp->nh_entries[i].nh_parent = nh;
+	}
+
+	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+		nh_grp->mpath = 1;
+		nh_group_rebalance(nh_grp);	/* set per-entry hash bounds */
+	}
+
+	return nh;
+}
+
 static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
 				      struct netlink_ext_ack *extack)
 {
@@ -929,7 +1333,11 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 		}
 	}
 
-	nh = nexthop_create(net, cfg, extack);
+	if (cfg->nh_grp)
+		nh = nexthop_create_group(net, cfg);
+	else
+		nh = nexthop_create(net, cfg, extack);
+
 	if (IS_ERR(nh))
 		return nh;
 
@@ -968,19 +1376,25 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return err;
 }
 
-static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int group_filter,
 			     int master_idx, u8 family)
 {
 	const struct net_device *dev;
 	const struct nh_info *nhi;
 
-	if (dev_idx || master_idx || family)
+	if (group_filter && !nh->is_group)
+		return true;
+
+	if ((dev_idx || master_idx || family) && nh->is_group)
 		return true;
 
 	nhi = rtnl_dereference(nh->nh_info);
-	if (family && nhi->family != family)
+	if (family && !nh->is_group && nhi->family != family)
 		return true;
 
+	if (nh->is_group)
+		return false;
+
 	dev = nh_info_dev(nhi);
 	if (dev_idx && (!dev || dev->ifindex != dev_idx))
 		return true;
@@ -998,7 +1412,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
 /* rtnl */
 static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 {
-	int dev_filter_idx = 0, master_idx = 0;
+	int group_filter = 0, dev_filter_idx = 0, master_idx = 0;
 	struct net *net = sock_net(skb->sk);
 	struct rb_root *root = &net->nexthop.root;
 	struct nlattr *tb[NHA_MAX + 1];
@@ -1010,6 +1424,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 
 	if (nlmsg_parse(cb->nlh, sizeof(*nhm), tb, NHA_MAX,
 			rtm_nh_policy, NULL) >= 0) {
+		if (tb[NHA_GROUPS])
+			group_filter = 1;
+
 		if (tb[NHA_OIF])
 			dev_filter_idx = nla_get_u32(tb[NHA_OIF]);
 
@@ -1027,8 +1444,8 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
 			goto cont;
 
 		nh = rb_entry(node, struct nexthop, rb_node);
-		if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
-				     nhm->nh_family))
+		if (nh_dump_filtered(nh, dev_filter_idx, group_filter,
+				     master_idx, nhm->nh_family))
 			goto cont;
 
 		err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1297c7c934a8..4c16715607e0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,6 +112,7 @@
 #include <net/secure_seq.h>
 #include <net/ip_tunnels.h>
 #include <net/l3mdev.h>
+#include <net/nexthop.h>
 
 #include "fib_lookup.h"
 
@@ -1887,10 +1888,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
 			    struct flow_keys *hkeys)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-	if (res->fi && res->fi->fib_nhs > 1) {
-		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
-
-		fib_select_multipath(res, h);
+	if (res->fi) {
+		struct net *net = res->fi->fib_net;
+		int h;
+
+		if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+			h = fib_multipath_hash(net, NULL, skb, hkeys);
+			nexthop_select_path(net, res, h);
+		} else if (res->fi->fib_nhs > 1) {
+			h = fib_multipath_hash(net, NULL, skb, hkeys);
+			fib_select_multipath(res, h);
+		}
 	}
 #endif
 
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ