[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20180901004954.7145-18-dsahern@kernel.org>
Date: Fri, 31 Aug 2018 17:49:52 -0700
From: dsahern@...nel.org
To: netdev@...r.kernel.org
Cc: roopa@...ulusnetworks.com, sharpd@...ulusnetworks.com,
idosch@...lanox.com, davem@...emloft.net,
David Ahern <dsahern@...il.com>
Subject: [PATCH RFC net-next 17/18] net: Add support for nexthop groups
From: David Ahern <dsahern@...il.com>
Allow the creation of nexthop groups which reference other nexthop
objects to create multipath routes.
TODO: add multipath (mpath) group support for IPv6.
Signed-off-by: David Ahern <dsahern@...il.com>
---
include/net/nexthop.h | 77 +++++--
net/ipv4/fib_semantics.c | 5 +-
net/ipv4/nexthop.c | 511 ++++++++++++++++++++++++++++++++++++++++++-----
net/ipv4/route.c | 16 +-
4 files changed, 540 insertions(+), 69 deletions(-)
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 759bb39e4ea7..654b67192337 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -28,6 +28,23 @@
struct nexthop;
+struct nh_grp_entry {
+ struct nexthop *nh; /* member nexthop; set to NULL once removed from the group */
+ u32 weight; /* relative weight summed up when the group is rebalanced */
+ atomic_t upper_bound; /* hash threshold compared during path selection; -1 for an empty slot */
+
+ struct list_head nh_list; /* linkage on the member nexthop's grp_list */
+ struct nexthop *nh_parent; /* nexthop of group with this entry */
+};
+
+struct nh_group {
+	u16 num_nh_set;	/* entries that currently reference a nexthop */
+	u16 num_nh;	/* number of slots allocated in nh_entries */
+	u8 mpath:1,	/* group is used for multipath path selection */
+	   unused:7;
+	struct nh_grp_entry nh_entries[];	/* flexible array of num_nh slots */
+};
+
struct nh_info {
struct hlist_node dev_hash;
struct net *net;
@@ -47,6 +64,7 @@ struct nh_info {
struct nexthop {
struct rb_node rb_node;
+ struct list_head grp_list; /* nh group entries using this nh */
struct list_head fi_list; /* v4 entries using nh */
struct list_head f6i_list; /* v6 entries using nh */
@@ -54,12 +72,15 @@ struct nexthop {
u8 protocol;
u8 nh_flags;
+ u8 is_group:1,
+ unused:7;
refcount_t refcnt;
struct rcu_head rcu;
union {
struct nh_info __rcu *nh_info;
+ struct nh_group __rcu *nh_grp;
};
};
@@ -81,6 +102,9 @@ struct nh_config {
struct in6_addr ipv6;
} gw;
+ struct nlattr *nh_grp;
+ u16 nh_grp_type;
+
u32 nlflags;
struct nl_info nlinfo;
};
@@ -88,42 +112,61 @@ struct nh_config {
void nexthop_get(struct nexthop *nh);
void nexthop_put(struct nexthop *nh);
+/* Two nexthop references compare equal only when they are the same object. */
+static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+{
+	return nh2 == nh1;
+}
+
/* caller is holding rtnl; no reference taken to nexthop */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
-static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+/* called with rcu lock */
+static inline bool nexthop_is_multipath(const struct nexthop *nh)
{
- return nh1 == nh2;
+ if (nh->is_group) {
+ struct nh_group *nh_grp;
+
+ nh_grp = rcu_dereference(nh->nh_grp);
+ return !!nh_grp->mpath;
+ }
+ return false;
}
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel);
+
+/* called with rcu lock */
static inline int nexthop_num_path(struct nexthop *nh)
{
+ if (nexthop_is_multipath(nh)) {
+ struct nh_group *nh_grp;
+
+ nh_grp = rcu_dereference(nh->nh_grp);
+ return nh_grp->num_nh_set;
+ }
+
return 1;
}
-/* called with rcu lock */
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash);
+
static inline bool nexthop_has_gw(struct nexthop *nh)
{
- struct nh_info *nhi;
-
- nhi = rcu_dereference(nh->nh_info);
- return !!nhi->has_gw;
+ return !!nh->nh_info->has_gw;
}
-/* called with rcu lock */
static inline bool nexthop_is_blackhole(struct nexthop *nh)
{
- struct nh_info *nhi;
-
- nhi = rcu_dereference(nh->nh_info);
- return !!nhi->reject_nh;
+ return !nexthop_is_multipath(nh) && !!nh->nh_info->reject_nh;
}
static inline struct fib_nh *nexthop_fib_nh(struct nexthop *nh, int nhsel)
{
struct nh_info *nhi;
- nhi = rcu_dereference(nh->nh_info);
+ if (nexthop_is_multipath(nh))
+ nh = nexthop_mpath_select(nh, nhsel);
+
+ nhi = nh->nh_info;
if (nhi->family == AF_INET ||
nhi->family == AF_UNSPEC) /* dev only re-uses IPv4 struct */
return &nhi->fib_nh;
@@ -164,11 +207,11 @@ static inline __be32 fib_info_nh_gw(struct fib_info *fi)
*/
static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
{
- struct nh_info *nhi;
+ if (nexthop_is_multipath(nh))
+ nh = nexthop_mpath_select(nh, 0);
- nhi = rcu_dereference(nh->nh_info);
- if (nhi->family == AF_INET6)
- return &nhi->fib6_nh;
+ if (nh->nh_info->family == AF_INET6)
+ return &nh->nh_info->fib6_nh;
return NULL;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c91cdafd40ec..0ddf14512bb3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1821,7 +1821,10 @@ void fib_select_path(struct net *net, struct fib_result *res,
goto check_saddr;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi->fib_nhs > 1) {
+ if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+ h = fib_multipath_hash(net, fl4, skb, NULL);
+ nexthop_select_path(net, res, h);
+ } else if (res->fi->fib_nhs > 1) {
h = fib_multipath_hash(net, fl4, skb, NULL);
fib_select_multipath(res, h);
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 1e77fa94e562..f0b4151c661a 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -35,6 +35,8 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
[NHA_TABLE_ID] = { .type = NLA_U32 },
[NHA_BLACKHOLE] = { .type = NLA_FLAG },
[NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_GROUP_TYPE] = { .type = NLA_U16 },
+ [NHA_GROUPS] = { .type = NLA_FLAG },
};
static unsigned int nh_dev_hashfn(unsigned int val)
@@ -67,19 +69,35 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
static void nexthop_free_rcu(struct rcu_head *head)
{
struct nexthop *nh = container_of(head, struct nexthop, rcu);
- struct nh_info *nhi;
- nhi = rcu_dereference_raw(nh->nh_info);
- switch (nhi->family) {
- case AF_INET:
- case AF_UNSPEC:
- fib_nh_release(nhi->net, &nhi->fib_nh);
- break;
- case AF_INET6:
- fib6_nh_release(&nhi->fib6_nh);
- break;
+ if (nh->is_group) {
+ struct nh_group *nh_grp;
+ int i;
+
+ nh_grp = rcu_dereference_raw(nh->nh_grp);
+ for (i = 0; i < nh_grp->num_nh; ++i) {
+ if (!nh_grp->nh_entries[i].nh)
+ continue;
+
+ list_del(&nh_grp->nh_entries[i].nh_list);
+ nexthop_put(nh_grp->nh_entries[i].nh);
+ }
+ kfree(nh_grp);
+ } else {
+ struct nh_info *nhi;
+
+ nhi = rcu_dereference_raw(nh->nh_info);
+ switch (nhi->family) {
+ case AF_INET:
+ case AF_UNSPEC:
+ fib_nh_release(nhi->net, &nhi->fib_nh);
+ break;
+ case AF_INET6:
+ fib6_nh_release(&nhi->fib6_nh);
+ break;
+ }
+ kfree(nhi);
}
- kfree(nhi);
kfree(nh);
}
@@ -89,6 +107,33 @@ static struct nexthop *nexthop_alloc(void)
return kzalloc(sizeof(struct nexthop), GFP_KERNEL);
}
+/* Allocate a group nexthop with room for @num_nh group entries.
+ *
+ * The nexthop itself comes from nexthop_alloc(); the group data is a
+ * separate allocation referenced through nh->nh_grp, so only struct
+ * nh_group plus its trailing entries are sized here (no part of struct
+ * nexthop belongs in that size).
+ *
+ * Returns the new nexthop with is_group set and both entry counters
+ * initialised to @num_nh, or ERR_PTR(-ENOMEM) if either allocation fails.
+ */
+static struct nexthop *nexthop_grp_alloc(u16 num_nh)
+{
+	struct nh_group *nh_grp;
+	struct nexthop *nh;
+	size_t sz;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	/* num_nh is a u16, so this multiplication cannot overflow size_t */
+	sz = sizeof(*nh_grp) + sizeof(nh_grp->nh_entries[0]) * num_nh;
+	nh_grp = kzalloc(sz, GFP_KERNEL);
+	if (!nh_grp) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nh->is_group = 1;
+	nh_grp->num_nh = num_nh;
+	nh_grp->num_nh_set = num_nh;
+	rcu_assign_pointer(nh->nh_grp, nh_grp);
+
+	return nh;
+}
+
+
static void nh_base_seq_inc(struct net *net)
{
while (++net->nexthop.seq == 0)
@@ -173,23 +218,166 @@ static size_t nh_nlmsg_size_ipv4(struct nh_info *nhi)
static size_t nh_nlmsg_size(struct nexthop *nh)
{
- struct nh_info *nhi = rtnl_dereference(nh->nh_info);
size_t sz = nla_total_size(4); /* NHA_ID */
- /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
- * are mutually exclusive
- */
- sz += nla_total_size(4); /* NHA_OIF */
+ if (nh->is_group) {
+ struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+ size_t sz2 = sizeof(struct nh_group) * nh_grp->num_nh_set;
- if (nhi->family == AF_INET)
- sz += nh_nlmsg_size_ipv4(nhi);
+ sz += nla_total_size(sz2)
+ + nla_total_size(2); /* NHA_GROUP_TYPE */
+ } else {
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
- else if (nhi->family == AF_INET6)
- sz += nh_nlmsg_size_ipv6(nhi);
+ /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+ * are mutually exclusive
+ */
+ sz += nla_total_size(4); /* NHA_OIF */
+
+ if (nhi->family == AF_INET)
+ sz += nh_nlmsg_size_ipv4(nhi);
+ else if (nhi->family == AF_INET6)
+ sz += nh_nlmsg_size_ipv6(nhi);
+ }
return sz;
}
+/* Decide whether @nh may become a member of a nexthop group.
+ *
+ * Multipath groups and blackhole nexthops are refused; the reason is
+ * reported through @extack. Returns true when @nh is acceptable.
+ */
+static bool valid_group_nh(struct nexthop *nh, struct netlink_ext_ack *extack)
+{
+	struct nh_info *info;
+
+	if (nh->is_group) {
+		struct nh_group *grp = rtnl_dereference(nh->nh_grp);
+
+		if (!grp->mpath)
+			return true;
+
+		/* nesting a multipath group inside a group is not supported */
+		NL_SET_ERR_MSG(extack,
+			       "Multipath group can not be a nexthop within a group");
+		return false;
+	}
+
+	info = rtnl_dereference(nh->nh_info);
+	if (!info->reject_nh)
+		return true;
+
+	NL_SET_ERR_MSG(extack,
+		       "Blackhole nexthop can not be used in a group");
+	return false;
+}
+
+
+/* Validate the NHA_GROUP attribute of a nexthop group request.
+ *
+ * The payload must be a non-empty array of struct nexthop_grp; every id
+ * in it must name an existing nexthop that valid_group_nh() accepts, and
+ * no attribute numbered above NHA_GROUP may be present in @tb.
+ *
+ * Returns 0 on success, or -EINVAL with a message set in @extack.
+ */
+static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+			       struct netlink_ext_ack *extack)
+{
+	unsigned int len = nla_len(tb[NHA_GROUP]);
+	struct nexthop_grp *nhg;
+	unsigned int i;
+
+	/* The payload is made of struct nexthop_grp elements, not of the
+	 * kernel-internal struct nh_group. An empty payload is rejected as
+	 * well since it would create a group without any usable path.
+	 */
+	if (!len || len % sizeof(*nhg)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid length for nexthop group attribute");
+		return -EINVAL;
+	}
+
+	/* convert len to number of nexthop ids */
+	len /= sizeof(*nhg);
+
+	nhg = nla_data(tb[NHA_GROUP]);
+	for (i = 0; i < len; ++i) {
+		struct nexthop *nh;
+
+		nh = nexthop_find_by_id(net, nhg->id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+			return -EINVAL;
+		}
+		if (!valid_group_nh(nh, extack))
+			return -EINVAL;
+
+		nhg += 1;
+	}
+
+	/* any attribute numbered above NHA_GROUP is refused for groups */
+	for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+		if (!tb[i])
+			continue;
+
+		NL_SET_ERR_MSG(extack,
+			       "No other attributes can be set in nexthop groups");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+/* Append the group description of @nh_grp to @skb.
+ *
+ * Emits NHA_GROUP_TYPE followed by NHA_GROUP, the latter holding one
+ * struct nexthop_grp (id and weight) per entry that still references a
+ * nexthop.
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb has no room left.
+ */
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nh_grp)
+{
+	struct nexthop_grp *p;
+	struct nlattr *nla;
+	u16 group_type = 0;
+	size_t len;
+	int i;
+
+	if (nh_grp->mpath)
+		group_type = NEXTHOP_GRP_TYPE_MPATH;
+
+	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+		goto nla_put_failure;
+
+	/* size the payload by the element type actually written below */
+	len = nh_grp->num_nh_set * sizeof(*p);
+	nla = nla_reserve(skb, NHA_GROUP, len);
+	if (!nla)
+		goto nla_put_failure;
+
+	p = nla_data(nla);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		/* slots whose nexthop was removed are not reported */
+		if (!nh_grp->nh_entries[i].nh)
+			continue;
+
+		p->id = nh_grp->nh_entries[i].nh->id;
+		p->weight = nh_grp->nh_entries[i].weight;
+		p += 1;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+
+/* Recompute the hash upper bound of every entry in @nhg.
+ *
+ * Each populated entry gets a bound proportional to the cumulative weight
+ * up to and including itself, scaled to a 31-bit range; empty slots get -1.
+ */
+static void nh_group_rebalance(struct nh_group *nhg)
+{
+	int weight_sum = 0;
+	int running = 0;
+	int idx;
+
+	/* first pass: total weight of the populated entries */
+	for (idx = 0; idx < nhg->num_nh; ++idx) {
+		if (nhg->nh_entries[idx].nh)
+			weight_sum += nhg->nh_entries[idx].weight;
+	}
+
+	/* second pass: assign cumulative bounds */
+	for (idx = 0; idx < nhg->num_nh; ++idx) {
+		struct nh_grp_entry *ent = &nhg->nh_entries[idx];
+		int bound = -1;
+
+		if (ent->nh) {
+			running += ent->weight;
+			bound = DIV_ROUND_CLOSEST_ULL((u64)running << 31,
+						      weight_sum) - 1;
+		}
+
+		atomic_set(&ent->upper_bound, bound);
+	}
+}
+
+
static const struct net_device *nh_info_dev(const struct nh_info *nhi)
{
switch (nhi->family) {
@@ -219,8 +407,25 @@ bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev)
const struct nh_info *nhi;
bool dev_match = false;
- nhi = rcu_dereference(nh->nh_info);
- dev_match = nh_info_uses_dev(nhi, dev);
+ if (nh->is_group) {
+ const struct nh_group *nh_grp;
+ int i;
+
+ nh_grp = rcu_dereference(nh->nh_grp);
+ for (i = 0; i < nh_grp->num_nh; ++i) {
+ const struct nh_grp_entry *nhge;
+
+ nhge = &nh_grp->nh_entries[i];
+ nhi = rcu_dereference(nhge->nh->nh_info);
+ dev_match = nh_info_uses_dev(nhi, dev);
+ if (dev_match)
+ break;
+ }
+
+ } else {
+ nhi = rcu_dereference(nh->nh_info);
+ dev_match = nh_info_uses_dev(nhi, dev);
+ }
return dev_match;
}
@@ -249,6 +454,14 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nla_put_u32(skb, NHA_ID, nh->id))
goto nla_put_failure;
+ if (nh->is_group) {
+ struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+
+ if (nla_put_nh_group(skb, nh_grp))
+ goto nla_put_failure;
+ goto end;
+ }
+
nhi = rtnl_dereference(nh->nh_info);
if (nhi->reject_nh && nla_put_flag(skb, NHA_BLACKHOLE))
goto nla_put_failure;
@@ -281,6 +494,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
break;
}
+end:
nlmsg_end(skb, nlh);
return 0;
@@ -315,6 +529,50 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
+/* Detach @nhge from its member nexthop and mark the slot empty.
+ *
+ * Drops the reference held on the member, decrements the parent group's
+ * populated-entry count and, when @rebalance is true, recomputes the
+ * group's upper bounds.
+ */
+static void remove_nh_grp_entry(struct nh_grp_entry *nhge, bool rebalance)
+{
+	struct nexthop *parent = nhge->nh_parent;
+	struct nh_group *grp;
+
+	list_del(&nhge->nh_list);
+	nexthop_put(nhge->nh);
+	nhge->nh = NULL;
+
+	grp = rtnl_dereference(parent->nh_grp);
+	grp->num_nh_set--;
+	if (!rebalance)
+		return;
+
+	nh_group_rebalance(grp);
+}
+
+
+/* Remove @nh from every group that lists it as a member.
+ *
+ * Each affected group is rebalanced; a group left without any populated
+ * entry is removed as well via remove_nexthop().
+ */
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+				       bool skip_fib, struct nl_info *nlinfo)
+{
+	struct nh_grp_entry *ent, *next;
+
+	list_for_each_entry_safe(ent, next, &nh->grp_list, nh_list) {
+		struct nexthop *parent = ent->nh_parent;
+		struct nh_group *grp;
+
+		remove_nh_grp_entry(ent, true);
+
+		/* keep the group while it still has members */
+		grp = rtnl_dereference(parent->nh_grp);
+		if (grp->num_nh_set)
+			continue;
+
+		remove_nexthop(net, parent, skip_fib, nlinfo);
+	}
+}
+
+
+/* Release every populated entry of the group nexthop @nh.
+ * No rebalancing is requested for the individual removals.
+ */
+static void remove_nexthop_group(struct nexthop *nh)
+{
+	struct nh_group *grp = rtnl_dereference(nh->nh_grp);
+	struct nh_grp_entry *ent = grp->nh_entries;
+	struct nh_grp_entry *end = ent + grp->num_nh;
+
+	for (; ent < end; ++ent) {
+		if (!ent->nh)
+			continue;
+
+		remove_nh_grp_entry(ent, false);
+	}
+}
+
+
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
struct fib6_info *f6i, *tmp;
@@ -339,13 +597,19 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
static void __remove_nexthop(struct net *net, struct nexthop *nh,
bool skip_fib, struct nl_info *nlinfo)
{
- const struct net_device *dev;
- struct nh_info *nhi;
+ if (nh->is_group) {
+ remove_nexthop_group(nh);
+ } else {
+ const struct net_device *dev;
+ struct nh_info *nhi;
- nhi = rtnl_dereference(nh->nh_info);
- dev = nh_info_dev(nhi);
- if (dev)
- hlist_del(&nhi->dev_hash);
+ nhi = rtnl_dereference(nh->nh_info);
+ dev = nh_info_dev(nhi);
+ if (dev)
+ hlist_del(&nhi->dev_hash);
+
+ remove_nexthop_from_groups(net, nh, skip_fib, nlinfo);
+ }
if (!skip_fib)
__remove_nexthop_fib(net, nh);
}
@@ -362,21 +626,46 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
nexthop_put(nh);
- nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+ if (nlinfo)
+ nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
}
static int replace_nexthop(struct net *net, struct nexthop *old,
struct nexthop *new, struct netlink_ext_ack *extack)
{
- struct nh_info *oldi, *newi;
+ if (old->is_group) {
+ struct nh_group *oldg, *newg;
+ int i;
- oldi = rtnl_dereference(old->nh_info);
- newi = rtnl_dereference(new->nh_info);
- rcu_assign_pointer(old->nh_info, newi);
- rcu_assign_pointer(new->nh_info, oldi);
+ if (!new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+ return -EINVAL;
+ }
+ oldg = rtnl_dereference(old->nh_grp);
+ newg = rtnl_dereference(new->nh_grp);
+ rcu_assign_pointer(old->nh_grp, newg);
+ rcu_assign_pointer(new->nh_grp, oldg);
+
+ /* update parents - used by nexthop code for cleanup */
+ for (i = 0; i < newg->num_nh; ++i)
+ newg->nh_entries[i].nh_parent = old;
+ for (i = 0; i < oldg->num_nh; ++i)
+ oldg->nh_entries[i].nh_parent = new;
+ } else {
+ struct nh_info *oldi, *newi;
- newi->nh_parent = old;
- oldi->nh_parent = new;
+ if (new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+ return -EINVAL;
+ }
+ oldi = rtnl_dereference(old->nh_info);
+ newi = rtnl_dereference(new->nh_info);
+ rcu_assign_pointer(old->nh_info, newi);
+ rcu_assign_pointer(new->nh_info, oldi);
+
+ newi->nh_parent = old;
+ oldi->nh_parent = new;
+ }
old->protocol = new->protocol;
old->nh_flags = new->nh_flags;
@@ -491,10 +780,16 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
struct nexthop *nh = fi->nh;
- struct nh_info *nhi;
- nhi = rtnl_dereference(nh->nh_info);
- if (nhi->family != AF_UNSPEC) {
+ if (nh->is_group) {
+ if (cfg->fc_scope == RT_SCOPE_HOST) {
+ NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ if (nh->nh_info->family != AF_UNSPEC) {
if (nh->nh_flags & RTNH_F_ONLINK &&
cfg->fc_scope >= RT_SCOPE_LINK) {
NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
@@ -505,6 +800,57 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
return 0;
}
+/* Pick the fib_nh for @res from the nexthop group attached to res->fi.
+ *
+ * Entries whose upper bound is below @hash are skipped. Among the rest,
+ * the first one accepted by fib_good_nh() wins; if none is, res->nh is
+ * left pointing at the first entry that passed the hash comparison.
+ */
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash)
+{
+	struct nexthop *grp_nh = res->fi->nh;
+	bool have_fallback = false;
+	struct nh_group *grp;
+	int idx;
+
+	WARN_ON(!grp_nh->is_group);
+
+	grp = rcu_dereference(grp_nh->nh_grp);
+	for (idx = 0; idx < grp->num_nh; ++idx) {
+		struct nh_grp_entry *ent = &grp->nh_entries[idx];
+		struct fib_nh *candidate;
+
+		if (hash > atomic_read(&ent->upper_bound))
+			continue;
+
+		candidate = &ent->nh->nh_info->fib_nh;
+
+		/* the fib_good_nh() check is unconditional here; it does
+		 * not depend on a sysctl
+		 */
+		if (fib_good_nh(candidate)) {
+			res->nh = candidate;
+			return;
+		}
+
+		if (have_fallback)
+			continue;
+
+		res->nh = candidate;
+		have_fallback = true;
+	}
+}
+
+
+/* Return the @nhsel-th populated member (counting from 0) of the group
+ * nexthop @nh, or NULL when the group has fewer populated entries.
+ */
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel)
+{
+	struct nh_group *grp = rcu_dereference(nh->nh_grp);
+	int seen = 0;
+	int idx;
+
+	for (idx = 0; idx < grp->num_nh; ++idx) {
+		struct nexthop *member = grp->nh_entries[idx].nh;
+
+		/* empty slots do not count towards the selector */
+		if (!member)
+			continue;
+
+		if (seen++ == nhsel)
+			return member;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_mpath_select);
+
static int nh_check_attr(struct nhmsg *nhm, struct nlattr *tb[],
struct net *net, struct netlink_ext_ack *extack)
{
@@ -557,6 +903,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
if (tb[NHA_ID])
cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+ if (tb[NHA_GROUP]) {
+ cfg->nh_grp = tb[NHA_GROUP];
+
+ cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+ if (tb[NHA_GROUP_TYPE])
+ cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+ if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid group type");
+ goto out;
+ }
+ }
+
if (tb[NHA_OIF]) {
cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
@@ -644,6 +1003,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
goto out;
}
+ if (tb[NHA_GROUP]) {
+ err = nh_check_attr_group(net, tb, extack);
+ if (err)
+ goto out;
+
+ return 0;
+ }
+
err = 0;
out:
return err;
@@ -791,7 +1158,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
return err;
}
-static int nh_create_ipv6(struct net *net, struct nexthop *nh,
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
struct nh_info *nhi, struct nh_config *cfg,
struct netlink_ext_ack *extack)
{
@@ -856,10 +1223,47 @@ static int nh_create_unspec(struct net *net, struct nexthop *nh,
static void nexthop_init_common(struct nexthop *nh)
{
+ INIT_LIST_HEAD(&nh->grp_list);
INIT_LIST_HEAD(&nh->fi_list);
INIT_LIST_HEAD(&nh->f6i_list);
}
+/* Build a group nexthop from the NHA_GROUP attribute stored in @cfg.
+ *
+ * One entry is created per struct nexthop_grp element in the attribute;
+ * each takes a reference on the member nexthop and is linked onto that
+ * member's grp_list. A weight of 0 is stored as 1. For multipath groups
+ * the hash upper bounds are computed before returning.
+ *
+ * Returns the new group nexthop or an ERR_PTR() on allocation failure.
+ */
+static struct nexthop *nexthop_create_group(struct net *net,
+					    struct nh_config *cfg)
+{
+	struct nlattr *grps_attr = cfg->nh_grp;
+	struct nexthop_grp *entry = nla_data(grps_attr);
+	struct nh_group *nh_grp;
+	struct nexthop *nh;
+	int i;
+
+	/* nexthop_grp_alloc() reports failure as ERR_PTR(), never NULL */
+	nh = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+	if (IS_ERR(nh))
+		return nh;
+
+	nexthop_init_common(nh);
+
+	nh_grp = rtnl_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		struct nexthop *nhe;
+
+		nhe = nexthop_find_by_id(net, entry[i].id);
+		nexthop_get(nhe);
+
+		nh_grp->nh_entries[i].nh = nhe;
+		nh_grp->nh_entries[i].weight = entry[i].weight ? : 1;
+		list_add(&nh_grp->nh_entries[i].nh_list, &nhe->grp_list);
+		nh_grp->nh_entries[i].nh_parent = nh;
+	}
+
+	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+		nh_grp->mpath = 1;
+		nh_group_rebalance(nh_grp);
+	}
+
+	return nh;
+}
+
+
static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
struct netlink_ext_ack *extack)
{
@@ -929,7 +1333,11 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
}
}
- nh = nexthop_create(net, cfg, extack);
+ if (cfg->nh_grp)
+ nh = nexthop_create_group(net, cfg);
+ else
+ nh = nexthop_create(net, cfg, extack);
+
if (IS_ERR(nh))
return nh;
@@ -968,19 +1376,25 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
return err;
}
-static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int group_filter,
int master_idx, u8 family)
{
const struct net_device *dev;
const struct nh_info *nhi;
- if (dev_idx || master_idx || family)
+ if (group_filter && !nh->is_group)
+ return true;
+
+ if ((dev_idx || master_idx || family) && nh->is_group)
return true;
nhi = rtnl_dereference(nh->nh_info);
- if (family && nhi->family != family)
+ if (family && !nh->is_group && nhi->family != family)
return true;
+ if (nh->is_group)
+ return false;
+
dev = nh_info_dev(nhi);
if (dev_idx && (!dev || dev->ifindex != dev_idx))
return true;
@@ -998,7 +1412,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
/* rtnl */
static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
{
- int dev_filter_idx = 0, master_idx = 0;
+ int group_filter = 0, dev_filter_idx = 0, master_idx = 0;
struct net *net = sock_net(skb->sk);
struct rb_root *root = &net->nexthop.root;
struct nlattr *tb[NHA_MAX + 1];
@@ -1010,6 +1424,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
if (nlmsg_parse(cb->nlh, sizeof(*nhm), tb, NHA_MAX,
rtm_nh_policy, NULL) >= 0) {
+ if (tb[NHA_GROUPS])
+ group_filter = 1;
+
if (tb[NHA_OIF])
dev_filter_idx = nla_get_u32(tb[NHA_OIF]);
@@ -1027,8 +1444,8 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
goto cont;
nh = rb_entry(node, struct nexthop, rb_node);
- if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
- nhm->nh_family))
+ if (nh_dump_filtered(nh, dev_filter_idx, group_filter,
+ master_idx, nhm->nh_family))
goto cont;
err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1297c7c934a8..4c16715607e0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,6 +112,7 @@
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
+#include <net/nexthop.h>
#include "fib_lookup.h"
@@ -1887,10 +1888,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1) {
- int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
-
- fib_select_multipath(res, h);
+ if (res->fi) {
+ struct net *net = res->fi->fib_net;
+ int h;
+
+ if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+ h = fib_multipath_hash(net, NULL, skb, hkeys);
+ nexthop_select_path(net, res, h);
+ } else if (res->fi->fib_nhs > 1) {
+ h = fib_multipath_hash(net, NULL, skb, hkeys);
+ fib_select_multipath(res, h);
+ }
}
#endif
--
2.11.0
Powered by blists - more mailing lists