[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1589854474-26854-3-git-send-email-roopa@cumulusnetworks.com>
Date: Mon, 18 May 2020 19:14:30 -0700
From: Roopa Prabhu <roopa@...ulusnetworks.com>
To: dsahern@...il.com, davem@...emloft.net
Cc: netdev@...r.kernel.org, nikolay@...ulusnetworks.com,
jiri@...lanox.com, idosch@...lanox.com, petrm@...lanox.com
Subject: [PATCH net-next 2/6] nexthop: support for fdb ecmp nexthops
From: Roopa Prabhu <roopa@...ulusnetworks.com>
This patch introduces ecmp nexthops and nexthop groups
for mac fdb entries. In subsequent patches this is used
by the vxlan driver fdb entries. The use case is
E-VPN multihoming [1,2,3] which requires bridged vxlan traffic
to be load balanced to remote switches (vteps) belonging to
the same multi-homed ethernet segment (This is analogous to
a multi-homed LAG but over vxlan).
Changes include new nexthop flag NHA_FDB for nexthops
referenced by fdb entries. These nexthops only have ip.
This patch includes appropriate checks to avoid routes
referencing such nexthops.
example:
$ip nexthop add id 12 via 172.16.1.2 fdb
$ip nexthop add id 13 via 172.16.1.3 fdb
$ip nexthop add id 102 group 12/13 fdb
$bridge fdb add 02:02:00:00:00:13 dev vxlan1000 nhid 101 self
[1] E-VPN https://tools.ietf.org/html/rfc7432
[2] E-VPN VxLAN: https://tools.ietf.org/html/rfc8365
[3] LPC talk with mention of nexthop groups for L2 ecmp
http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf
Signed-off-by: Roopa Prabhu <roopa@...ulusnetworks.com>
---
include/net/ip6_fib.h | 1 +
include/net/nexthop.h | 30 ++++++++++
include/uapi/linux/nexthop.h | 3 +
net/core/neighbour.c | 2 +
net/ipv4/nexthop.c | 129 ++++++++++++++++++++++++++++++++++---------
net/ipv6/route.c | 5 ++
6 files changed, 145 insertions(+), 25 deletions(-)
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index fdaf975..3f615a2 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -65,6 +65,7 @@ struct fib6_config {
struct nl_info fc_nlinfo;
struct nlattr *fc_encap;
u16 fc_encap_type;
+ bool fc_is_fdb;
};
struct fib6_node {
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index c440ccc..04dafc6 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -26,6 +26,7 @@ struct nh_config {
u8 nh_family;
u8 nh_protocol;
u8 nh_blackhole;
+ u8 nh_fdb;
u32 nh_flags;
int nh_ifindex;
@@ -52,6 +53,7 @@ struct nh_info {
u8 family;
bool reject_nh;
+ bool fdb_nh;
union {
struct fib_nh_common fib_nhc;
@@ -80,6 +82,7 @@ struct nexthop {
struct rb_node rb_node; /* entry on netns rbtree */
struct list_head fi_list; /* v4 entries using nh */
struct list_head f6i_list; /* v6 entries using nh */
+ struct list_head fdb_list; /* fdb entries using this nh */
struct list_head grp_list; /* nh group entries using this nh */
struct net *net;
@@ -88,6 +91,7 @@ struct nexthop {
u8 protocol; /* app managing this nh */
u8 nh_flags;
bool is_group;
+ bool is_fdb_nh;
refcount_t refcnt;
struct rcu_head rcu;
@@ -304,4 +308,30 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
int nexthop_for_each_fib6_nh(struct nexthop *nh,
int (*cb)(struct fib6_nh *nh, void *arg),
void *arg);
+
+static inline int nexthop_get_family(struct nexthop *nh)
+{
+ struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
+
+ return nhi->family;
+}
+
+static inline
+struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh)
+{
+ struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info);
+
+ return &nhi->fib_nhc;
+}
+
+static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh,
+ int hash)
+{
+ struct nh_info *nhi;
+ struct nexthop *nhp;
+
+ nhp = nexthop_select_path(nh, hash);
+ nhi = rcu_dereference(nhp->nh_info);
+ return &nhi->fib_nhc;
+}
#endif
diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
index 7b61867..2d4a1e7 100644
--- a/include/uapi/linux/nexthop.h
+++ b/include/uapi/linux/nexthop.h
@@ -49,6 +49,9 @@ enum {
NHA_GROUPS, /* flag; only return nexthop groups in dump */
NHA_MASTER, /* u32; only return nexthops with given master dev */
+ NHA_FDB, /* flag; nexthop belongs to a bridge fdb */
+ /* if NHA_FDB is added, OIF, BLACKHOLE, ENCAP cannot be set */
+
__NHA_MAX,
};
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index b607ea6..37e4dba 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family)
}
const struct nla_policy nda_policy[NDA_MAX+1] = {
+ [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID },
[NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN },
[NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) },
@@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = {
[NDA_IFINDEX] = { .type = NLA_U32 },
[NDA_MASTER] = { .type = NLA_U32 },
[NDA_PROTOCOL] = { .type = NLA_U8 },
+ [NDA_NH_ID] = { .type = NLA_U32 },
};
static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh,
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 992e841..d314b27 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -33,6 +33,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
[NHA_ENCAP] = { .type = NLA_NESTED },
[NHA_GROUPS] = { .type = NLA_FLAG },
[NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_FDB] = { .type = NLA_FLAG },
};
static unsigned int nh_dev_hashfn(unsigned int val)
@@ -107,6 +108,7 @@ static struct nexthop *nexthop_alloc(void)
INIT_LIST_HEAD(&nh->fi_list);
INIT_LIST_HEAD(&nh->f6i_list);
INIT_LIST_HEAD(&nh->grp_list);
+ INIT_LIST_HEAD(&nh->fdb_list);
}
return nh;
}
@@ -227,6 +229,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nla_put_u32(skb, NHA_ID, nh->id))
goto nla_put_failure;
+ if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB))
+ goto nla_put_failure;
+
if (nh->is_group) {
struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
@@ -241,7 +246,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nla_put_flag(skb, NHA_BLACKHOLE))
goto nla_put_failure;
goto out;
- } else {
+ } else if (!nh->is_fdb_nh) {
const struct net_device *dev;
dev = nhi->fib_nhc.nhc_dev;
@@ -387,12 +392,35 @@ static bool valid_group_nh(struct nexthop *nh, unsigned int npaths,
return true;
}
+static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family,
+ struct netlink_ext_ack *extack)
+{
+ struct nh_info *nhi;
+
+ if (!nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops");
+ return -EINVAL;
+ }
+
+ nhi = rtnl_dereference(nh->nh_info);
+ if (*nh_family == AF_UNSPEC) {
+ *nh_family = nhi->family;
+ } else if (*nh_family != nhi->family) {
+ NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
struct netlink_ext_ack *extack)
{
unsigned int len = nla_len(tb[NHA_GROUP]);
+ u8 nh_family = AF_UNSPEC;
struct nexthop_grp *nhg;
unsigned int i, j;
+ u8 nhg_fdb = 0;
if (len & (sizeof(struct nexthop_grp) - 1)) {
NL_SET_ERR_MSG(extack,
@@ -421,6 +449,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
}
}
+ if (tb[NHA_FDB])
+ nhg_fdb = 1;
nhg = nla_data(tb[NHA_GROUP]);
for (i = 0; i < len; ++i) {
struct nexthop *nh;
@@ -432,11 +462,20 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
}
if (!valid_group_nh(nh, len, extack))
return -EINVAL;
+
+ if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack))
+ return -EINVAL;
+
+ if (!nhg_fdb && nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops");
+ return -EINVAL;
+ }
}
for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
if (!tb[i])
continue;
-
+ if (tb[NHA_FDB])
+ continue;
NL_SET_ERR_MSG(extack,
"No other attributes can be set in nexthop groups");
return -EINVAL;
@@ -500,6 +539,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
if (unlikely(!nhge_nh))
continue;
+ if (nhge_nh->is_fdb_nh)
+ return nhge->nh;
+
/* nexthops always check if it is good and does
* not rely on a sysctl for this behavior
*/
@@ -569,6 +611,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
{
struct nh_info *nhi;
+ if (nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ return -EINVAL;
+ }
+
/* fib6_src is unique to a fib6_info and limits the ability to cache
* routes in fib6_nh within a nexthop that is potentially shared
* across multiple fib entries. If the config wants to use source
@@ -645,6 +692,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope,
{
int err = 0;
+ if (nh->is_fdb_nh) {
+ NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
+ err = -EINVAL;
+ goto out;
+ }
+
if (nh->is_group) {
struct nh_group *nhg;
@@ -1130,6 +1183,9 @@ static struct nexthop *nexthop_create_group(struct net *net,
nh_group_rebalance(nhg);
}
+ if (cfg->nh_fdb)
+ nh->is_fdb_nh = 1;
+
rcu_assign_pointer(nh->nh_grp, nhg);
return nh;
@@ -1157,7 +1213,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
.fc_encap = cfg->nh_encap,
.fc_encap_type = cfg->nh_encap_type,
};
- u32 tb_id = l3mdev_fib_table(cfg->dev);
+ u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
int err;
err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
@@ -1166,6 +1222,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
goto out;
}
+ if (nh->is_fdb_nh)
+ goto out;
+
/* sets nh_dev if successful */
err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
if (!err) {
@@ -1191,6 +1250,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh,
.fc_flags = cfg->nh_flags,
.fc_encap = cfg->nh_encap,
.fc_encap_type = cfg->nh_encap_type,
+ .fc_is_fdb = cfg->nh_fdb,
};
int err;
@@ -1232,6 +1292,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
nhi->family = cfg->nh_family;
nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
+ if (cfg->nh_fdb)
+ nh->is_fdb_nh = 1;
+
if (cfg->nh_blackhole) {
nhi->reject_nh = 1;
cfg->nh_ifindex = net->loopback_dev->ifindex;
@@ -1253,7 +1316,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
}
/* add the entry to the device based hash */
- nexthop_devhash_add(net, nhi);
+ if (!nh->is_fdb_nh)
+ nexthop_devhash_add(net, nhi);
rcu_assign_pointer(nh->nh_info, nhi);
@@ -1357,6 +1421,16 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
if (tb[NHA_ID])
cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+ if (tb[NHA_FDB]) {
+ if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] ||
+ tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
+ NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole");
+ goto out;
+ }
+
+ cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
+ }
+
if (tb[NHA_GROUP]) {
if (nhm->nh_family != AF_UNSPEC) {
NL_SET_ERR_MSG(extack, "Invalid family for group");
@@ -1380,8 +1454,8 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
if (tb[NHA_BLACKHOLE]) {
if (tb[NHA_GATEWAY] || tb[NHA_OIF] ||
- tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
- NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway or oif");
+ tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) {
+ NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb");
goto out;
}
@@ -1390,26 +1464,28 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
goto out;
}
- if (!tb[NHA_OIF]) {
- NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
+ if (!cfg->nh_fdb && !tb[NHA_OIF]) {
+ NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
goto out;
}
- cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
- if (cfg->nh_ifindex)
- cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
+ if (!cfg->nh_fdb && tb[NHA_OIF]) {
+ cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
+ if (cfg->nh_ifindex)
+ cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
- if (!cfg->dev) {
- NL_SET_ERR_MSG(extack, "Invalid device index");
- goto out;
- } else if (!(cfg->dev->flags & IFF_UP)) {
- NL_SET_ERR_MSG(extack, "Nexthop device is not up");
- err = -ENETDOWN;
- goto out;
- } else if (!netif_carrier_ok(cfg->dev)) {
- NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
- err = -ENETDOWN;
- goto out;
+ if (!cfg->dev) {
+ NL_SET_ERR_MSG(extack, "Invalid device index");
+ goto out;
+ } else if (!(cfg->dev->flags & IFF_UP)) {
+ NL_SET_ERR_MSG(extack, "Nexthop device is not up");
+ err = -ENETDOWN;
+ goto out;
+ } else if (!netif_carrier_ok(cfg->dev)) {
+ NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
+ err = -ENETDOWN;
+ goto out;
+ }
}
err = -EINVAL;
@@ -1638,7 +1714,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
int *master_idx, bool *group_filter,
- struct netlink_callback *cb)
+ bool *fdb_filter, struct netlink_callback *cb)
{
struct netlink_ext_ack *extack = cb->extack;
struct nlattr *tb[NHA_MAX + 1];
@@ -1675,6 +1751,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
case NHA_GROUPS:
*group_filter = true;
break;
+ case NHA_FDB:
+ *fdb_filter = true;
+ break;
default:
NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
return -EINVAL;
@@ -1693,17 +1772,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
/* rtnl */
static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
{
+ bool group_filter = false, fdb_filter = false;
struct nhmsg *nhm = nlmsg_data(cb->nlh);
int dev_filter_idx = 0, master_idx = 0;
struct net *net = sock_net(skb->sk);
struct rb_root *root = &net->nexthop.rb_root;
- bool group_filter = false;
struct rb_node *node;
int idx = 0, s_idx;
int err;
err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
- &group_filter, cb);
+ &group_filter, &fdb_filter, cb);
if (err < 0)
return err;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a8b4add..41b49e3 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3421,6 +3421,11 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
#ifdef CONFIG_IPV6_ROUTER_PREF
fib6_nh->last_probe = jiffies;
#endif
+ if (cfg->fc_is_fdb) {
+ fib6_nh->fib_nh_gw6 = cfg->fc_gateway;
+ fib6_nh->fib_nh_gw_family = AF_INET6;
+ return 0;
+ }
err = -ENODEV;
if (cfg->fc_ifindex) {
--
2.1.4
Powered by blists - more mailing lists