[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date: Mon, 4 May 2020 19:13:46 -0700
From: Roopa Prabhu <roopa@...ulusnetworks.com>
To: David Ahern <dsahern@...il.com>, David Miller <davem@...emloft.net>
Cc: netdev <netdev@...r.kernel.org>,
Nikolay Aleksandrov <nikolay@...ulusnetworks.com>,
Ido Schimmel <idosch@...lanox.com>,
Jiri Pirko <jiri@...lanox.com>,
Petr Machata <petrm@...lanox.com>
Subject: Re: [RFC PATCH net-next 1/5] nexthop: support for fdb ecmp nexthops
On Mon, May 4, 2020 at 3:28 PM Roopa Prabhu <roopa@...ulusnetworks.com> wrote:
>
> From: Roopa Prabhu <roopa@...ulusnetworks.com>
>
> This patch introduces ecmp nexthops and nexthop groups
> for mac fdb entries. In subsequent patches this is used
> by the vxlan driver fdb entries. The use case is
> E-VPN multihoming [1,2,3] which requires bridged vxlan traffic
> to be load balanced to remote switches (vteps) belonging to
> the same multi-homed ethernet segment (This is analogous to
> a multi-homed LAG but over vxlan).
>
> Changes include new nexthop flag NHA_FDB for nexthops
> referenced by fdb entries. These nexthops only have ip.
> This patch includes appropriate checks to avoid routes
> referencing such nexthops.
>
> example:
> $ip nexthop add id 12 via 172.16.1.2 fdb
> $ip nexthop add id 13 via 172.16.1.3 fdb
> $ip nexthop add id 102 group 12/13 fdb
>
> $bridge fdb add 02:02:00:00:00:13 dev vxlan1000 nhid 101 self
>
> [1] E-VPN https://tools.ietf.org/html/rfc7432
> [2] E-VPN VxLAN: https://tools.ietf.org/html/rfc8365
> [3] LPC talk with mention of nexthop groups for L2 ecmp
> http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf
>
> Signed-off-by: Roopa Prabhu <roopa@...ulusnetworks.com>
> ---
> include/net/nexthop.h | 14 ++++++
> include/uapi/linux/nexthop.h | 1 +
> net/ipv4/nexthop.c | 101 +++++++++++++++++++++++++++++++++----------
> 3 files changed, 93 insertions(+), 23 deletions(-)
>
> diff --git a/include/net/nexthop.h b/include/net/nexthop.h
> index c440ccc..3ad4e97 100644
> --- a/include/net/nexthop.h
> +++ b/include/net/nexthop.h
> @@ -26,6 +26,7 @@ struct nh_config {
> u8 nh_family;
> u8 nh_protocol;
> u8 nh_blackhole;
> + u8 nh_fdb;
> u32 nh_flags;
>
> int nh_ifindex;
> @@ -52,6 +53,7 @@ struct nh_info {
>
> u8 family;
> bool reject_nh;
> + bool fdb_nh;
>
> union {
> struct fib_nh_common fib_nhc;
> @@ -80,6 +82,7 @@ struct nexthop {
> struct rb_node rb_node; /* entry on netns rbtree */
> struct list_head fi_list; /* v4 entries using nh */
> struct list_head f6i_list; /* v6 entries using nh */
> + struct list_head fdb_list; /* fdb entries using this nh */
> struct list_head grp_list; /* nh group entries using this nh */
> struct net *net;
>
> @@ -88,6 +91,7 @@ struct nexthop {
> u8 protocol; /* app managing this nh */
> u8 nh_flags;
> bool is_group;
> + bool is_fdb_nh;
>
> refcount_t refcnt;
> struct rcu_head rcu;
> @@ -304,4 +308,14 @@ static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
> int nexthop_for_each_fib6_nh(struct nexthop *nh,
> int (*cb)(struct fib6_nh *nh, void *arg),
> void *arg);
> +
> +static inline struct nh_info *nexthop_path_fdb(struct nexthop *nh, u32 hash)
> +{
> + struct nh_info *nhi;
> +
> + nh = nexthop_select_path(nh, hash);
> + nhi = rcu_dereference(nh->nh_info);
> +
> + return nhi;
> +}
> #endif
> diff --git a/include/uapi/linux/nexthop.h b/include/uapi/linux/nexthop.h
> index 7b61867..19a234a 100644
> --- a/include/uapi/linux/nexthop.h
> +++ b/include/uapi/linux/nexthop.h
> @@ -48,6 +48,7 @@ enum {
> */
> NHA_GROUPS, /* flag; only return nexthop groups in dump */
> NHA_MASTER, /* u32; only return nexthops with given master dev */
> + NHA_FDB, /* nexthop belongs to a bridge fdb */
>
> __NHA_MAX,
> };
> diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
> index 3957364..98f8d2a 100644
> --- a/net/ipv4/nexthop.c
> +++ b/net/ipv4/nexthop.c
> @@ -33,6 +33,7 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
> [NHA_ENCAP] = { .type = NLA_NESTED },
> [NHA_GROUPS] = { .type = NLA_FLAG },
> [NHA_MASTER] = { .type = NLA_U32 },
> + [NHA_FDB] = { .type = NLA_FLAG },
> };
>
> static unsigned int nh_dev_hashfn(unsigned int val)
> @@ -107,6 +108,7 @@ static struct nexthop *nexthop_alloc(void)
> INIT_LIST_HEAD(&nh->fi_list);
> INIT_LIST_HEAD(&nh->f6i_list);
> INIT_LIST_HEAD(&nh->grp_list);
> + INIT_LIST_HEAD(&nh->fdb_list);
> }
> return nh;
> }
> @@ -227,6 +229,9 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
> if (nla_put_u32(skb, NHA_ID, nh->id))
> goto nla_put_failure;
>
> + if (nh->is_fdb_nh && nla_put_flag(skb, NHA_FDB))
> + goto nla_put_failure;
> +
> if (nh->is_group) {
> struct nh_group *nhg = rtnl_dereference(nh->nh_grp);
>
> @@ -241,7 +246,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
> if (nla_put_flag(skb, NHA_BLACKHOLE))
> goto nla_put_failure;
> goto out;
> - } else {
> + } else if (!nh->is_fdb_nh) {
> const struct net_device *dev;
>
> dev = nhi->fib_nhc.nhc_dev;
> @@ -393,6 +398,7 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
> unsigned int len = nla_len(tb[NHA_GROUP]);
> struct nexthop_grp *nhg;
> unsigned int i, j;
> + u8 nhg_fdb = 0;
>
> if (len & (sizeof(struct nexthop_grp) - 1)) {
> NL_SET_ERR_MSG(extack,
> @@ -421,6 +427,8 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
> }
> }
>
> + if (tb[NHA_FDB])
> + nhg_fdb = 1;
> nhg = nla_data(tb[NHA_GROUP]);
> for (i = 0; i < len; ++i) {
> struct nexthop *nh;
> @@ -432,11 +440,16 @@ static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
> }
> if (!valid_group_nh(nh, len, extack))
> return -EINVAL;
> + if (nhg_fdb && !nh->is_fdb_nh) {
> + NL_SET_ERR_MSG(extack, "FDB Multipath group can only have fdb nexthops");
> + return -EINVAL;
> + }
> }
> for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
> if (!tb[i])
> continue;
> -
> + if (tb[NHA_FDB])
> + continue;
> NL_SET_ERR_MSG(extack,
> "No other attributes can be set in nexthop groups");
> return -EINVAL;
> @@ -495,6 +508,9 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
> if (hash > atomic_read(&nhge->upper_bound))
> continue;
>
> + if (nhge->nh->is_fdb_nh)
> + return nhge->nh;
> +
> /* nexthops always check if it is good and does
> * not rely on a sysctl for this behavior
> */
> @@ -564,6 +580,11 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
> {
> struct nh_info *nhi;
>
> + if (nh->is_fdb_nh) {
> + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
> + return -EINVAL;
> + }
> +
> /* fib6_src is unique to a fib6_info and limits the ability to cache
> * routes in fib6_nh within a nexthop that is potentially shared
> * across multiple fib entries. If the config wants to use source
> @@ -640,6 +661,12 @@ int fib_check_nexthop(struct nexthop *nh, u8 scope,
> {
> int err = 0;
>
> + if (nh->is_fdb_nh) {
> + NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop");
> + err = -EINVAL;
> + goto out;
> + }
> +
> if (nh->is_group) {
> struct nh_group *nhg;
>
> @@ -1125,6 +1152,9 @@ static struct nexthop *nexthop_create_group(struct net *net,
> nh_group_rebalance(nhg);
> }
>
> + if (cfg->nh_fdb)
> + nh->is_fdb_nh = 1;
> +
> rcu_assign_pointer(nh->nh_grp, nhg);
>
> return nh;
> @@ -1152,7 +1182,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
> .fc_encap = cfg->nh_encap,
> .fc_encap_type = cfg->nh_encap_type,
> };
> - u32 tb_id = l3mdev_fib_table(cfg->dev);
> + u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN);
> int err;
>
> err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack);
> @@ -1161,6 +1191,9 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
> goto out;
> }
>
> + if (nh->is_fdb_nh)
> + goto out;
> +
> /* sets nh_dev if successful */
> err = fib_check_nh(net, fib_nh, tb_id, 0, extack);
> if (!err) {
> @@ -1227,6 +1260,9 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
> nhi->family = cfg->nh_family;
> nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK;
>
> + if (cfg->nh_fdb)
> + nh->is_fdb_nh = 1;
> +
> if (cfg->nh_blackhole) {
> nhi->reject_nh = 1;
> cfg->nh_ifindex = net->loopback_dev->ifindex;
> @@ -1248,7 +1284,8 @@ static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
> }
>
> /* add the entry to the device based hash */
> - nexthop_devhash_add(net, nhi);
> + if (!nh->is_fdb_nh)
> + nexthop_devhash_add(net, nhi);
>
> rcu_assign_pointer(nh->nh_info, nhi);
>
> @@ -1367,6 +1404,9 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
> NL_SET_ERR_MSG(extack, "Invalid group type");
> goto out;
> }
> + if (tb[NHA_FDB])
> + cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
> +
> err = nh_check_attr_group(net, tb, extack);
>
> /* no other attributes should be set */
> @@ -1385,26 +1425,38 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
> goto out;
> }
>
> - if (!tb[NHA_OIF]) {
> - NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole nexthops");
> + if (tb[NHA_FDB]) {
> + if (tb[NHA_OIF] ||
> + tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) {
> + NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap or oif");
> + goto out;
> + }
> +
> + cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]);
> + }
> +
all this strict checking still missed the check for blackhole and
reject flags. I will include them in the non-RC version.
> + if (!cfg->nh_fdb && !tb[NHA_OIF]) {
> + NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops");
> goto out;
> }
>
> - cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
> - if (cfg->nh_ifindex)
> - cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
> + if (!cfg->nh_fdb && tb[NHA_OIF]) {
> + cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
> + if (cfg->nh_ifindex)
> + cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex);
>
> - if (!cfg->dev) {
> - NL_SET_ERR_MSG(extack, "Invalid device index");
> - goto out;
> - } else if (!(cfg->dev->flags & IFF_UP)) {
> - NL_SET_ERR_MSG(extack, "Nexthop device is not up");
> - err = -ENETDOWN;
> - goto out;
> - } else if (!netif_carrier_ok(cfg->dev)) {
> - NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
> - err = -ENETDOWN;
> - goto out;
> + if (!cfg->dev) {
> + NL_SET_ERR_MSG(extack, "Invalid device index");
> + goto out;
> + } else if (!(cfg->dev->flags & IFF_UP)) {
> + NL_SET_ERR_MSG(extack, "Nexthop device is not up");
> + err = -ENETDOWN;
> + goto out;
> + } else if (!netif_carrier_ok(cfg->dev)) {
> + NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down");
> + err = -ENETDOWN;
> + goto out;
> + }
> }
>
> err = -EINVAL;
> @@ -1633,7 +1685,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int master_idx,
>
> static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
> int *master_idx, bool *group_filter,
> - struct netlink_callback *cb)
> + bool *fdb_filter, struct netlink_callback *cb)
> {
> struct netlink_ext_ack *extack = cb->extack;
> struct nlattr *tb[NHA_MAX + 1];
> @@ -1670,6 +1722,9 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
> case NHA_GROUPS:
> *group_filter = true;
> break;
> + case NHA_FDB:
> + *fdb_filter = true;
> + break;
> default:
> NL_SET_ERR_MSG(extack, "Unsupported attribute in dump request");
> return -EINVAL;
> @@ -1688,17 +1743,17 @@ static int nh_valid_dump_req(const struct nlmsghdr *nlh, int *dev_idx,
> /* rtnl */
> static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
> {
> + bool group_filter = false, fdb_filter = false;
> struct nhmsg *nhm = nlmsg_data(cb->nlh);
> int dev_filter_idx = 0, master_idx = 0;
> struct net *net = sock_net(skb->sk);
> struct rb_root *root = &net->nexthop.rb_root;
> - bool group_filter = false;
> struct rb_node *node;
> int idx = 0, s_idx;
> int err;
>
> err = nh_valid_dump_req(cb->nlh, &dev_filter_idx, &master_idx,
> - &group_filter, cb);
> + &group_filter, &fdb_filter, cb);
> if (err < 0)
> return err;
>
> --
> 2.1.4
>
Powered by blists - more mailing lists