[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAEA6p_AgK08iXuSBbMDqzatGaJj_UFbNWiBV-dQp2r-Y71iesw@mail.gmail.com>
Date: Mon, 3 Jun 2019 11:09:55 -0700
From: Wei Wang <weiwan@...gle.com>
To: David Ahern <dsahern@...nel.org>
Cc: "David S . Miller" <davem@...emloft.net>,
Linux Kernel Network Developers <netdev@...r.kernel.org>,
idosch@...lanox.com, saeedm@...lanox.com,
Martin KaFai Lau <kafai@...com>,
David Ahern <dsahern@...il.com>
Subject: Re: [PATCH v2 net-next 4/7] ipv6: Plumb support for nexthop object in
a fib6_info
On Sun, Jun 2, 2019 at 9:08 PM David Ahern <dsahern@...nel.org> wrote:
>
> From: David Ahern <dsahern@...il.com>
>
> Add struct nexthop and nh_list list_head to fib6_info. nh_list is the
> fib6_info side of the nexthop <-> fib_info relationship. Since a fib6_info
> referencing a nexthop object can not have 'sibling' entries (the old way
> of doing multipath routes), the nh_list is a union with fib6_siblings.
>
> Add f6i_list list_head to 'struct nexthop' to track fib6_info entries
> using a nexthop instance. Update __remove_nexthop_fib to walk f6i_list
> and delete fib entries using the nexthop.
>
> Add a few nexthop helpers for use when a nexthop is added to fib6_info:
> - nexthop_fib6_nh - return first fib6_nh in a nexthop object
> - fib6_info_nh_dev moved to nexthop.h and updated to use nexthop_fib6_nh
> if the fib6_info references a nexthop object
> - nexthop_path_fib6_result - similar to ipv4, select a path within a
> multipath nexthop object. If the nexthop is a blackhole, set
> fib6_result type to RTN_BLACKHOLE, and set the REJECT flag
>
> Update the fib6_info references to check for nh and take a different path
> as needed:
> - rt6_qualify_for_ecmp - if a fib entry uses a nexthop object it can NOT
> be coalesced with other fib entries into a multipath route
> - rt6_duplicate_nexthop - use nexthop_cmp if either fib6_info references
> a nexthop
> - addrconf (host routes), RA's and info entries (anything configured via
> ndisc) do not use nexthop objects
> - fib6_info_destroy_rcu - put reference to nexthop object
> - fib6_purge_rt - drop fib6_info from f6i_list
> - fib6_select_path - update to use the new nexthop_path_fib6_result when
> fib entry uses a nexthop object
> - rt6_device_match - update to catch use of nexthop object as a blackhole
> and set fib6_type and flags.
> - ip6_pol_route - detect the REJECT flag getting set for blackhole nexthop
> and jump to ip6_create_rt_rcu
> - ip6_route_info_create - don't add space for fib6_nh if fib entry is
> going to reference a nexthop object, take a reference to nexthop object,
> disallow use of source routing
> - rt6_nlmsg_size - add space for RTA_NH_ID
> - add rt6_fill_node_nexthop to add nexthop data on a dump
>
> As with ipv4, most of the changes push existing code into the else branch
> of whether the fib entry uses a nexthop object.
>
> Update the nexthop code to walk f6i_list when a nexthop is deleted to
> remove fib entries referencing it.
>
> Signed-off-by: David Ahern <dsahern@...il.com>
> ---
> include/net/ip6_fib.h | 11 ++--
> include/net/ip6_route.h | 13 +++-
> include/net/nexthop.h | 50 ++++++++++++++++
> net/ipv4/nexthop.c | 44 ++++++++++++++
> net/ipv6/addrconf.c | 5 ++
> net/ipv6/ip6_fib.c | 22 +++++--
> net/ipv6/ndisc.c | 3 +-
> net/ipv6/route.c | 156 +++++++++++++++++++++++++++++++++++++++++-------
> 8 files changed, 268 insertions(+), 36 deletions(-)
>
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index ebe5d65f97e0..1a8acd51b277 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -146,7 +146,10 @@ struct fib6_info {
> * destination, but not the same gateway. nsiblings is just a cache
> * to speed up lookup.
> */
> - struct list_head fib6_siblings;
> + union {
> + struct list_head fib6_siblings;
> + struct list_head nh_list;
> + };
> unsigned int fib6_nsiblings;
>
> refcount_t fib6_ref;
> @@ -170,6 +173,7 @@ struct fib6_info {
> unused:3;
>
> struct rcu_head rcu;
> + struct nexthop *nh;
> struct fib6_nh fib6_nh[0];
> };
>
> @@ -441,11 +445,6 @@ void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
> rcu_read_unlock();
> }
>
> -static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
> -{
> - return f6i->fib6_nh->fib_nh_dev;
> -}
> -
> int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
> struct fib6_config *cfg, gfp_t gfp_flags,
> struct netlink_ext_ack *extack);
> diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
> index a6ce6ea856b9..7375a165fd98 100644
> --- a/include/net/ip6_route.h
> +++ b/include/net/ip6_route.h
> @@ -27,6 +27,7 @@ struct route_info {
> #include <linux/ip.h>
> #include <linux/ipv6.h>
> #include <linux/route.h>
> +#include <net/nexthop.h>
>
> #define RT6_LOOKUP_F_IFACE 0x00000001
> #define RT6_LOOKUP_F_REACHABLE 0x00000002
> @@ -66,10 +67,13 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
> (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK);
> }
>
> +/* fib entries using a nexthop object can not be coalesced into
> + * a multipath route
> + */
> static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i)
> {
> /* the RTF_ADDRCONF flag filters out RA's */
> - return !(f6i->fib6_flags & RTF_ADDRCONF) &&
> + return !(f6i->fib6_flags & RTF_ADDRCONF) && !f6i->nh &&
> f6i->fib6_nh->fib_nh_gw_family;
> }
>
> @@ -275,8 +279,13 @@ static inline struct in6_addr *rt6_nexthop(struct rt6_info *rt,
>
> static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b)
> {
> - struct fib6_nh *nha = a->fib6_nh, *nhb = b->fib6_nh;
> + struct fib6_nh *nha, *nhb;
> +
> + if (a->nh || b->nh)
> + return nexthop_cmp(a->nh, b->nh);
>
> + nha = a->fib6_nh;
> + nhb = b->fib6_nh;
> return nha->fib_nh_dev == nhb->fib_nh_dev &&
> ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) &&
> !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws);
> diff --git a/include/net/nexthop.h b/include/net/nexthop.h
> index 2912a2d7a515..aff7b2410057 100644
> --- a/include/net/nexthop.h
> +++ b/include/net/nexthop.h
> @@ -10,6 +10,7 @@
> #define __LINUX_NEXTHOP_H
>
> #include <linux/netdevice.h>
> +#include <linux/route.h>
> #include <linux/types.h>
> #include <net/ip_fib.h>
> #include <net/ip6_fib.h>
> @@ -78,6 +79,7 @@ struct nh_group {
> struct nexthop {
> struct rb_node rb_node; /* entry on netns rbtree */
> struct list_head fi_list; /* v4 entries using nh */
> + struct list_head f6i_list; /* v6 entries using nh */
> struct list_head grp_list; /* nh group entries using this nh */
> struct net *net;
>
> @@ -255,4 +257,52 @@ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel)
>
> return &fi->fib_nh[nhsel];
> }
> +
> +/*
> + * IPv6 variants
> + */
> +int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
> + struct netlink_ext_ack *extack);
> +
> +static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
> +{
> + struct nh_info *nhi;
> +
> + if (nexthop_is_multipath(nh)) {
> + nh = nexthop_mpath_select(nh, 0);
> + if (!nh)
> + return NULL;
> + }
> +
> + nhi = rcu_dereference_rtnl(nh->nh_info);
> + if (nhi->family == AF_INET6)
> + return &nhi->fib6_nh;
> +
> + return NULL;
> +}
> +
> +static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
> +{
> + struct fib6_nh *fib6_nh;
> +
> + fib6_nh = f6i->nh ? nexthop_fib6_nh(f6i->nh) : f6i->fib6_nh;
> + return fib6_nh->fib_nh_dev;
> +}
> +
> +static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash)
> +{
> + struct nexthop *nh = res->f6i->nh;
> + struct nh_info *nhi;
> +
> + nh = nexthop_select_path(nh, hash);
> +
> + nhi = rcu_dereference_rtnl(nh->nh_info);
> + if (nhi->reject_nh) {
> + res->fib6_type = RTN_BLACKHOLE;
> + res->fib6_flags |= RTF_REJECT;
> + res->nh = nexthop_fib6_nh(nh);
> + } else {
> + res->nh = &nhi->fib6_nh;
> + }
> +}
> #endif
> diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
> index 63cbb04f697f..5e48762b6b5f 100644
> --- a/net/ipv4/nexthop.c
> +++ b/net/ipv4/nexthop.c
> @@ -106,6 +106,7 @@ static struct nexthop *nexthop_alloc(void)
> nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL);
> if (nh) {
> INIT_LIST_HEAD(&nh->fi_list);
> + INIT_LIST_HEAD(&nh->f6i_list);
> INIT_LIST_HEAD(&nh->grp_list);
> }
> return nh;
> @@ -516,6 +517,41 @@ struct nexthop *nexthop_select_path(struct nexthop *nh, int hash)
> }
> EXPORT_SYMBOL_GPL(nexthop_select_path);
>
> +int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
> + struct netlink_ext_ack *extack)
> +{
> + struct nh_info *nhi;
> +
> + /* fib6_src is unique to a fib6_info and limits the ability to cache
> + * routes in fib6_nh within a nexthop that is potentially shared
> + * across multiple fib entries. If the config wants to use source
> + * routing it can not use nexthop objects. mlxsw also does not allow
> + * fib6_src on routes.
> + */
> + if (!ipv6_addr_any(&cfg->fc_src)) {
> + NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects");
> + return -EINVAL;
> + }
> +
> + if (nh->is_group) {
> + struct nh_group *nhg;
> +
> + nhg = rtnl_dereference(nh->nh_grp);
> + if (nhg->has_v4)
> + goto no_v4_nh;
> + } else {
> + nhi = rtnl_dereference(nh->nh_info);
> + if (nhi->family == AF_INET)
> + goto no_v4_nh;
> + }
> +
> + return 0;
> +no_v4_nh:
> + NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop");
> + return -EINVAL;
> +}
> +EXPORT_SYMBOL_GPL(fib6_check_nexthop);
> +
> static int nexthop_check_scope(struct nexthop *nh, u8 scope,
> struct netlink_ext_ack *extack)
> {
> @@ -658,6 +694,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
>
> static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
> {
> + struct fib6_info *f6i, *tmp;
> bool do_flush = false;
> struct fib_info *fi;
>
> @@ -667,6 +704,13 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
> }
> if (do_flush)
> fib_flush(net);
> +
> + /* ip6_del_rt removes the entry from this list hence the _safe */
> + list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
> + /* __ip6_del_rt does a release, so do a hold here */
> + fib6_info_hold(f6i);
Do we need fib6_info_hold_safe() here?
>
> + ipv6_stub->ip6_del_rt(net, f6i);
> + }
> }
>
> static void __remove_nexthop(struct net *net, struct nexthop *nh,
> diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
> index 6b673d4f5ca9..7549e779335d 100644
> --- a/net/ipv6/addrconf.c
> +++ b/net/ipv6/addrconf.c
> @@ -2421,6 +2421,10 @@ static struct fib6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
> goto out;
>
> for_each_fib6_node_rt_rcu(fn) {
> + /* prefix routes only use builtin fib6_nh */
> + if (rt->nh)
> + continue;
> +
> if (rt->fib6_nh->fib_nh_dev->ifindex != dev->ifindex)
> continue;
> if (no_gw && rt->fib6_nh->fib_nh_gw_family)
> @@ -6354,6 +6358,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
> list_for_each_entry(ifa, &idev->addr_list, if_list) {
> spin_lock(&ifa->lock);
> if (ifa->rt) {
> + /* host routes only use builtin fib6_nh */
> struct fib6_nh *nh = ifa->rt->fib6_nh;
> int cpu;
>
> diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> index cdfb8500ccae..02feda73a98e 100644
> --- a/net/ipv6/ip6_fib.c
> +++ b/net/ipv6/ip6_fib.c
> @@ -159,6 +159,7 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
> if (!f6i)
> return NULL;
>
> + /* fib6_siblings is a union with nh_list, so this initializes both */
> INIT_LIST_HEAD(&f6i->fib6_siblings);
> refcount_set(&f6i->fib6_ref, 1);
>
> @@ -171,7 +172,11 @@ void fib6_info_destroy_rcu(struct rcu_head *head)
>
> WARN_ON(f6i->fib6_node);
>
> - fib6_nh_release(f6i->fib6_nh);
> + if (f6i->nh)
> + nexthop_put(f6i->nh);
> + else
> + fib6_nh_release(f6i->fib6_nh);
> +
> ip_fib_metrics_put(f6i->fib6_metrics);
> kfree(f6i);
> }
> @@ -927,6 +932,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
>
> fib6_drop_pcpu_from(rt, table);
>
> + if (rt->nh && !list_empty(&rt->nh_list))
> + list_del_init(&rt->nh_list);
> +
> if (refcount_read(&rt->fib6_ref) != 1) {
> /* This route is used as dummy address holder in some split
> * nodes. It is not leaked, but it still holds other resources,
> @@ -1334,6 +1342,8 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
>
> err = fib6_add_rt2node(fn, rt, info, extack);
> if (!err) {
> + if (rt->nh)
> + list_add(&rt->nh_list, &rt->nh->f6i_list);
> __fib6_update_sernum_upto_root(rt, sernum);
> fib6_start_gc(info->nl_net, rt);
> }
> @@ -2295,9 +2305,13 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
> {
> struct fib6_info *rt = v;
> struct ipv6_route_iter *iter = seq->private;
> + struct fib6_nh *fib6_nh = rt->fib6_nh;
> unsigned int flags = rt->fib6_flags;
> const struct net_device *dev;
>
> + if (rt->nh)
> + fib6_nh = nexthop_fib6_nh(rt->nh);
> +
> seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
>
> #ifdef CONFIG_IPV6_SUBTREES
> @@ -2305,14 +2319,14 @@ static int ipv6_route_seq_show(struct seq_file *seq, void *v)
> #else
> seq_puts(seq, "00000000000000000000000000000000 00 ");
> #endif
> - if (rt->fib6_nh->fib_nh_gw_family) {
> + if (fib6_nh->fib_nh_gw_family) {
> flags |= RTF_GATEWAY;
> - seq_printf(seq, "%pi6", &rt->fib6_nh->fib_nh_gw6);
> + seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
> } else {
> seq_puts(seq, "00000000000000000000000000000000");
> }
>
> - dev = rt->fib6_nh->fib_nh_dev;
> + dev = fib6_nh->fib_nh_dev;
> seq_printf(seq, " %08x %08x %08x %08x %8s\n",
> rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
> flags, dev ? dev->name : "");
> diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
> index f874dde1ee85..6e3c51109c83 100644
> --- a/net/ipv6/ndisc.c
> +++ b/net/ipv6/ndisc.c
> @@ -1289,9 +1289,8 @@ static void ndisc_router_discovery(struct sk_buff *skb)
> !in6_dev->cnf.accept_ra_rtr_pref)
> pref = ICMPV6_ROUTER_PREF_MEDIUM;
> #endif
> -
> + /* routes added from RAs do not use nexthop objects */
> rt = rt6_get_dflt_router(net, &ipv6_hdr(skb)->saddr, skb->dev);
> -
> if (rt) {
> neigh = ip6_neigh_lookup(&rt->fib6_nh->fib_nh_gw6,
> rt->fib6_nh->fib_nh_dev, NULL,
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index fada5a13bcb2..51cb5cb027ae 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -432,15 +432,21 @@ void fib6_select_path(const struct net *net, struct fib6_result *res,
> struct fib6_info *sibling, *next_sibling;
> struct fib6_info *match = res->f6i;
>
> - if (!match->fib6_nsiblings || have_oif_match)
> + if ((!match->fib6_nsiblings && !match->nh) || have_oif_match)
> goto out;
So you mentioned that fib6_nsiblings and nexthop are mutually exclusive.
Is that enforced when the route is configured?
>
>
> /* We might have already computed the hash for ICMPv6 errors. In such
> * case it will always be non-zero. Otherwise now is the time to do it.
> */
> - if (!fl6->mp_hash)
> + if (!fl6->mp_hash &&
> + (!match->nh || nexthop_is_multipath(match->nh)))
> fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
>
> + if (unlikely(match->nh)) {
> + nexthop_path_fib6_result(res, fl6->mp_hash);
> + return;
> + }
> +
> if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound))
> goto out;
>
> @@ -496,7 +502,13 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
> struct fib6_nh *nh;
>
> if (!oif && ipv6_addr_any(saddr)) {
> - nh = f6i->fib6_nh;
> + if (unlikely(f6i->nh)) {
> + nh = nexthop_fib6_nh(f6i->nh);
> + if (nexthop_is_blackhole(f6i->nh))
> + goto out_blackhole;
> + } else {
> + nh = f6i->fib6_nh;
> + }
> if (!(nh->fib_nh_flags & RTNH_F_DEAD))
> goto out;
> }
> @@ -515,7 +527,14 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
> goto out;
> }
>
> - nh = f6i->fib6_nh;
> + if (unlikely(f6i->nh)) {
> + nh = nexthop_fib6_nh(f6i->nh);
> + if (nexthop_is_blackhole(f6i->nh))
> + goto out_blackhole;
> + } else {
> + nh = f6i->fib6_nh;
> + }
> +
> if (nh->fib_nh_flags & RTNH_F_DEAD) {
> res->f6i = net->ipv6.fib6_null_entry;
> nh = res->f6i->fib6_nh;
> @@ -524,6 +543,12 @@ static void rt6_device_match(struct net *net, struct fib6_result *res,
> res->nh = nh;
> res->fib6_type = res->f6i->fib6_type;
> res->fib6_flags = res->f6i->fib6_flags;
> + return;
> +
> +out_blackhole:
> + res->fib6_flags |= RTF_REJECT;
> + res->fib6_type = RTN_BLACKHOLE;
> + res->nh = nh;
> }
>
> #ifdef CONFIG_IPV6_ROUTER_PREF
> @@ -1117,6 +1142,8 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
> rt = net->ipv6.ip6_null_entry;
> dst_hold(&rt->dst);
> goto out;
> + } else if (res.fib6_flags & RTF_REJECT) {
> + goto do_create;
>
> }
>
> fib6_select_path(net, &res, fl6, fl6->flowi6_oif,
> @@ -1128,6 +1155,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net *net,
> if (ip6_hold_safe(net, &rt))
> dst_use_noref(&rt->dst, jiffies);
> } else {
> +do_create:
> rt = ip6_create_rt_rcu(&res);
> }
>
> @@ -1982,6 +2010,14 @@ struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
> rcu_read_unlock();
> dst_hold(&rt->dst);
> return rt;
> + } else if (res.fib6_flags & RTF_REJECT) {
> + rt = ip6_create_rt_rcu(&res);
> + rcu_read_unlock();
> + if (!rt) {
> + rt = net->ipv6.ip6_null_entry;
> + dst_hold(&rt->dst);
> + }
> + return rt;
> }
>
Why do we need to call ip6_create_rt_rcu() to create a dst cache? Can
we directly return ip6_null_entry here? This route is anyway meant to
drop the packet. Same goes for the change in ip6_pol_route_lookup().
And for my education, how does this new nexthop logic interact with
the pcpu_rt cache and the exception table? Those 2 are currently
stored in struct fib6_nh. They are shared with fib6_siblings under the
same fib6_info. Are they also shared with nexthop for the same
fib6_info?
I don't see many changes around that area. So I assume they work as is?
>
> fib6_select_path(net, &res, fl6, oif, false, skb, strict);
> @@ -3217,7 +3253,9 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
> {
> struct net *net = cfg->fc_nlinfo.nl_net;
> struct fib6_info *rt = NULL;
> + struct nexthop *nh = NULL;
> struct fib6_table *table;
> + struct fib6_nh *fib6_nh;
> int err = -EINVAL;
> int addr_type;
>
> @@ -3270,7 +3308,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
> goto out;
>
> err = -ENOMEM;
> - rt = fib6_info_alloc(gfp_flags, true);
> + rt = fib6_info_alloc(gfp_flags, !nh);
> if (!rt)
> goto out;
>
> @@ -3310,19 +3348,35 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
> ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
> rt->fib6_src.plen = cfg->fc_src_len;
> #endif
> - err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
> - if (err)
> - goto out;
> + if (nh) {
> + if (!nexthop_get(nh)) {
> + NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
> + goto out;
> + }
> + if (rt->fib6_src.plen) {
> + NL_SET_ERR_MSG(extack, "Nexthops can not be used wtih source routing");
> + goto out;
> + }
> + rt->nh = nh;
> + fib6_nh = nexthop_fib6_nh(rt->nh);
> + } else {
> + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
> + if (err)
> + goto out;
>
> - /* We cannot add true routes via loopback here,
> - * they would result in kernel looping; promote them to reject routes
> - */
> - addr_type = ipv6_addr_type(&cfg->fc_dst);
> - if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev, addr_type))
> - rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
> + fib6_nh = rt->fib6_nh;
> +
> + /* We cannot add true routes via loopback here, they would
> + * result in kernel looping; promote them to reject routes
> + */
> + addr_type = ipv6_addr_type(&cfg->fc_dst);
> + if (fib6_is_reject(cfg->fc_flags, rt->fib6_nh->fib_nh_dev,
> + addr_type))
> + rt->fib6_flags = RTF_REJECT | RTF_NONEXTHOP;
> + }
>
> if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
> - struct net_device *dev = fib6_info_nh_dev(rt);
> + struct net_device *dev = fib6_nh->fib_nh_dev;
>
> if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
> NL_SET_ERR_MSG(extack, "Invalid source address");
> @@ -3678,6 +3732,9 @@ static struct fib6_info *rt6_get_route_info(struct net *net,
> goto out;
>
> for_each_fib6_node_rt_rcu(fn) {
> + /* these routes do not use nexthops */
> + if (rt->nh)
> + continue;
> if (rt->fib6_nh->fib_nh_dev->ifindex != ifindex)
> continue;
> if (!(rt->fib6_flags & RTF_ROUTEINFO) ||
> @@ -3741,8 +3798,13 @@ struct fib6_info *rt6_get_dflt_router(struct net *net,
>
> rcu_read_lock();
> for_each_fib6_node_rt_rcu(&table->tb6_root) {
> - struct fib6_nh *nh = rt->fib6_nh;
> + struct fib6_nh *nh;
> +
> + /* RA routes do not use nexthops */
> + if (rt->nh)
> + continue;
>
> + nh = rt->fib6_nh;
> if (dev == nh->fib_nh_dev &&
> ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
> ipv6_addr_equal(&nh->fib_nh_gw6, addr))
> @@ -3993,7 +4055,8 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
> struct net *net = ((struct arg_dev_net_ip *)arg)->net;
> struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
>
> - if (((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
> + if (!rt->nh &&
> + ((void *)rt->fib6_nh->fib_nh_dev == dev || !dev) &&
> rt != net->ipv6.fib6_null_entry &&
> ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
> spin_lock_bh(&rt6_exception_lock);
> @@ -4021,8 +4084,13 @@ void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
> static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
> {
> struct in6_addr *gateway = (struct in6_addr *)arg;
> - struct fib6_nh *nh = rt->fib6_nh;
> + struct fib6_nh *nh;
>
> + /* RA routes do not use nexthops */
> + if (rt->nh)
> + return 0;
> +
> + nh = rt->fib6_nh;
> if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
> nh->fib_nh_gw_family && ipv6_addr_equal(gateway, &nh->fib_nh_gw6))
> return -1;
> @@ -4069,6 +4137,7 @@ static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
> return NULL;
> }
>
> +/* only called for fib entries with builtin fib6_nh */
> static bool rt6_is_dead(const struct fib6_info *rt)
> {
> if (rt->fib6_nh->fib_nh_flags & RTNH_F_DEAD ||
> @@ -4147,7 +4216,7 @@ static int fib6_ifup(struct fib6_info *rt, void *p_arg)
> const struct arg_netdev_event *arg = p_arg;
> struct net *net = dev_net(arg->dev);
>
> - if (rt != net->ipv6.fib6_null_entry &&
> + if (rt != net->ipv6.fib6_null_entry && !rt->nh &&
> rt->fib6_nh->fib_nh_dev == arg->dev) {
> rt->fib6_nh->fib_nh_flags &= ~arg->nh_flags;
> fib6_update_sernum_upto_root(net, rt);
> @@ -4172,6 +4241,7 @@ void rt6_sync_up(struct net_device *dev, unsigned char nh_flags)
> fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
> }
>
> +/* only called for fib entries with inline fib6_nh */
> static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
> const struct net_device *dev)
> {
> @@ -4232,7 +4302,7 @@ static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
> const struct net_device *dev = arg->dev;
> struct net *net = dev_net(dev);
>
> - if (rt == net->ipv6.fib6_null_entry)
> + if (rt == net->ipv6.fib6_null_entry || rt->nh)
> return 0;
>
> switch (arg->event) {
> @@ -4786,6 +4856,9 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
> {
> int nexthop_len = 0;
>
> + if (rt->nh)
> + nexthop_len += nla_total_size(4); /* RTA_NH_ID */
> +
> if (rt->fib6_nsiblings) {
> nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
> + NLA_ALIGN(sizeof(struct rtnexthop))
> @@ -4812,6 +4885,35 @@ static size_t rt6_nlmsg_size(struct fib6_info *rt)
> + nexthop_len;
> }
>
> +static int rt6_fill_node_nexthop(struct sk_buff *skb, struct nexthop *nh,
> + unsigned char *flags)
> +{
> + if (nexthop_is_multipath(nh)) {
> + struct nlattr *mp;
> +
> + mp = nla_nest_start(skb, RTA_MULTIPATH);
> + if (!mp)
> + goto nla_put_failure;
> +
> + if (nexthop_mpath_fill_node(skb, nh))
> + goto nla_put_failure;
> +
> + nla_nest_end(skb, mp);
> + } else {
> + struct fib6_nh *fib6_nh;
> +
> + fib6_nh = nexthop_fib6_nh(nh);
> + if (fib_nexthop_info(skb, &fib6_nh->nh_common,
> + flags, false) < 0)
> + goto nla_put_failure;
> + }
> +
> + return 0;
> +
> +nla_put_failure:
> + return -EMSGSIZE;
> +}
> +
> static int rt6_fill_node(struct net *net, struct sk_buff *skb,
> struct fib6_info *rt, struct dst_entry *dst,
> struct in6_addr *dest, struct in6_addr *src,
> @@ -4821,6 +4923,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
> struct rt6_info *rt6 = (struct rt6_info *)dst;
> struct rt6key *rt6_dst, *rt6_src;
> u32 *pmetrics, table, rt6_flags;
> + unsigned char nh_flags = 0;
> struct nlmsghdr *nlh;
> struct rtmsg *rtm;
> long expires = 0;
> @@ -4940,9 +5043,18 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb,
> }
>
> nla_nest_end(skb, mp);
> - } else {
> - unsigned char nh_flags = 0;
> + } else if (rt->nh) {
> + if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id))
> + goto nla_put_failure;
> +
> + if (nexthop_is_blackhole(rt->nh))
> + rtm->rtm_type = RTN_BLACKHOLE;
>
> + if (rt6_fill_node_nexthop(skb, rt->nh, &nh_flags) < 0)
> + goto nla_put_failure;
> +
> + rtm->rtm_flags |= nh_flags;
> + } else {
> if (fib_nexthop_info(skb, &rt->fib6_nh->nh_common,
> &nh_flags, false) < 0)
> goto nla_put_failure;
> --
> 2.11.0
>
Powered by blists - more mailing lists