commit edc38ecab7101487b35fc9152e166a2670467e49 Author: Cong Wang Date: Tue May 9 14:35:10 2017 -0700 ipv4: restore rt->fi for reference counting, try #2 Signed-off-by: Cong Wang diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 6692c57..0f04f4d 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -392,6 +392,7 @@ int fib_unmerge(struct net *net); /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); +void fib_put_nh_devs(struct net_device *dev); int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); diff --git a/include/net/route.h b/include/net/route.h index 2cc0e14..4335eb7 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -69,6 +69,7 @@ struct rtable { struct list_head rt_uncached; struct uncached_list *rt_uncached_list; + struct fib_info *fi; /* for refcnt to shared metrics */ }; static inline bool rt_is_input_route(const struct rtable *rt) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 39bd1ed..59b0a1d 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1178,6 +1178,9 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo fib_disable_ip(dev, event, true); rt_flush_dev(dev); return NOTIFY_DONE; + } else if (event == NETDEV_UNREGISTER_FINAL) { + fib_put_nh_devs(dev); + return NOTIFY_DONE; } in_dev = __in_dev_get_rtnl(dev); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index da449dd..b85c5bb 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1453,6 +1453,36 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force) return ret; } +/* We have to release these nh_dev here because a dst could still hold a + * fib_info via rt->fi, we can't wait for GC, a socket could hold the dst + * for a long time. + */ +void fib_put_nh_devs(struct net_device *dev) +{ + struct fib_info *prev_fi = NULL; + unsigned int hash = fib_devindex_hashfn(dev->ifindex); + struct hlist_head *head = &fib_info_devhash[hash]; + struct fib_nh *nh; + + hlist_for_each_entry(nh, head, nh_hash) { + struct fib_info *fi = nh->nh_parent; + + if (nh->nh_dev != dev || fi == prev_fi) + continue; + prev_fi = fi; + change_nexthops(fi) { + if (nexthop_nh->nh_dev == dev) { + /* This should be safe, we are on unregister + * path, after synchronize_net() and + * rcu_barrier(), no one could see this. + */ + RCU_INIT_POINTER(nexthop_nh->nh_dev, NULL); + dev_put(dev); + } + } endfor_nexthops(fi) + } +} + /* Must be invoked inside of an RCU protected region. */ static void fib_select_default(const struct flowi4 *flp, struct fib_result *res) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 655d9ee..f647310 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1387,6 +1387,11 @@ static void ipv4_dst_destroy(struct dst_entry *dst) { struct rtable *rt = (struct rtable *) dst; + if (rt->fi) { + fib_info_put(rt->fi); + rt->fi = NULL; + } + if (!list_empty(&rt->rt_uncached)) { struct uncached_list *ul = rt->rt_uncached_list; @@ -1424,6 +1429,16 @@ static bool rt_cache_valid(const struct rtable *rt) !rt_is_expired(rt); } +static void rt_init_metrics(struct rtable *rt, struct fib_info *fi) +{ + if (fi->fib_metrics != (u32 *)dst_default_metrics) { + fib_info_hold(fi); + rt->fi = fi; + } + + dst_init_metrics(&rt->dst, fi->fib_metrics, true); +} + static void rt_set_nexthop(struct rtable *rt, __be32 daddr, const struct fib_result *res, struct fib_nh_exception *fnhe, @@ -1438,7 +1453,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr, rt->rt_gateway = nh->nh_gw; rt->rt_uses_gateway = 1; } - dst_init_metrics(&rt->dst, fi->fib_metrics, true); + rt_init_metrics(rt, fi); #ifdef CONFIG_IP_ROUTE_CLASSID rt->dst.tclassid = nh->nh_tclassid; #endif @@ -1490,6 +1505,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev, rt->rt_gateway = 0; rt->rt_uses_gateway = 0; rt->rt_table_id = 0; + rt->fi = NULL; INIT_LIST_HEAD(&rt->rt_uncached); rt->dst.output = ip_output;