Message-Id: <20101219.211453.226783693.davem@davemloft.net>
Date: Sun, 19 Dec 2010 21:14:53 -0800 (PST)
From: David Miller <davem@...emloft.net>
To: eric.dumazet@...il.com
Cc: ebiederm@...ssion.com, netdev@...r.kernel.org,
daniel.lezcano@...e.fr
Subject: Re: [PATCH] ipv4: Flush per-ns routing cache more sanely.
From: Eric Dumazet <eric.dumazet@...il.com>
Date: Tue, 26 Oct 2010 21:30:22 +0200
> On Tuesday, October 26, 2010 at 12:20 -0700, David Miller wrote:
>> From: ebiederm@...ssion.com (Eric W. Biederman)
>> Date: Tue, 26 Oct 2010 12:05:39 -0700
>>
>> >> @@ -999,7 +999,7 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
>> >>  		rt_cache_flush(dev_net(dev), 0);
>> >>  		break;
>> >>  	case NETDEV_UNREGISTER_BATCH:
>> >> -		rt_cache_flush_batch();
>> >> +		rt_cache_flush_batch(dev_net(dev));
>> >
>> > It still has this incorrect conversion in it.
>>
>> Sorry I missed that, what's the exact problem with it?
>
> Because of the way the _BATCH operation is performed, we only call it once...
>
> rollback_registered_many() calls it for the first dev queued in the
> list.
>
> So it should be net independent.
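
For reference, rollback_registered_many() raises the batch notifier a single time, using the first device on the list purely as a handle; the devices on that list can belong to different network namespaces. A paraphrased sketch of the relevant part of net/core/dev.c (not the verbatim source):

static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev;
	...
	/* One NETDEV_UNREGISTER_BATCH notification covers the whole
	 * batch; "dev" is just the first queued entry, so its netns
	 * says nothing about the rest of the list.
	 */
	dev = list_first_entry(head, struct net_device, unreg_list);
	call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
	...
}
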
Thanks Eric. I finally got back to fixing this issue and respinning
the patch.
Please review, in particular how I handled the RCU bits.
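
For readers less familiar with the pattern, the core of the new rt_do_flush() is: walk the hash chain under the per-bucket lock, unlink every entry that belongs to the target namespace (or every entry when net is NULL) onto a private list, drop the lock, and only then hand the collected entries to rt_free(). Below is a minimal userspace model of that splice-then-free structure; every name in it is invented for illustration, and it deliberately does not model the RCU side (in the kernel the unlink goes through rcu_assign_pointer() and rt_free() defers the actual release to RCU):

/* Toy model of "unlink matching entries under the lock, free them
 * after the lock is dropped".  Illustration only, not kernel code.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int net_id;		/* stand-in for dev_net(rth->dst.dev) */
	struct entry *next;
};

static struct entry *chain;
static pthread_mutex_t chain_lock = PTHREAD_MUTEX_INITIALIZER;

/* Flush entries belonging to net_id; a negative net_id means
 * "flush everything", mirroring the net == NULL case in the patch.
 */
static void flush_chain(int net_id)
{
	struct entry **pprev, *e, *next, *list = NULL;

	pthread_mutex_lock(&chain_lock);
	pprev = &chain;
	for (e = chain; e; e = next) {
		next = e->next;
		if (net_id < 0 || e->net_id == net_id) {
			*pprev = next;	/* unlink from the shared chain */
			e->next = list;	/* splice onto the private list */
			list = e;
		} else {
			pprev = &e->next;
		}
	}
	pthread_mutex_unlock(&chain_lock);

	/* The potentially expensive free work runs outside the lock. */
	for (e = list; e; e = next) {
		next = e->next;
		free(e);
	}
}

int main(void)
{
	/* Build a chain alternating between two "namespaces". */
	for (int i = 0; i < 6; i++) {
		struct entry *e = malloc(sizeof(*e));
		if (!e)
			return 1;
		e->net_id = i % 2;
		e->next = chain;
		chain = e;
	}

	flush_chain(1);		/* drop only the net_id == 1 entries */

	for (struct entry *e = chain; e; e = e->next)
		printf("kept entry in net %d\n", e->net_id);
	return 0;
}

Passing a negative net_id here plays the role of the net == NULL "flush everything" case in the patch.
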
--------------------
ipv4: Flush per-ns routing cache more sanely.

Flush the routing cache only of entries that match the
network namespace in which the purge event occurred.

Signed-off-by: David S. Miller <davem@...emloft.net>
---
 include/net/route.h     |    2 +-
 net/ipv4/fib_frontend.c |    6 +++-
 net/ipv4/route.c        |   68 ++++++++++++++++++-----------------------------
 3 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 2700236..93e10c4 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -114,7 +114,7 @@ extern int ip_rt_init(void);
 extern void		ip_rt_redirect(__be32 old_gw, __be32 dst, __be32 new_gw,
 				       __be32 src, struct net_device *dev);
 extern void		rt_cache_flush(struct net *net, int how);
-extern void		rt_cache_flush_batch(void);
+extern void		rt_cache_flush_batch(struct net *net);
 extern int		__ip_route_output_key(struct net *, struct rtable **, const struct flowi *flp);
 extern int		ip_route_output_key(struct net *, struct rtable **, struct flowi *flp);
 extern int		ip_route_output_flow(struct net *, struct rtable **rp, struct flowi *flp, struct sock *sk, int flags);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index d3a1112..9f8bb68 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -987,7 +987,11 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
 		rt_cache_flush(dev_net(dev), 0);
 		break;
 	case NETDEV_UNREGISTER_BATCH:
-		rt_cache_flush_batch();
+		/* The batch unregister is only called on the first
+		 * device in the list of devices being unregistered.
+		 * Therefore we should not pass dev_net(dev) in here.
+		 */
+		rt_cache_flush_batch(NULL);
 		break;
 	}
 	return NOTIFY_DONE;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ae52096..7c87d8e 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -717,13 +717,15 @@ static inline int rt_is_expired(struct rtable *rth)
  * Can be called by a softirq or a process.
  * In the later case, we want to be reschedule if necessary
  */
-static void rt_do_flush(int process_context)
+static void rt_do_flush(struct net *net, int process_context)
 {
 	unsigned int i;
 	struct rtable *rth, *next;
-	struct rtable * tail;
 
 	for (i = 0; i <= rt_hash_mask; i++) {
+		struct rtable __rcu **pprev;
+		struct rtable *list;
+
 		if (process_context && need_resched())
 			cond_resched();
 		rth = rcu_dereference_raw(rt_hash_table[i].chain);
@@ -731,52 +733,34 @@ static void rt_do_flush(int process_context)
 			continue;
 
 		spin_lock_bh(rt_hash_lock_addr(i));
-#ifdef CONFIG_NET_NS
-		{
-		struct rtable __rcu **prev;
-		struct rtable *p;
 
-		rth = rcu_dereference_protected(rt_hash_table[i].chain,
+		list = NULL;
+		pprev = &rt_hash_table[i].chain;
+		rth = rcu_dereference_protected(*pprev,
 			lockdep_is_held(rt_hash_lock_addr(i)));
 
-		/* defer releasing the head of the list after spin_unlock */
-		for (tail = rth; tail;
-		     tail = rcu_dereference_protected(tail->dst.rt_next,
-				lockdep_is_held(rt_hash_lock_addr(i))))
-			if (!rt_is_expired(tail))
-				break;
-		if (rth != tail)
-			rt_hash_table[i].chain = tail;
-
-		/* call rt_free on entries after the tail requiring flush */
-		prev = &rt_hash_table[i].chain;
-		for (p = rcu_dereference_protected(*prev,
-			lockdep_is_held(rt_hash_lock_addr(i)));
-		     p != NULL;
-		     p = next) {
-			next = rcu_dereference_protected(p->dst.rt_next,
+		while (rth) {
+			next = rcu_dereference_protected(rth->dst.rt_next,
 				lockdep_is_held(rt_hash_lock_addr(i)));
-			if (!rt_is_expired(p)) {
-				prev = &p->dst.rt_next;
+
+			if (!net ||
+			    net_eq(dev_net(rth->dst.dev), net)) {
+				rcu_assign_pointer(*pprev, next);
+				rcu_assign_pointer(rth->dst.rt_next, list);
+				list = rth;
 			} else {
-				*prev = next;
-				rt_free(p);
+				pprev = &rth->dst.rt_next;
 			}
+			rth = next;
 		}
-		}
-#else
-		rth = rcu_dereference_protected(rt_hash_table[i].chain,
-			lockdep_is_held(rt_hash_lock_addr(i)));
-		rcu_assign_pointer(rt_hash_table[i].chain, NULL);
-		tail = NULL;
-#endif
+
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
-		for (; rth != tail; rth = next) {
-			next = rcu_dereference_protected(rth->dst.rt_next, 1);
-			rt_free(rth);
-		}
-	}
+		for (; list; list = next) {
+			next = rcu_dereference_protected(list->dst.rt_next, 1);
+			rt_free(list);
+		}
+	}
 }
 
 /*
@@ -922,13 +906,13 @@ void rt_cache_flush(struct net *net, int delay)
 {
 	rt_cache_invalidate(net);
 	if (delay >= 0)
-		rt_do_flush(!in_softirq());
+		rt_do_flush(net, !in_softirq());
 }
 
 /* Flush previous cache invalidated entries from the cache */
-void rt_cache_flush_batch(void)
+void rt_cache_flush_batch(struct net *net)
 {
-	rt_do_flush(!in_softirq());
+	rt_do_flush(net, !in_softirq());
 }
 
 static void rt_emergency_hash_rebuild(struct net *net)
--
1.7.3.4