Message-ID: <4A12D10D.3000504@cosmosbay.com>
Date: Tue, 19 May 2009 17:32:29 +0200
From: Eric Dumazet <dada1@...mosbay.com>
To: Jarek Poplawski <jarkao2@...il.com>
CC: lav@....ru, Stephen Hemminger <shemminger@...ux-foundation.org>,
netdev@...r.kernel.org, Neil Horman <nhorman@...driver.com>
Subject: Re: Fw: [Bug 13339] New: rtable leak in ipv4/route.c

Jarek Poplawski wrote:
> On 19-05-2009 04:35, Stephen Hemminger wrote:
>> Begin forwarded message:
>>
>> Date: Mon, 18 May 2009 14:10:20 GMT
>> From: bugzilla-daemon@...zilla.kernel.org
>> To: shemminger@...ux-foundation.org
>> Subject: [Bug 13339] New: rtable leak in ipv4/route.c
>>
>>
>> http://bugzilla.kernel.org/show_bug.cgi?id=13339
> ...
>> The 2.6.29 patch introduced flexible route cache rebuilding. Unfortunately, the
>> patch has at least one critical flaw, and another problem.
>>
>> rt_intern_hash calculates the rthi pointer, which is later used to insert the
>> new entry. The same loop calculates the cand pointer, which is used to clean the
>> list. If the two pointers are the same, an rtable leak occurs: first cand is
>> removed, then the new entry is appended to it.
>>
>> This leak leads to an unregister_netdevice problem (usage count > 0).
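
To make the failure mode concrete: here is a stand-alone toy model of the
rthi == cand collision (user-space C with made-up names, not the kernel
structures). The cleanup step unlinks cand from the chain, the insertion
step then appends the new entry after rthi, and when the two alias the
same node the new entry is never reachable from the chain head.

#include <stdio.h>

struct node {
        int val;
        struct node *next;
};

int main(void)
{
        struct node c = { 1, NULL };            /* plays the role of cand */
        struct node *chain = &c;                /* hash chain head */
        struct node *cand = &c, *rthi = &c;     /* the two aliases collide */
        struct node rt = { 2, NULL };           /* the new entry */

        /* 1. cleanup, as in rt_intern_hash(): unlink cand from the chain */
        chain = cand->next;                     /* chain is now empty */

        /* 2. insertion: append the new entry after rthi, which is
         * exactly the node that was just unlinked */
        rt.next = rthi->next;
        rthi->next = &rt;

        /* rt is not reachable from the chain head: it has leaked */
        for (struct node *n = chain; n; n = n->next)
                printf("reachable: %d\n", n->val);      /* prints nothing */
        return 0;
}

With the rthi=0 fallback mentioned below, step 2 would degrade to plain
head insertion and nothing would be lost.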
>>
>> Another problem with the patch is that it tries to insert entries in a certain
>> order, to facilitate counting of entries that are distinct by all but QoS
>> parameters. Unfortunately, referencing an existing rtable entry moves it to the
>> beginning of the list, to speed up further lookups, so the carefully built
>> order is destroyed.

We could change rt_check_expire() to be smarter and handle any order in chains.
This would let rt_intern_hash() be simpler.
As it's a more performance-critical path, all would be good :)
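
For reference, the move-to-front behaviour described in the quoted report
is easy to model outside the kernel (a toy list with made-up names; in the
kernel it is the lookup fast path that promotes a cache hit):

#include <stdio.h>

struct node {
        char key;
        struct node *next;
};

/* unlink *pp and push it to the chain head, as a cache hit does */
static struct node *promote(struct node *head, struct node **pp)
{
        struct node *hit = *pp;

        *pp = hit->next;
        hit->next = head;
        return hit;
}

int main(void)
{
        /* a chain carefully ordered a, b, c */
        struct node c = { 'c', NULL }, b = { 'b', &c }, a = { 'a', &b };
        struct node *head = &a;

        /* a lookup hit on 'b' promotes it: the order becomes b, a, c */
        head = promote(head, &a.next);

        for (struct node *n = head; n; n = n->next)
                printf("%c ", n->key);
        printf("\n");                           /* prints: b a c */
        return 0;
}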
>>
>> For the first problem the simplest patch is to set rthi=0 when rthi==cand, but
>> it will also destroy the ordering.
>
> I think fixing this bug fast is more important than this
> ordering or counting. Could you send your patch proposal?
>
Here is mine, only compiled, not tested yet.
All credit to Stephen for doing the analysis, of course :)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index c4c60e9..fbe77ad 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -780,12 +780,37 @@ static void rt_do_flush(int process_context)
 
 #define FRACT_BITS 3
 #define ONE (1UL << FRACT_BITS)
 
+static unsigned long compute_length(struct rtable *head)
+{
+        struct rtable *rth, *aux;
+        unsigned long length = 0;
+
+        for (rth = head; rth != NULL; rth = rth->u.dst.rt_next) {
+                /*
+                 * Count entries having the same hash inputs only
+                 * once, so that entries for different QOS
+                 * levels, and other non-hash input
+                 * attributes don't unfairly skew
+                 * the length computation
+                 */
+                for (aux = head; ; aux = aux->u.dst.rt_next) {
+                        if (aux == rth) {
+                                length += ONE;
+                                break;
+                        }
+                        if (compare_hash_inputs(&aux->fl, &rth->fl))
+                                break;
+                }
+        }
+        return length;
+}
+
 static void rt_check_expire(void)
 {
         static unsigned int rover;
         unsigned int i = rover, goal;
         struct rtable *rth, **rthp;
-        unsigned long length = 0, samples = 0;
+        unsigned long length, samples = 0;
         unsigned long sum = 0, sum2 = 0;
         u64 mult;
@@ -795,7 +820,6 @@ static void rt_check_expire(void)
         goal = (unsigned int)mult;
         if (goal > rt_hash_mask)
                 goal = rt_hash_mask + 1;
-        length = 0;
         for (; goal > 0; goal--) {
                 unsigned long tmo = ip_rt_gc_timeout;
 
@@ -821,29 +845,11 @@ static void rt_check_expire(void)
                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
                                         tmo >>= 1;
                                         rthp = &rth->u.dst.rt_next;
-                                        /*
-                                         * Only bump our length if the hash
-                                         * inputs on entries n and n+1 are not
-                                         * the same, we only count entries on
-                                         * a chain with equal hash inputs once
-                                         * so that entries for different QOS
-                                         * levels, and other non-hash input
-                                         * attributes don't unfairly skew
-                                         * the length computation
-                                         */
-                                        if ((*rthp == NULL) ||
-                                            !compare_hash_inputs(&(*rthp)->fl,
-                                                                 &rth->fl))
-                                                length += ONE;
                                         continue;
                                 }
                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                 tmo >>= 1;
                                 rthp = &rth->u.dst.rt_next;
-                                if ((*rthp == NULL) ||
-                                    !compare_hash_inputs(&(*rthp)->fl,
-                                                         &rth->fl))
-                                        length += ONE;
                                 continue;
                         }
 
@@ -851,6 +857,7 @@ static void rt_check_expire(void)
                         *rthp = rth->u.dst.rt_next;
                         rt_free(rth);
                 }
+                length = compute_length(rt_hash_table[i].chain);
                 spin_unlock_bh(rt_hash_lock_addr(i));
                 sum += length;
                 sum2 += length*length;
@@ -1068,7 +1075,6 @@ out: return 0;
 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 {
         struct rtable *rth, **rthp;
-        struct rtable *rthi;
         unsigned long now;
         struct rtable *cand, **candp;
         u32 min_score;
@@ -1088,7 +1094,6 @@ restart:
         }
 
         rthp = &rt_hash_table[hash].chain;
-        rthi = NULL;
 
         spin_lock_bh(rt_hash_lock_addr(hash));
         while ((rth = *rthp) != NULL) {
@@ -1134,17 +1139,6 @@ restart:
                 chain_length++;
 
                 rthp = &rth->u.dst.rt_next;
-
-                /*
-                 * check to see if the next entry in the chain
-                 * contains the same hash input values as rt. If it does
-                 * This is where we will insert into the list, instead of
-                 * at the head. This groups entries that differ by aspects not
-                 * relvant to the hash function together, which we use to adjust
-                 * our chain length
-                 */
-                if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
-                        rthi = rth;
         }
 
         if (cand) {
@@ -1205,10 +1199,7 @@ restart:
                 }
         }
 
-        if (rthi)
-                rt->u.dst.rt_next = rthi->u.dst.rt_next;
-        else
-                rt->u.dst.rt_next = rt_hash_table[hash].chain;
+        rt->u.dst.rt_next = rt_hash_table[hash].chain;
 
 #if RT_CACHE_DEBUG >= 2
         if (rt->u.dst.rt_next) {
@@ -1224,10 +1215,7 @@ restart:
          * previous writes to rt are comitted to memory
          * before making rt visible to other CPUS.
          */
-        if (rthi)
-                rcu_assign_pointer(rthi->u.dst.rt_next, rt);
-        else
-                rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+        rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 
         spin_unlock_bh(rt_hash_lock_addr(hash));
         *rp = rt;
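
As a quick user-space sanity check of the counting logic above (toy types
and made-up names; a plain int comparison stands in for
compare_hash_inputs()): chains holding the same entries in different
orders yield the same length, which is what lets rt_intern_hash() stop
maintaining the insertion order.

#include <stdio.h>

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

struct toy_rt {
        int hash_inputs;                /* stand-in for rth->fl */
        struct toy_rt *next;
};

static unsigned long compute_length(struct toy_rt *head)
{
        struct toy_rt *rth, *aux;
        unsigned long length = 0;

        for (rth = head; rth != NULL; rth = rth->next) {
                for (aux = head; ; aux = aux->next) {
                        if (aux == rth) {       /* first of its kind */
                                length += ONE;
                                break;
                        }
                        if (aux->hash_inputs == rth->hash_inputs)
                                break;          /* duplicate, skip it */
                }
        }
        return length;
}

int main(void)
{
        /* three entries, two distinct hash inputs, in two orders */
        struct toy_rt c = { 1, NULL }, b = { 2, &c }, a = { 1, &b };
        struct toy_rt f = { 1, NULL }, e = { 1, &f }, d = { 2, &e };

        /* 1,2,1 and 2,1,1 both count two distinct inputs */
        printf("%lu %lu\n", compute_length(&a) >> FRACT_BITS,
               compute_length(&d) >> FRACT_BITS);       /* prints: 2 2 */
        return 0;
}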
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html