linux-kernel - Re: init_timer_deferrable conversion

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20071217152943.10470215.dada1@cosmosbay.com>
Date:	Mon, 17 Dec 2007 15:29:43 +0100
From:	Eric Dumazet <dada1@...mosbay.com>
To:	"parag.warudkar@...il.com" <parag.warudkar@...il.com>
Cc:	linux-kernel@...r.kernel.org, David Miller <davem@...emloft.net>,
	"netdev@...r.kernel.org" <netdev@...r.kernel.org>
Subject: Re: init_timer_deferrable conversion

On Mon, 17 Dec 2007 09:55:04 +0100
Eric Dumazet <dada1@...mosbay.com> wrote:

> On Sun, 16 Dec 2007 22:00:23 -0500 (EST)
> Parag Warudkar <parag.warudkar@...il.com> wrote:
> 
> > In my quest to get the wake-ups from idle per second down to bare minimum, 
> > I noticed 3 places in the kernel that could benefit from 
> > using init_timer_deferrable() instead of init_timer() -
> > 
> > a) drivers/net/sky2.c - watchdog_timer. This was showing up high on 
> > Powertop's list of things that cause routine wakeups from idle. After 
> > converting to init_timer_deferrable() the wakeups went down and this one 
> > no longer shows up in powertop's list. 25% reduction.
> > 
> > b) kernel/time/clocksource.c - watchdog_timer - same story as sky2.c
> > 
> > c) net/core/neighbour.c - gc_timer - Most benefit from deferrable timer.
> 
> neigh_periodic_timer() is actually doing almost nothing per round, since it
> looks only one slot of hash table. We could probably convert it to a
> workqueue and scan whole table at once.
> 

Parag, could you please try this patch ?

[NET] ARP : Convert neigh garbage collection from softirq to workqueue

Current neigh_periodic_timer() function is fired by timer IRQ, and
scans one hash bucket out of a potentially big number.

As we are supposed to scan whole hash table in 15 seconds, this means
neigh_periodic_timer() can be fired very often. (depending on the number
of concurrent hash entries we stored in this table)

Converting this to a workqueue permits scaning whole table, minimizing icache
pollution, and firing this work every 15 seconds, independantly of hash table
size.

Signed-off-by: Eric Dumazet <dada1@...mosbay.com>

 include/net/neighbour.h |    4 -
 net/core/neighbour.c    |   89 ++++++++++++++++++--------------------
 2 files changed, 45 insertions(+), 48 deletions(-)

diff --git a/include/net/neighbour.h b/include/net/neighbour.h
index a4f2618..fdb9251 100644
--- a/include/net/neighbour.h
+++ b/include/net/neighbour.h
@@ -24,6 +24,7 @@
 
 #include <linux/err.h>
 #include <linux/sysctl.h>
+#include <linux/workqueue.h>
 #include <net/rtnetlink.h>
 
 #define NUD_IN_TIMER	(NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE)
@@ -155,7 +156,7 @@ struct neigh_table
 	int			gc_thresh2;
 	int			gc_thresh3;
 	unsigned long		last_flush;
-	struct timer_list 	gc_timer;
+	struct delayed_work	gc_work;
 	struct timer_list 	proxy_timer;
 	struct sk_buff_head	proxy_queue;
 	atomic_t		entries;
@@ -166,7 +167,6 @@ struct neigh_table
 	struct neighbour	**hash_buckets;
 	unsigned int		hash_mask;
 	__u32			hash_rnd;
-	unsigned int		hash_chain_gc;
 	struct pneigh_entry	**phash_buckets;
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry	*pde;
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 4b6dd1e..495ab19 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -637,75 +637,74 @@ static void neigh_connect(struct neighbour *neigh)
 		hh->hh_output = neigh->ops->hh_output;
 }
 
-static void neigh_periodic_timer(unsigned long arg)
+static void neigh_periodic_work(struct work_struct *work)
 {
-	struct neigh_table *tbl = (struct neigh_table *)arg;
+	struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work);
 	struct neighbour *n, **np;
-	unsigned long expire, now = jiffies;
+	unsigned int i;
 
 	NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs);
 
-	write_lock(&tbl->lock);
+	write_lock_bh(&tbl->lock);
 
 	/*
 	 *	periodically recompute ReachableTime from random function
 	 */
 
-	if (time_after(now, tbl->last_rand + 300 * HZ)) {
+	if (time_after(jiffies, tbl->last_rand + 300 * HZ)) {
 		struct neigh_parms *p;
-		tbl->last_rand = now;
+		tbl->last_rand = jiffies;
 		for (p = &tbl->parms; p; p = p->next)
 			p->reachable_time =
 				neigh_rand_reach_time(p->base_reachable_time);
 	}
 
-	np = &tbl->hash_buckets[tbl->hash_chain_gc];
-	tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask);
+	for (i = 0 ; i <= tbl->hash_mask; i++) {
+		np = &tbl->hash_buckets[i];
 
-	while ((n = *np) != NULL) {
-		unsigned int state;
+		while ((n = *np) != NULL) {
+			unsigned int state;
 
-		write_lock(&n->lock);
+			write_lock(&n->lock);
 
-		state = n->nud_state;
-		if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
-			write_unlock(&n->lock);
-			goto next_elt;
-		}
+			state = n->nud_state;
+			if (state & (NUD_PERMANENT | NUD_IN_TIMER)) {
+				write_unlock(&n->lock);
+				goto next_elt;
+			}
 
-		if (time_before(n->used, n->confirmed))
-			n->used = n->confirmed;
+			if (time_before(n->used, n->confirmed))
+				n->used = n->confirmed;
 
-		if (atomic_read(&n->refcnt) == 1 &&
-		    (state == NUD_FAILED ||
-		     time_after(now, n->used + n->parms->gc_staletime))) {
-			*np = n->next;
-			n->dead = 1;
+			if (atomic_read(&n->refcnt) == 1 &&
+			    (state == NUD_FAILED ||
+			     time_after(jiffies, n->used + n->parms->gc_staletime))) {
+				*np = n->next;
+				n->dead = 1;
+				write_unlock(&n->lock);
+				neigh_cleanup_and_release(n);
+				continue;
+			}
 			write_unlock(&n->lock);
-			neigh_cleanup_and_release(n);
-			continue;
-		}
-		write_unlock(&n->lock);
 
 next_elt:
-		np = &n->next;
+			np = &n->next;
+		}
+		/*
+		 * It's fine to release lock here, even if hash table
+		 * grows while we are preempted.
+		 */
+		write_unlock_bh(&tbl->lock);
+		cond_resched();
+		write_lock_bh(&tbl->lock);
 	}
-
 	/* Cycle through all hash buckets every base_reachable_time/2 ticks.
 	 * ARP entry timeouts range from 1/2 base_reachable_time to 3/2
 	 * base_reachable_time.
 	 */
-	expire = tbl->parms.base_reachable_time >> 1;
-	expire /= (tbl->hash_mask + 1);
-	if (!expire)
-		expire = 1;
-
-	if (expire>HZ)
-		mod_timer(&tbl->gc_timer, round_jiffies(now + expire));
-	else
-		mod_timer(&tbl->gc_timer, now + expire);
-
-	write_unlock(&tbl->lock);
+	schedule_delayed_work(&tbl->gc_work,
+			      tbl->parms.base_reachable_time >> 1);
+	write_unlock_bh(&tbl->lock);
 }
 
 static __inline__ int neigh_max_probes(struct neighbour *n)
@@ -1370,10 +1369,8 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
 	get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd));
 
 	rwlock_init(&tbl->lock);
-	setup_timer(&tbl->gc_timer, neigh_periodic_timer, (unsigned long)tbl);
-	tbl->gc_timer.expires  = now + 1;
-	add_timer(&tbl->gc_timer);
-
+	INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work);
+	schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time);
 	setup_timer(&tbl->proxy_timer, neigh_proxy_process, (unsigned long)tbl);
 	skb_queue_head_init_class(&tbl->proxy_queue,
 			&neigh_table_proxy_queue_class);
@@ -1408,7 +1405,8 @@ int neigh_table_clear(struct neigh_table *tbl)
 	struct neigh_table **tp;
 
 	/* It is not clean... Fix it to unload IPv6 module safely */
-	del_timer_sync(&tbl->gc_timer);
+	cancel_delayed_work(&tbl->gc_work);
+	flush_scheduled_work();
 	del_timer_sync(&tbl->proxy_timer);
 	pneigh_queue_purge(&tbl->proxy_queue);
 	neigh_ifdown(tbl, NULL);
@@ -1678,7 +1676,6 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
 			.ndtc_last_rand		= jiffies_to_msecs(rand_delta),
 			.ndtc_hash_rnd		= tbl->hash_rnd,
 			.ndtc_hash_mask		= tbl->hash_mask,
-			.ndtc_hash_chain_gc	= tbl->hash_chain_gc,
 			.ndtc_proxy_qlen	= tbl->proxy_queue.qlen,
 		};
 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/