Message-Id: <20070816140914.de24335c.dada1@cosmosbay.com>
Date: Thu, 16 Aug 2007 14:09:13 +0200
From: Eric Dumazet <dada1@...mosbay.com>
To: "netdev@...r.kernel.org" <netdev@...r.kernel.org>
Subject: [RFC] net/core/dst.c : Shouldn't dst_run_gc() be more scalable and friendly?
Hi all
When the periodic route cache flush runs, this machine suffers a lot and eventually triggers the "soft lockup" message.
dst_run_gc() scans a possibly huge list of dst_entries, eventually freeing some of them (less than 1%), but it holds the dst_lock spinlock for the whole scan.
Then it rearms a timer to redo the full scan 1/10 s later. The slowdown can last a minute or so.
Needless to say, the machine is not really usable in the meantime.
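For reference, the 2.6.22 dst_run_gc() being described is roughly the following (simplified; the full code is visible as the removed lines of the patch below, and the elided parts re-queue whatever dst_destroy() hands back and rearm the timer):

    static void dst_run_gc(unsigned long dummy)
    {
        int delayed = 0;
        int work_performed = 0;
        struct dst_entry *dst, **dstp;

        if (!spin_trylock(&dst_lock)) {
            /* someone else holds the lock: retry in 1/10 s */
            mod_timer(&dst_gc_timer, jiffies + HZ/10);
            return;
        }
        del_timer(&dst_gc_timer);

        /* walk the WHOLE garbage list with dst_lock held */
        dstp = &dst_garbage_list;
        while ((dst = *dstp) != NULL) {
            if (atomic_read(&dst->__refcnt)) {
                /* still in use: keep it, look at the next one */
                dstp = &dst->next;
                delayed++;
                continue;
            }
            *dstp = dst->next;
            work_performed = 1;
            dst = dst_destroy(dst);
            /* ... re-queue anything dst_destroy() gives back ... */
        }

        /* ... compute dst_gc_timer_expires, rearm dst_gc_timer
         * (possibly only 1/10 s from now) and drop dst_lock ... */
    }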
I have already looked at the code and I am testing a patch (included in this mail) that limits the number of entries scanned at each dst_run_gc() call and avoids holding dst_lock during the scan. But then I don't know how to address the dst_dev_event() problem, since we must be able to dst_ifdown() all entries attached to one dev.
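For what it's worth, the obvious way to keep dst_dev_event() working with the list split in three (dst_garbage_list, dst_garbage_wrk, dst_garbage_inuse, as in the patch below) would be to walk every fragment under dst_lock, along the lines of the purely illustrative sketch below (dst_ifdown_all() is a made-up helper name, and it assumes the existing dst_ifdown() helper keeps its current signature). It does not answer the real question, because it is unsafe while dst_run_gc() is walking dst_garbage_wrk with dst_lock released:

    /* illustrative only: NOT safe while dst_gc_running is set and
     * dst_garbage_wrk is being walked without dst_lock held
     */
    static void dst_ifdown_all(struct net_device *dev, int unregister)
    {
        struct dst_entry *lists[] = {
            dst_garbage_list, dst_garbage_wrk, dst_garbage_inuse
        };
        struct dst_entry *dst;
        int i;

        spin_lock_bh(&dst_lock);
        for (i = 0; i < ARRAY_SIZE(lists); i++)
            for (dst = lists[i]; dst; dst = dst->next)
                dst_ifdown(dst, dev, unregister);
        spin_unlock_bh(&dst_lock);
    }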
I am wondering how to really solve the problem. Could a workqueue be used here instead of a timer?
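To make the workqueue question concrete, the timer could be replaced by a delayed work that does a bounded chunk of the scan and reschedules itself. A rough sketch of the idea only (dst_gc_task() and dst_gc_work are made-up names, the body would be the quota-limited scan from the patch below, and the places that currently mod_timer(&dst_gc_timer, ...) would call schedule_delayed_work() instead):

    #include <linux/workqueue.h>

    /* hypothetical replacement for dst_gc_timer */
    static void dst_gc_task(struct work_struct *work);
    static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);

    static void dst_gc_task(struct work_struct *work)
    {
        /*
         * Scan a bounded chunk of the garbage list here (same quota
         * idea as in the patch below), taking and dropping dst_lock
         * around each chunk; running in process context also allows
         * cond_resched() between chunks.
         */

        /* come back later while any entries remain */
        if (dst_garbage_list || dst_garbage_wrk || dst_garbage_inuse)
            schedule_delayed_work(&dst_gc_work, dst_gc_timer_expires);
    }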
Aug 16 06:21:37 SRV1 kernel: BUG: soft lockup detected on CPU#0!
Aug 16 06:21:37 SRV1 kernel:
Aug 16 06:21:37 SRV1 kernel: Call Trace:
Aug 16 06:21:37 SRV1 kernel: <IRQ> [<ffffffff802286f0>] wake_up_process+0x10/0x20
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80251e09>] softlockup_tick+0xe9/0x110
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803cd380>] dst_run_gc+0x0/0x140
Aug 16 06:21:37 SRV1 kernel: [<ffffffff802376f3>] run_local_timers+0x13/0x20
Aug 16 06:21:37 SRV1 kernel: [<ffffffff802379c7>] update_process_times+0x57/0x90
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80216034>] smp_local_timer_interrupt+0x34/0x60
Aug 16 06:21:37 SRV1 kernel: [<ffffffff802165cc>] smp_apic_timer_interrupt+0x5c/0x80
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8020a816>] apic_timer_interrupt+0x66/0x70
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803cd3d3>] dst_run_gc+0x53/0x140
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803cd3c6>] dst_run_gc+0x46/0x140
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80237148>] run_timer_softirq+0x148/0x1c0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8023340c>] __do_softirq+0x6c/0xe0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8020ad6c>] call_softirq+0x1c/0x30
Aug 16 06:21:37 SRV1 kernel: <EOI> [<ffffffff8020cb34>] do_softirq+0x34/0x90
Aug 16 06:21:37 SRV1 kernel: [<ffffffff802331cf>] local_bh_enable_ip+0x3f/0x60
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80422913>] _spin_unlock_bh+0x13/0x20
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803dfde8>] rt_garbage_collect+0x1d8/0x320
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803cd4dd>] dst_alloc+0x1d/0xa0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803e1433>] __ip_route_output_key+0x573/0x800
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803c02e2>] sock_common_recvmsg+0x32/0x50
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803e16dc>] ip_route_output_flow+0x1c/0x60
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80400160>] tcp_v4_connect+0x150/0x610
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803ebf07>] inet_bind_bucket_create+0x17/0x60
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8040cd16>] inet_stream_connect+0xa6/0x2c0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80422981>] _spin_lock_bh+0x11/0x30
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803c0bdf>] lock_sock_nested+0xcf/0xe0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80422981>] _spin_lock_bh+0x11/0x30
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803be551>] sys_connect+0x71/0xa0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803eee3f>] tcp_setsockopt+0x1f/0x30
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803c030f>] sock_common_setsockopt+0xf/0x20
Aug 16 06:21:37 SRV1 kernel: [<ffffffff803be4bd>] sys_setsockopt+0x9d/0xc0
Aug 16 06:21:37 SRV1 kernel: [<ffffffff8028881e>] sys_ioctl+0x5e/0x80
Aug 16 06:21:37 SRV1 kernel: [<ffffffff80209c4e>] system_call+0x7e/0x83
Preliminary patch:
--- linux-2.6.22/net/core/dst.c
+++ linux-2.6.22-ed/net/core/dst.c
@@ -28,6 +28,9 @@
* 4) All operations modify state, so a spinlock is used.
*/
static struct dst_entry *dst_garbage_list;
+static struct dst_entry *dst_garbage_wrk;
+static struct dst_entry *dst_garbage_inuse;
+static int dst_gc_running;
#if RT_CACHE_DEBUG >= 2
static atomic_t dst_total = ATOMIC_INIT(0);
#endif
@@ -42,26 +45,42 @@
static void dst_run_gc(unsigned long dummy)
{
- int delayed = 0;
- int work_performed;
- struct dst_entry * dst, **dstp;
-
- if (!spin_trylock(&dst_lock)) {
- mod_timer(&dst_gc_timer, jiffies + HZ/10);
- return;
- }
-
+ int quota = 1000;
+ int work_performed = 0;
+ struct dst_entry *dst, *next;
+ struct dst_entry *first = NULL, *last = NULL;
+#if RT_CACHE_DEBUG >= 2
+ unsigned long t0 = jiffies;
+#endif
+ spin_lock(&dst_lock);
+ if (dst_gc_running)
+ goto out;
del_timer(&dst_gc_timer);
- dstp = &dst_garbage_list;
- work_performed = 0;
- while ((dst = *dstp) != NULL) {
- if (atomic_read(&dst->__refcnt)) {
- dstp = &dst->next;
- delayed++;
+ if (!dst_garbage_wrk) {
+ dst_garbage_wrk = dst_garbage_list;
+ dst_garbage_list = dst_garbage_inuse;
+ dst_garbage_inuse = NULL;
+ }
+ /*
+ * before releasing dst_lock, tell others we are currently running
+	 * so they won't start the timer or mess with dst_garbage_wrk
+ */
+ dst_gc_running = 1;
+ next = dst_garbage_wrk;
+ spin_unlock(&dst_lock);
+
+ while ((dst = next) != NULL) {
+ next = dst->next;
+ if (likely(atomic_read(&dst->__refcnt))) {
+ if (unlikely(last == NULL))
+ last = dst;
+ dst->next = first;
+ first = dst;
+ if (--quota == 0)
+ break;
continue;
}
- *dstp = dst->next;
- work_performed = 1;
+ work_performed++;
dst = dst_destroy(dst);
if (dst) {
@@ -77,16 +96,26 @@
continue;
___dst_free(dst);
- dst->next = *dstp;
- *dstp = dst;
- dstp = &dst->next;
+ dst->next = next;
+ next = dst;
}
}
- if (!dst_garbage_list) {
+ dst_garbage_wrk = next;
+
+ spin_lock(&dst_lock);
+ dst_gc_running = 0;
+ if (last != NULL) {
+ last->next = dst_garbage_inuse;
+ dst_garbage_inuse = first;
+ }
+ /*
+ * If all lists are empty, no need to rearm a timer
+ */
+ if (!dst_garbage_list && !dst_garbage_wrk && !dst_garbage_inuse) {
dst_gc_timer_inc = DST_GC_MAX;
goto out;
}
- if (!work_performed) {
+ if (!work_performed && !dst_garbage_wrk) {
if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
dst_gc_timer_expires = DST_GC_MAX;
dst_gc_timer_inc += DST_GC_INC;
@@ -95,8 +124,12 @@
dst_gc_timer_expires = DST_GC_MIN;
}
#if RT_CACHE_DEBUG >= 2
- printk("dst_total: %d/%d %ld\n",
- atomic_read(&dst_total), delayed, dst_gc_timer_expires);
+ if (net_msg_warn)
+ printk(KERN_DEBUG "dst_run_gc: "
+ "dst_total=%d quota=%d perf=%d expires=%ld elapsed=%lu\n",
+ atomic_read(&dst_total),
+ quota, work_performed,
+ dst_gc_timer_expires, jiffies - t0);
#endif
/* if the next desired timer is more than 4 seconds in the future
* then round the timer to whole seconds
@@ -157,7 +190,7 @@
___dst_free(dst);
dst->next = dst_garbage_list;
dst_garbage_list = dst;
- if (dst_gc_timer_inc > DST_GC_INC) {
+ if (dst_gc_timer_inc > DST_GC_INC && !dst_gc_running) {
dst_gc_timer_inc = DST_GC_INC;
dst_gc_timer_expires = DST_GC_MIN;
mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);