[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20070116025644.GA3954@localhost.localdomain>
Date: Mon, 15 Jan 2007 18:56:44 -0800
From: Ravikiran G Thirumalai <kiran@...lex86.org>
To: Andrew Morton <akpm@...l.org>
Cc: nickpiggin@...oo.com.au, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, ak@...e.de, shai@...lex86.org,
pravin.shelar@...softinc.com
Subject: Re: High lock spin time for zone->lru_lock under extreme conditions
On Sat, Jan 13, 2007 at 01:20:23PM -0800, Andrew Morton wrote:
>
> Seeing the code helps.
But there was a subtle problem with hold time instrumentation here.
The code assumed the critical section exiting through
spin_unlock_irq entered critical section with spin_lock_irq, but that
might not be the case always, and the instrumentation for hold time goes bad
when that happens (as in shrink_inactive_list)
>
> > The
> > instrumentation goes like this:
> >
> > void __lockfunc _spin_lock_irq(spinlock_t *lock)
> > {
> > unsigned long long t1,t2;
> > local_irq_disable();
> > t1 = get_cycles_sync();
> > preempt_disable();
> > spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
> > _raw_spin_lock(lock);
> > t2 = get_cycles_sync();
> > lock->raw_lock.htsc = t2;
> > if (lock->spin_time < (t2 - t1))
> > lock->spin_time = t2 - t1;
> > }
> > ...
> >
> > void __lockfunc _spin_unlock_irq(spinlock_t *lock)
> > {
> > unsigned long long t1 ;
> > spin_release(&lock->dep_map, 1, _RET_IP_);
> > t1 = get_cycles_sync();
> > if (lock->cs_time < (t1 - lock->raw_lock.htsc))
> > lock->cs_time = t1 - lock->raw_lock.htsc;
> > _raw_spin_unlock(lock);
> > local_irq_enable();
> > preempt_enable();
> > }
> >
...
>
> OK, now we need to do a dump_stack() each time we discover a new max hold
> time. That might a bit tricky: the printk code does spinlocking too so
> things could go recursively deadlocky. Maybe make spin_unlock_irq() return
> the hold time then do:
What I found now after fixing the above is that hold time is not bad --
249461 cycles on the 2.6 GHZ opteron with powernow disabled in the BIOS.
The spin time is still in orders of seconds.
Hence this looks like a hardware fairness issue.
Attaching the instrumentation patch with this email FR.
Index: linux-2.6.20-rc4.spin_instru/include/asm-x86_64/spinlock.h
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/include/asm-x86_64/spinlock.h 2007-01-14 22:36:46.694248000 -0800
+++ linux-2.6.20-rc4.spin_instru/include/asm-x86_64/spinlock.h 2007-01-15 15:40:36.554248000 -0800
@@ -6,6 +6,18 @@
#include <asm/page.h>
#include <asm/processor.h>
+/* Like get_cycles, but make sure the CPU is synchronized. */
+static inline unsigned long long get_cycles_sync2(void)
+{
+ unsigned long long ret;
+ unsigned eax;
+ /* Don't do an additional sync on CPUs where we know
+ RDTSC is already synchronous. */
+ alternative_io("cpuid", ASM_NOP2, X86_FEATURE_SYNC_RDTSC,
+ "=a" (eax), "0" (1) : "ebx","ecx","edx","memory");
+ rdtscll(ret);
+ return ret;
+}
/*
* Your basic SMP spinlocks, allowing only a single CPU anywhere
*
@@ -34,6 +46,7 @@ static inline void __raw_spin_lock(raw_s
"jle 3b\n\t"
"jmp 1b\n"
"2:\t" : "=m" (lock->slock) : : "memory");
+ lock->htsc = get_cycles_sync2();
}
/*
@@ -62,6 +75,7 @@ static inline void __raw_spin_lock_flags
"jmp 4b\n"
"5:\n\t"
: "+m" (lock->slock) : "r" ((unsigned)flags) : "memory");
+ lock->htsc = get_cycles_sync2();
}
#endif
@@ -74,11 +88,16 @@ static inline int __raw_spin_trylock(raw
:"=q" (oldval), "=m" (lock->slock)
:"0" (0) : "memory");
+ if (oldval)
+ lock->htsc = get_cycles_sync2();
return oldval > 0;
}
static inline void __raw_spin_unlock(raw_spinlock_t *lock)
{
+ unsigned long long t = get_cycles_sync2();
+ if (lock->hold_time < t - lock->htsc)
+ lock->hold_time = t - lock->htsc;
asm volatile("movl $1,%0" :"=m" (lock->slock) :: "memory");
}
Index: linux-2.6.20-rc4.spin_instru/include/asm-x86_64/spinlock_types.h
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/include/asm-x86_64/spinlock_types.h 2007-01-14 22:36:46.714248000 -0800
+++ linux-2.6.20-rc4.spin_instru/include/asm-x86_64/spinlock_types.h 2007-01-15 14:23:37.204248000 -0800
@@ -7,9 +7,11 @@
typedef struct {
unsigned int slock;
+ unsigned long long hold_time;
+ unsigned long long htsc;
} raw_spinlock_t;
-#define __RAW_SPIN_LOCK_UNLOCKED { 1 }
+#define __RAW_SPIN_LOCK_UNLOCKED { 1,0,0 }
typedef struct {
unsigned int lock;
Index: linux-2.6.20-rc4.spin_instru/include/linux/spinlock.h
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/include/linux/spinlock.h 2007-01-14 22:36:48.464248000 -0800
+++ linux-2.6.20-rc4.spin_instru/include/linux/spinlock.h 2007-01-14 22:41:30.964248000 -0800
@@ -231,8 +231,8 @@ do { \
# define spin_unlock(lock) __raw_spin_unlock(&(lock)->raw_lock)
# define read_unlock(lock) __raw_read_unlock(&(lock)->raw_lock)
# define write_unlock(lock) __raw_write_unlock(&(lock)->raw_lock)
-# define spin_unlock_irq(lock) \
- do { __raw_spin_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
+# define spin_unlock_irq(lock) _spin_unlock_irq(lock)
+/* do { __raw_spin_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)*/
# define read_unlock_irq(lock) \
do { __raw_read_unlock(&(lock)->raw_lock); local_irq_enable(); } while (0)
# define write_unlock_irq(lock) \
Index: linux-2.6.20-rc4.spin_instru/include/linux/spinlock_types.h
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/include/linux/spinlock_types.h 2006-11-29 13:57:37.000000000 -0800
+++ linux-2.6.20-rc4.spin_instru/include/linux/spinlock_types.h 2007-01-15 14:27:50.664248000 -0800
@@ -19,6 +19,7 @@
typedef struct {
raw_spinlock_t raw_lock;
+ unsigned long long spin_time;
#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
unsigned int break_lock;
#endif
@@ -78,7 +79,8 @@ typedef struct {
RW_DEP_MAP_INIT(lockname) }
#else
# define __SPIN_LOCK_UNLOCKED(lockname) \
- (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED, \
+ (spinlock_t) { .raw_lock = __RAW_SPIN_LOCK_UNLOCKED,\
+ .spin_time = 0 \
SPIN_DEP_MAP_INIT(lockname) }
#define __RW_LOCK_UNLOCKED(lockname) \
(rwlock_t) { .raw_lock = __RAW_RW_LOCK_UNLOCKED, \
Index: linux-2.6.20-rc4.spin_instru/kernel/spinlock.c
===================================================================
--- linux-2.6.20-rc4.spin_instru.orig/kernel/spinlock.c 2006-11-29 13:57:37.000000000 -0800
+++ linux-2.6.20-rc4.spin_instru/kernel/spinlock.c 2007-01-15 14:29:47.374248000 -0800
@@ -99,10 +99,15 @@ EXPORT_SYMBOL(_spin_lock_irqsave);
void __lockfunc _spin_lock_irq(spinlock_t *lock)
{
+ unsigned long long t1,t2;
local_irq_disable();
+ t1 = get_cycles_sync();
preempt_disable();
spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
_raw_spin_lock(lock);
+ t2 = get_cycles_sync();
+ if (lock->spin_time < (t2 - t1))
+ lock->spin_time = t2 - t1;
}
EXPORT_SYMBOL(_spin_lock_irq);
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists