linux-kernel - [PATCH 3/3, resend] x86: fix write lock scalability 64-bit issue

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <4E258E0D020000780004E3F0@nat28.tlf.novell.com>
Date:	Tue, 19 Jul 2011 13:00:45 +0100
From:	"Jan Beulich" <JBeulich@...ell.com>
To:	<mingo@...e.hu>, <tglx@...utronix.de>, <hpa@...or.com>
Cc:	<linux-kernel@...r.kernel.org>
Subject: [PATCH 3/3, resend] x86: fix write lock scalability 64-bit
	 issue

With the write lock path simply subtracting RW_LOCK_BIAS there is, on
large systems, the theoretical possibility of overflowing the 32-bit
value that was used so far (namely if 128 or more CPUs manage to do the
subtraction, but don't get to do the inverse addition in the failure
path quickly enough).

A first measure is to modify RW_LOCK_BIAS itself - with the new value
chosen, it is good for up to 2048 CPUs each allowed to nest over 2048
times on the read path without causing an issue. Quite possibly it
would even be sufficient to adjust the bias a little further, assuming
that allowing for significantly less nesting would suffice.

However, as the original value chosen allowed for even more nesting
levels, to support more than 2048 CPUs (possible currently only for
64-bit kernels) the lock itself gets widened to 64 bits.

Signed-off-by: Jan Beulich <jbeulich@...ell.com>

---
 arch/x86/include/asm/asm.h            |    2 +
 arch/x86/include/asm/rwlock.h         |   43 +++++++++++++++++++++++++++++++++-
 arch/x86/include/asm/spinlock.h       |   37 +++++++++++++++++------------
 arch/x86/include/asm/spinlock_types.h |    6 ----
 arch/x86/lib/rwlock.S                 |   12 ++++-----
 arch/x86/lib/thunk_64.S               |    1 
 6 files changed, 73 insertions(+), 28 deletions(-)

--- 3.0-rc7-x86-rw.orig/arch/x86/include/asm/asm.h
+++ 3.0-rc7-x86-rw/arch/x86/include/asm/asm.h
@@ -3,9 +3,11 @@
 
 #ifdef __ASSEMBLY__
 # define __ASM_FORM(x)	x
+# define __ASM_FORM_COMMA(x) x,
 # define __ASM_EX_SEC	.section __ex_table, "a"
 #else
 # define __ASM_FORM(x)	" " #x " "
+# define __ASM_FORM_COMMA(x) " " #x ","
 # define __ASM_EX_SEC	" .section __ex_table,\"a\"\n"
 #endif
 
--- 3.0-rc7-x86-rw.orig/arch/x86/include/asm/rwlock.h
+++ 3.0-rc7-x86-rw/arch/x86/include/asm/rwlock.h
@@ -1,7 +1,48 @@
 #ifndef _ASM_X86_RWLOCK_H
 #define _ASM_X86_RWLOCK_H
 
-#define RW_LOCK_BIAS		 0x01000000
+#include <asm/asm.h>
+
+#if CONFIG_NR_CPUS <= 2048
+
+#ifndef __ASSEMBLY__
+typedef union {
+	s32 lock;
+	s32 write;
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS		0x00100000
+#define READ_LOCK_SIZE(insn)	__ASM_FORM(insn##l)
+#define READ_LOCK_ATOMIC(n)	atomic_##n
+#define WRITE_LOCK_ADD(n)	__ASM_FORM_COMMA(addl n)
+#define WRITE_LOCK_SUB(n)	__ASM_FORM_COMMA(subl n)
+#define WRITE_LOCK_CMP		RW_LOCK_BIAS
+
+#else /* CONFIG_NR_CPUS > 2048 */
+
+#include <linux/const.h>
+
+#ifndef __ASSEMBLY__
+typedef union {
+	s64 lock;
+	struct {
+		u32 read;
+		s32 write;
+	};
+} arch_rwlock_t;
+#endif
+
+#define RW_LOCK_BIAS		(_AC(1,L) << 32)
+#define READ_LOCK_SIZE(insn)	__ASM_FORM(insn##q)
+#define READ_LOCK_ATOMIC(n)	atomic64_##n
+#define WRITE_LOCK_ADD(n)	__ASM_FORM(incl)
+#define WRITE_LOCK_SUB(n)	__ASM_FORM(decl)
+#define WRITE_LOCK_CMP		1
+
+#endif /* CONFIG_NR_CPUS */
+
+#define __ARCH_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
 
 /* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
 
--- 3.0-rc7-x86-rw.orig/arch/x86/include/asm/spinlock.h
+++ 3.0-rc7-x86-rw/arch/x86/include/asm/spinlock.h
@@ -2,7 +2,6 @@
 #define _ASM_X86_SPINLOCK_H
 
 #include <asm/atomic.h>
-#include <asm/rwlock.h>
 #include <asm/page.h>
 #include <asm/processor.h>
 #include <linux/compiler.h>
@@ -234,7 +233,7 @@ static inline void arch_spin_unlock_wait
  */
 static inline int arch_read_can_lock(arch_rwlock_t *lock)
 {
-	return (int)(lock)->lock > 0;
+	return lock->lock > 0;
 }
 
 /**
@@ -243,12 +242,12 @@ static inline int arch_read_can_lock(arc
  */
 static inline int arch_write_can_lock(arch_rwlock_t *lock)
 {
-	return (lock)->lock == RW_LOCK_BIAS;
+	return lock->write == WRITE_LOCK_CMP;
 }
 
 static inline void arch_read_lock(arch_rwlock_t *rw)
 {
-	asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
+	asm volatile(LOCK_PREFIX READ_LOCK_SIZE(dec) " (%0)\n\t"
 		     "jns 1f\n"
 		     "call __read_lock_failed\n\t"
 		     "1:\n"
@@ -257,47 +256,55 @@ static inline void arch_read_lock(arch_r
 
 static inline void arch_write_lock(arch_rwlock_t *rw)
 {
-	asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
+	asm volatile(LOCK_PREFIX WRITE_LOCK_SUB(%1) "(%0)\n\t"
 		     "jz 1f\n"
 		     "call __write_lock_failed\n\t"
 		     "1:\n"
-		     ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
+		     ::LOCK_PTR_REG (&rw->write), "i" (RW_LOCK_BIAS)
+		     : "memory");
 }
 
 static inline int arch_read_trylock(arch_rwlock_t *lock)
 {
-	atomic_t *count = (atomic_t *)lock;
+	READ_LOCK_ATOMIC(t) *count = (READ_LOCK_ATOMIC(t) *)lock;
 
-	if (atomic_dec_return(count) >= 0)
+	if (READ_LOCK_ATOMIC(dec_return)(count) >= 0)
 		return 1;
-	atomic_inc(count);
+	READ_LOCK_ATOMIC(inc)(count);
 	return 0;
 }
 
 static inline int arch_write_trylock(arch_rwlock_t *lock)
 {
-	atomic_t *count = (atomic_t *)lock;
+	atomic_t *count = (atomic_t *)&lock->write;
 
-	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
+	if (atomic_sub_and_test(WRITE_LOCK_CMP, count))
 		return 1;
-	atomic_add(RW_LOCK_BIAS, count);
+	atomic_add(WRITE_LOCK_CMP, count);
 	return 0;
 }
 
 static inline void arch_read_unlock(arch_rwlock_t *rw)
 {
-	asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
+	asm volatile(LOCK_PREFIX READ_LOCK_SIZE(inc) " %0"
+		     :"+m" (rw->lock) : : "memory");
 }
 
 static inline void arch_write_unlock(arch_rwlock_t *rw)
 {
-	asm volatile(LOCK_PREFIX "addl %1, %0"
-		     : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
+	asm volatile(LOCK_PREFIX WRITE_LOCK_ADD(%1) "%0"
+		     : "+m" (rw->write) : "i" (RW_LOCK_BIAS) : "memory");
 }
 
 #define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
 #define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
 
+#undef READ_LOCK_SIZE
+#undef READ_LOCK_ATOMIC
+#undef WRITE_LOCK_ADD
+#undef WRITE_LOCK_SUB
+#undef WRITE_LOCK_CMP
+
 #define arch_spin_relax(lock)	cpu_relax()
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()
--- 3.0-rc7-x86-rw.orig/arch/x86/include/asm/spinlock_types.h
+++ 3.0-rc7-x86-rw/arch/x86/include/asm/spinlock_types.h
@@ -11,10 +11,6 @@ typedef struct arch_spinlock {
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
 
-typedef struct {
-	unsigned int lock;
-} arch_rwlock_t;
-
-#define __ARCH_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
+#include <asm/rwlock.h>
 
 #endif /* _ASM_X86_SPINLOCK_TYPES_H */
--- 3.0-rc7-x86-rw.orig/arch/x86/lib/rwlock.S
+++ 3.0-rc7-x86-rw/arch/x86/lib/rwlock.S
@@ -15,12 +15,12 @@ ENTRY(__write_lock_failed)
 	CFI_STARTPROC
 	FRAME
 0:	LOCK_PREFIX
-	addl	$RW_LOCK_BIAS, (%__lock_ptr)
+	WRITE_LOCK_ADD($RW_LOCK_BIAS) (%__lock_ptr)
 1:	rep; nop
-	cmpl	$RW_LOCK_BIAS, (%__lock_ptr)
+	cmpl	$WRITE_LOCK_CMP, (%__lock_ptr)
 	jne	1b
 	LOCK_PREFIX
-	subl	$RW_LOCK_BIAS, (%__lock_ptr)
+	WRITE_LOCK_SUB($RW_LOCK_BIAS) (%__lock_ptr)
 	jnz	0b
 	ENDFRAME
 	ret
@@ -31,12 +31,12 @@ ENTRY(__read_lock_failed)
 	CFI_STARTPROC
 	FRAME
 0:	LOCK_PREFIX
-	incl	(%__lock_ptr)
+	READ_LOCK_SIZE(inc) (%__lock_ptr)
 1:	rep; nop
-	cmpl	$1, (%__lock_ptr)
+	READ_LOCK_SIZE(cmp) $1, (%__lock_ptr)
 	js	1b
 	LOCK_PREFIX
-	decl	(%__lock_ptr)
+	READ_LOCK_SIZE(dec) (%__lock_ptr)
 	js	0b
 	ENDFRAME
 	ret
--- 3.0-rc7-x86-rw.orig/arch/x86/lib/thunk_64.S
+++ 3.0-rc7-x86-rw/arch/x86/lib/thunk_64.S
@@ -9,7 +9,6 @@
 	#include <linux/linkage.h>
 	#include <asm/dwarf2.h>
 	#include <asm/calling.h>			
-	#include <asm/rwlock.h>
 		
 	/* rdi:	arg1 ... normal C conventions. rax is saved/restored. */ 	
 	.macro thunk name,func


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/