lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <20181228153759.3132-1-ubizjak@gmail.com>
Date:   Fri, 28 Dec 2018 16:37:59 +0100
From:   Uros Bizjak <ubizjak@...il.com>
To:     linux-kernel@...r.kernel.org
Cc:     Uros Bizjak <ubizjak@...il.com>, x86@...nel.org
Subject: [PATCH] x86/asm: Use CC_SET/CC_OUT in percpu_cmpxchg16b_double

Use CC_SET(z)/CC_OUT(z) instead of an explicit setz instruction.
With these two defines, a compiler that supports flag output operands
from inline assembly generates a single conditional jump instruction,
e.g.:

	[lea    (%rdi),%rsi
	 callq  this_cpu_cmpxchg16b_emu]
	-- or --
	[cmpxchg16b %gs:(%rdi)]
	jne    199764 <kmem_cache_alloc+0x44>

instead of

	[lea    (%rdi),%rsi
	 callq  this_cpu_cmpxchg16b_emu]
	-- or --
	[cmpxchg16b %gs:(%rdi)]
	sete   %cl
	test   %cl,%cl
	je     19ae04 <kmem_cache_alloc+0x44>

The complication with percpu_cmpxchg16b_double is that the definition
defaults to a call to the this_cpu_cmpxchg16b_emu library function, which
(depending on the X86_FEATURE_CX16 flag) is later patched with the real
cmpxchg16b instruction. To solve this complication, the patch changes the
this_cpu_cmpxchg16b_emu library function to return the result in the ZF flag
of the %rflags register instead of the %al register.  Please also note that
instead of a popf instruction (which restores the flags register to a
previously saved state), the patched function uses sti followed by a nop,
which ends the inhibition of interrupts early.

The patch also introduces the alternative_io_tail definition. This
definition can take a tail instruction that is common to all alternatives.
By using this definition, it is possible to remove setz from the cmpxchg16b
alternatives stored in the .altinstr_replacement section, thus saving a few
bytes in the binary.

Signed-off-by: Uros Bizjak <ubizjak@...il.com>
Cc: x86@...nel.org
---
 arch/x86/include/asm/alternative.h |  6 ++++++
 arch/x86/include/asm/percpu.h      | 14 ++++++++------
 arch/x86/lib/cmpxchg16b_emu.S      | 16 ++++++----------
 3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0660e14690c8..49b29990950a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -205,6 +205,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
 		: output : "i" (0), ## input)
 
+/* Like alternative_io, but with a common tail instruction. */
+#define alternative_io_tail(oldinstr, newinstr, feature, tail,		\
+			    output, input...)				\
+	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) tail	\
+		: output : "i" (0), ## input)
+
 /* Like alternative_io, but for replacing a direct call with another one. */
 #define alternative_call(oldfunc, newfunc, feature, output, input...)	\
 	asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 1a19d11cfbbd..9cf2e78eb4b3 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -495,12 +495,14 @@ do {									\
 	bool __ret;							\
 	typeof(pcp1) __o1 = (o1), __n1 = (n1);				\
 	typeof(pcp2) __o2 = (o2), __n2 = (n2);				\
-	alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
-		       "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t",	\
-		       X86_FEATURE_CX16,				\
-		       ASM_OUTPUT2("=a" (__ret), "+m" (pcp1),		\
-				   "+m" (pcp2), "+d" (__o2)),		\
-		       "b" (__n1), "c" (__n2), "a" (__o1) : "rsi");	\
+	alternative_io_tail("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu", \
+			    "cmpxchg16b "__percpu_arg(1),		\
+			    X86_FEATURE_CX16,				\
+			    CC_SET(z),					\
+			    ASM_OUTPUT2(CC_OUT(z) (__ret),		\
+					"+m" (pcp1), "+m" (pcp2),	\
+					"+a" (__o1), "+d" (__o2)),	\
+			    "b" (__n1), "c" (__n2) : "rsi");		\
 	__ret;								\
 })
 
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 9b330242e740..d8c1ae48e0d9 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -17,20 +17,20 @@
  * %rdx : high 64 bits of old value
  * %rbx : low 64 bits of new value
  * %rcx : high 64 bits of new value
- * %al  : Operation successful
+ *
+ * Outputs:
+ * %rflags.zf: set if the destination operand and %rdx:%rax are equal
  */
 ENTRY(this_cpu_cmpxchg16b_emu)
 
 #
-# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
-# via the ZF.  Caller will access %al to get result.
+# Emulate 'cmpxchg16b %gs:(%rsi)'
 #
 # Note that this is only useful for a cpuops operation.  Meaning that we
 # do *not* have a fully atomic operation but just an operation that is
 # *atomic* on a single cpu (as provided by the this_cpu_xx class of
 # macros).
 #
-	pushfq
 	cli
 
 	cmpq PER_CPU_VAR((%rsi)), %rax
@@ -41,13 +41,9 @@ ENTRY(this_cpu_cmpxchg16b_emu)
 	movq %rbx, PER_CPU_VAR((%rsi))
 	movq %rcx, PER_CPU_VAR(8(%rsi))
 
-	popfq
-	mov $1, %al
-	ret
-
 .Lnot_same:
-	popfq
-	xor %al,%al
+	sti
+	nop
 	ret
 
 ENDPROC(this_cpu_cmpxchg16b_emu)
-- 
2.20.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ