lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <2f5ca5e4-e250-a41c-11fb-a7f4ebc7e1c9@google.com>
Date:   Mon, 7 Feb 2022 21:45:36 -0800 (PST)
From:   Hugh Dickins <hughd@...gle.com>
To:     Borislav Petkov <bp@...e.de>
cc:     Peter Zijlstra <peterz@...radead.org>,
        linux-kernel@...r.kernel.org, x86@...nel.org
Subject: x86: should clear_user() have alternatives?

Hi Boris,

I've noticed that clear_user() is slower than it need be:

dd if=/dev/zero of=/dev/null bs=1M count=1M
1099511627776 bytes (1.1 TB) copied, 45.9641 s, 23.9 GB/s
whereas with the hacked patch below
1099511627776 bytes (1.1 TB) copied, 33.4 s, 32.9 GB/s

That was on some Intel machine: IIRC an AMD went faster.

It's because clear_user() lacks alternatives, and uses a
nowadays suboptimal implementation; whereas clear_page()
and copy_user() do support alternatives.

I came across this because reading a sparse file on tmpfs uses
copy_page_to_iter() from the ZERO_PAGE(), and I wanted to change that
to iov_iter_zero(), which sounds obviously faster - but in fact slower.

I realize that dd'ing from /dev/zero to /dev/null, and sparse files on
tmpfs, are not prime candidates for optimization; and I've no idea how
much clear_user() normally gets used for long clears.

If I were capable of compiler asm, with alternatives, and knew at what
length ERMS becomes advantageous when clearing, I would be sending you
a proper patch.  As it is, I'm hoping to tempt someone else to do the
work!  Or reject it as too niche to bother with.

Thanks,
Hugh

Hacked patch against 5.16 (5.17-rc is a little different there):

 arch/x86/lib/copy_user_64.S |   26 ++++++++++++++++++++++++++
 arch/x86/lib/usercopy_64.c  |   36 +-----------------------------------
 2 files changed, 27 insertions(+), 35 deletions(-)

--- 5.16/arch/x86/lib/copy_user_64.S	2022-01-09 14:55:34.000000000 -0800
+++ erms/arch/x86/lib/copy_user_64.S	2022-01-22 16:57:21.156968363 -0800
@@ -392,3 +392,29 @@ SYM_FUNC_START(__copy_user_nocache)
 	_ASM_EXTABLE_CPY(41b, .L_fixup_1b_copy)
 SYM_FUNC_END(__copy_user_nocache)
 EXPORT_SYMBOL(__copy_user_nocache)
+
+/*
+ * Recent CPUs have added enhanced REP MOVSB/STOSB instructions.
+ * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
+ * Assume that's best for __clear_user(), until alternatives are provided
+ * (though would be better to avoid REP STOSB for short clears, if no FSRM).
+ *
+ * Input:
+ * rdi destination
+ * rsi count
+ *
+ * Output:
+ * rax uncopied bytes or 0 if successful.
+ */
+SYM_FUNC_START(__clear_user)
+	ASM_STAC
+	movl %esi,%ecx
+	xorq %rax,%rax
+1:	rep stosb
+2:	movl %ecx,%eax
+	ASM_CLAC
+	ret
+
+	_ASM_EXTABLE_UA(1b, 2b)
+SYM_FUNC_END(__clear_user)
+EXPORT_SYMBOL(__clear_user)
--- 5.16/arch/x86/lib/usercopy_64.c	2020-12-13 14:41:30.000000000 -0800
+++ erms/arch/x86/lib/usercopy_64.c	2022-01-20 18:40:04.125752474 -0800
@@ -13,43 +13,9 @@
 /*
  * Zero Userspace
  */
-
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
-	long __d0;
-	might_fault();
-	/* no memory constraint because it doesn't change any memory gcc knows
-	   about */
-	stac();
-	asm volatile(
-		"	testq  %[size8],%[size8]\n"
-		"	jz     4f\n"
-		"	.align 16\n"
-		"0:	movq $0,(%[dst])\n"
-		"	addq   $8,%[dst]\n"
-		"	decl %%ecx ; jnz   0b\n"
-		"4:	movq  %[size1],%%rcx\n"
-		"	testl %%ecx,%%ecx\n"
-		"	jz     2f\n"
-		"1:	movb   $0,(%[dst])\n"
-		"	incq   %[dst]\n"
-		"	decl %%ecx ; jnz  1b\n"
-		"2:\n"
-		".section .fixup,\"ax\"\n"
-		"3:	lea 0(%[size1],%[size8],8),%[size8]\n"
-		"	jmp 2b\n"
-		".previous\n"
-		_ASM_EXTABLE_UA(0b, 3b)
-		_ASM_EXTABLE_UA(1b, 2b)
-		: [size8] "=&c"(size), [dst] "=&D" (__d0)
-		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
-	clac();
-	return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
 unsigned long clear_user(void __user *to, unsigned long n)
 {
+	might_fault();
 	if (access_ok(to, n))
 		return __clear_user(to, n);
 	return n;

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ