lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <20251124213227.123779-4-chang.seok.bae@intel.com>
Date: Mon, 24 Nov 2025 21:32:26 +0000
From: "Chang S. Bae" <chang.seok.bae@...el.com>
To: linux-kernel@...r.kernel.org
Cc: x86@...nel.org,
	tglx@...utronix.de,
	mingo@...hat.com,
	bp@...en8.de,
	dave.hansen@...ux.intel.com,
	chang.seok.bae@...el.com
Subject: [RFC PATCH 3/3] x86/lib: Use EGPRs in 64-bit checksum copy loop

The current checksum copy routine already uses all legacy GPRs for loop
unrolling. APX introduces additional GPRs. Use them to extend the
unrolling further.

Signed-off-by: Chang S. Bae <chang.seok.bae@...el.com>
---
Caveat: This is primarily an illustrative example. I have not fully
audited all call sites or large-buffer use cases (yet). The goal is to
demonstrate the potential of the extended register set.
---
 arch/x86/Kconfig                   |  6 +++
 arch/x86/Kconfig.assembler         |  6 +++
 arch/x86/include/asm/checksum_64.h | 24 +++++++++++-
 arch/x86/lib/csum-copy_64.S        | 59 ++++++++++++++++++++++++++++--
 4 files changed, 90 insertions(+), 5 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fa3b616af03a..e6d969376bf2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1890,6 +1890,12 @@ config X86_USER_SHADOW_STACK
 
 	  If unsure, say N.
 
+config X86_APX
+	bool "In-kernel APX use"
+	depends on AS_APX
+	help
+	  Experimental: enable in-kernel use of APX
+
 config INTEL_TDX_HOST
 	bool "Intel Trust Domain Extensions (TDX) host support"
 	depends on CPU_SUP_INTEL
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index b1c59fb0a4c9..d208ac540609 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -5,3 +5,9 @@ config AS_WRUSS
 	def_bool $(as-instr64,wrussq %rax$(comma)(%rbx))
 	help
 	  Supported by binutils >= 2.31 and LLVM integrated assembler
+
+config AS_APX
+	def_bool $(as-instr64,mov %r16$(comma)%r17)
+	help
+	  Assembler supports extended registers.
+	  Supported by binutils >= 2.43 (LLVM version TBD)
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 4d4a47a3a8ab..4cbd9e71f8c3 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -10,6 +10,7 @@
 
 #include <linux/compiler.h>
 #include <asm/byteorder.h>
+#include <asm/fpu/api.h>
 
 /**
  * csum_fold - Fold and invert a 32bit checksum.
@@ -129,7 +130,28 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
 extern __wsum csum_partial(const void *buff, int len, __wsum sum);
 
 /* Do not call this directly. Use the wrappers below */
-extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
+extern __visible __wsum csum_partial_copy(const void *src, void *dst, int len);
+#ifndef CONFIG_X86_APX
+static inline __wsum csum_partial_copy_generic(const void *src, void *dst, int len)
+{
+	return csum_partial_copy(src, dst, len);
+}
+#else
+extern __visible __wsum csum_partial_copy_apx(const void *src, void *dst, int len);
+static inline __wsum csum_partial_copy_generic(const void *src, void *dst, int len)
+{
+	__wsum sum;
+
+	if (!cpu_has_xfeatures(XFEATURE_MASK_APX, NULL) || !irq_fpu_usable())
+		return csum_partial_copy(src, dst, len);
+
+	kernel_fpu_begin();
+	sum = csum_partial_copy_apx(src, dst, len);
+	kernel_fpu_end();
+
+	return sum;
+}
+#endif
 
 extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len);
 extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len);
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 5526bdfac041..dc99227af94f 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -119,11 +119,54 @@
 
 	shrl  $6, LEN64B
 	jz	.Lhandle_tail\@     /* < 64 */
+.if USE_APX
+	cmpl  $3, LEN64B
+	jb	.Lloop_64\@         /* < 192 */
+	clc
+	.p2align 4
+.Lloop_192\@:
+	.set  TMP9, %r16
+	.set  TMP10, %r17
+	.set  TMP11, %r18
+	.set  TMP12, %r19
+	.set  TMP13, %r20
+	.set  TMP14, %r21
+	.set  TMP15, %r22
+	.set  TMP16, %r23
+	.set  TMP17, %r24
+	.set  TMP18, %r25
+	.set  TMP19, %r26
+	.set  TMP20, %r27
+	.set  TMP21, %r28
+	.set  TMP22, %r29
+	.set  TMP23, %r30
+	.set  TMP24, %r31
+
+	.p2align 4
+	loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
+	loadregs 8, INP, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
+	loadregs 16, INP, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
+
+	sumregs SUM, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
+	sumregs SUM, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
+	sumregs SUM, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
+
+	storeregs 0, OUTP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
+	storeregs 8, OUTP, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
+	storeregs 16, OUTP, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
+
+	incr INP, 24
+	incr OUTP, 24
 
+	sub  $3, LEN64B
+	cmp  $3, LEN64B
+	jnb	.Lloop_192\@
+.else
 	clc
 
 	.p2align 4
-.Lloop\@:
+.endif
+.Lloop_64\@:
 	loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
 
 	prefetch
@@ -137,7 +180,7 @@
 	incr INP, 8
 	incr OUTP, 8
 
-	jnz	.Lloop\@
+	jnz	.Lloop_64\@
 
 	adcq  ZERO, SUM
 
@@ -260,6 +303,14 @@
 	xorl %eax, %eax
 	restore_regs_and_ret
 
-SYM_FUNC_START(csum_partial_copy_generic)
+.set	USE_APX, 0
+SYM_FUNC_START(csum_partial_copy)
 	_csum_partial_copy
-SYM_FUNC_END(csum_partial_copy_generic)
+SYM_FUNC_END(csum_partial_copy)
+
+#ifdef CONFIG_X86_APX
+.set	USE_APX, 1
+SYM_FUNC_START(csum_partial_copy_apx)
+	_csum_partial_copy
+SYM_FUNC_END(csum_partial_copy_apx)
+#endif
-- 
2.51.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ