Message-ID: <20251124213227.123779-4-chang.seok.bae@intel.com>
Date: Mon, 24 Nov 2025 21:32:26 +0000
From: "Chang S. Bae" <chang.seok.bae@...el.com>
To: linux-kernel@...r.kernel.org
Cc: x86@...nel.org,
tglx@...utronix.de,
mingo@...hat.com,
bp@...en8.de,
dave.hansen@...ux.intel.com,
chang.seok.bae@...el.com
Subject: [RFC PATCH 3/3] x86/lib: Use EGPRs in 64-bit checksum copy loop
The current checksum copy routine already uses all legacy GPRs for loop
unrolling, processing 64 bytes per iteration. APX adds 16 more GPRs
(R16-R31); use them to extend the unrolling to 192 bytes per iteration.
Signed-off-by: Chang S. Bae <chang.seok.bae@...el.com>
---
Caveat: This is primarily an illustrative example. I have not fully
audited all call sites or large-buffer use cases (yet). The goal is to
demonstrate the potential of the extended register set.
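
For illustration only (not meant for the final commit), here is a plain-C
model of the per-block work the new .Lloop_192 path performs: copy 24
quadwords (192 bytes) from src to dst while folding them into a 64-bit
end-around-carry sum. The helper name and the 128-bit accumulator are
made up for the sketch; the assembly uses adcq chains over TMP1..TMP24
instead.

/* Illustrative sketch, not part of the patch; helper name is made up. */
#include <stdint.h>
#include <stddef.h>

static uint64_t csum_copy_block_192(const uint64_t *src, uint64_t *dst,
				    uint64_t sum)
{
	unsigned __int128 acc = sum;
	uint64_t lo, hi;
	size_t i;

	for (i = 0; i < 24; i++) {	/* 24 quadwords == 192 bytes */
		acc += src[i];		/* adcq into SUM in the .S file */
		dst[i] = src[i];	/* storeregs */
	}

	/* fold the carries back in, like the trailing adcq ZERO, SUM */
	lo = (uint64_t)acc;
	hi = (uint64_t)(acc >> 64);
	lo += hi;
	if (lo < hi)			/* end-around carry */
		lo++;

	return lo;
}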
---
arch/x86/Kconfig | 6 +++
arch/x86/Kconfig.assembler | 6 +++
arch/x86/include/asm/checksum_64.h | 24 +++++++++++-
arch/x86/lib/csum-copy_64.S | 59 ++++++++++++++++++++++++++++--
4 files changed, 90 insertions(+), 5 deletions(-)
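
Also for reference, a rough sketch of the kind of self-test that could
compare the two entry points (again, not part of this patch). The entry
point names come from the patch; the harness details (buffer size,
iteration count, initcall level) are made up, and it assumes
CONFIG_X86_APX=y on an APX-capable CPU.

/* Self-test sketch only; harness details are illustrative assumptions. */
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/random.h>
#include <linux/string.h>
#include <linux/bug.h>
#include <asm/checksum.h>
#include <asm/fpu/api.h>

static int __init csum_apx_selftest(void)
{
	u8 *src, *dst1, *dst2;
	int i;

	if (!cpu_has_xfeatures(XFEATURE_MASK_APX, NULL))
		return 0;

	src  = kmalloc(PAGE_SIZE, GFP_KERNEL);
	dst1 = kmalloc(PAGE_SIZE, GFP_KERNEL);
	dst2 = kmalloc(PAGE_SIZE, GFP_KERNEL);
	if (!src || !dst1 || !dst2)
		goto out;

	for (i = 0; i < 128; i++) {
		int len = get_random_u32_below(PAGE_SIZE) + 1;
		__wsum s1, s2;

		get_random_bytes(src, len);

		/* scalar path */
		s1 = csum_partial_copy(src, dst1, len);

		/* APX path, called directly under kernel_fpu protection */
		kernel_fpu_begin();
		s2 = csum_partial_copy_apx(src, dst2, len);
		kernel_fpu_end();

		WARN_ON((__force u32)s1 != (__force u32)s2 ||
			memcmp(dst1, dst2, len));
	}
out:
	kfree(src);
	kfree(dst1);
	kfree(dst2);
	return 0;
}
late_initcall(csum_apx_selftest);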
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fa3b616af03a..e6d969376bf2 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1890,6 +1890,12 @@ config X86_USER_SHADOW_STACK
If unsure, say N.
+config X86_APX
+ bool "In-kernel APX use"
+ depends on AS_APX
+ help
+ Experimental: enable in-kernel use of Intel Advanced Performance Extensions (APX)
+
config INTEL_TDX_HOST
bool "Intel Trust Domain Extensions (TDX) host support"
depends on CPU_SUP_INTEL
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index b1c59fb0a4c9..d208ac540609 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -5,3 +5,9 @@ config AS_WRUSS
def_bool $(as-instr64,wrussq %rax$(comma)(%rbx))
help
Supported by binutils >= 2.31 and LLVM integrated assembler
+
+config AS_APX
+ def_bool $(as-instr64,mov %r16$(comma)%r17)
+ help
+ Assembler supports the APX extended registers.
+ Supported by binutils >= 2.43 (LLVM version TBD)
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
index 4d4a47a3a8ab..4cbd9e71f8c3 100644
--- a/arch/x86/include/asm/checksum_64.h
+++ b/arch/x86/include/asm/checksum_64.h
@@ -10,6 +10,7 @@
#include <linux/compiler.h>
#include <asm/byteorder.h>
+#include <asm/fpu/api.h>
/**
* csum_fold - Fold and invert a 32bit checksum.
@@ -129,7 +130,28 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
extern __wsum csum_partial(const void *buff, int len, __wsum sum);
/* Do not call this directly. Use the wrappers below */
-extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
+extern __visible __wsum csum_partial_copy(const void *src, void *dst, int len);
+#ifndef CONFIG_X86_APX
+static inline __wsum csum_partial_copy_generic(const void *src, void *dst, int len)
+{
+ return csum_partial_copy(src, dst, len);
+}
+#else
+extern __visible __wsum csum_partial_copy_apx(const void *src, void *dst, int len);
+static inline __wsum csum_partial_copy_generic(const void *src, void *dst, int len)
+{
+ __wsum sum;
+
+ if (!cpu_has_xfeatures(XFEATURE_MASK_APX, NULL) || !irq_fpu_usable())
+ return csum_partial_copy(src, dst, len);
+
+ kernel_fpu_begin();
+ sum = csum_partial_copy_apx(src, dst, len);
+ kernel_fpu_end();
+
+ return sum;
+}
+#endif
extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len);
extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len);
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 5526bdfac041..dc99227af94f 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -119,11 +119,54 @@
shrl $6, LEN64B
jz .Lhandle_tail\@ /* < 64 */
+.if USE_APX
+ cmpl $3, LEN64B
+ jb .Lloop_64\@ /* < 192 */
+ clc
+ .p2align 4
+.Lloop_192\@:
+ .set TMP9, %r16
+ .set TMP10, %r17
+ .set TMP11, %r18
+ .set TMP12, %r19
+ .set TMP13, %r20
+ .set TMP14, %r21
+ .set TMP15, %r22
+ .set TMP16, %r23
+ .set TMP17, %r24
+ .set TMP18, %r25
+ .set TMP19, %r26
+ .set TMP20, %r27
+ .set TMP21, %r28
+ .set TMP22, %r29
+ .set TMP23, %r30
+ .set TMP24, %r31
+
+ .p2align 4
+ loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
+ loadregs 8, INP, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
+ loadregs 16, INP, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
+
+ sumregs SUM, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
+ sumregs SUM, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
+ sumregs SUM, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
+
+ storeregs 0, OUTP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
+ storeregs 8, OUTP, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
+ storeregs 16, OUTP, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
+
+ incr INP, 24
+ incr OUTP, 24
+ sub $3, LEN64B
+ cmp $3, LEN64B
+ jnb .Lloop_192\@
+.else
clc
.p2align 4
-.Lloop\@:
+.endif
+.Lloop_64\@:
loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
prefetch
@@ -137,7 +180,7 @@
incr INP, 8
incr OUTP, 8
- jnz .Lloop\@
+ jnz .Lloop_64\@
adcq ZERO, SUM
@@ -260,6 +303,14 @@
xorl %eax, %eax
restore_regs_and_ret
-SYM_FUNC_START(csum_partial_copy_generic)
+.set USE_APX, 0
+SYM_FUNC_START(csum_partial_copy)
_csum_partial_copy
-SYM_FUNC_END(csum_partial_copy_generic)
+SYM_FUNC_END(csum_partial_copy)
+
+#ifdef CONFIG_X86_APX
+.set USE_APX, 1
+SYM_FUNC_START(csum_partial_copy_apx)
+ _csum_partial_copy
+SYM_FUNC_END(csum_partial_copy_apx)
+#endif
--
2.51.0