Message-ID: <20251125103750.2d2d7bbd@pumpkin>
Date: Tue, 25 Nov 2025 10:37:50 +0000
From: david laight <david.laight@...box.com>
To: "Chang S. Bae" <chang.seok.bae@...el.com>
Cc: linux-kernel@...r.kernel.org, x86@...nel.org, tglx@...utronix.de,
mingo@...hat.com, bp@...en8.de, dave.hansen@...ux.intel.com
Subject: Re: [RFC PATCH 3/3] x86/lib: Use EGPRs in 64-bit checksum copy loop
On Mon, 24 Nov 2025 21:32:26 +0000
"Chang S. Bae" <chang.seok.bae@...el.com> wrote:
> The current checksum copy routine already uses all legacy GPRs for loop
> unrolling. APX introduces additional GPRs. Use them to extend the
> unrolling further.

I very much doubt that unrolling this loop has any performance gain.
IIRC you can get a loop with just two 'memory read' and two 'adcq'
instructions in it to execute an 'adcq' every clock.
It ought to be possible to do the same even with the extra 'memory write'.
(You can execute a '2 clock loop', but not a '1 clock loop'.)

Whatever you do, the 'loop control' instructions are independent of the
copy and adcq ones and will run in parallel.
For the fastest loop, change the memory accesses to be negative
offsets from the end of the buffer.

Indeed, I think the Intel cpus (I've not done any tests on amd ones)
end up queuing up the adcq and writes (from many loop iterations)
while waiting for the reads to complete.
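
Something like this minimal (untested) sketch of the loop shape I mean,
with the tail, prefetch and fault handling all elided:

	# Index from the end of both buffers so one register is both the
	# array index and the loop counter:
	#   rsi = src + len, rdi = dst + len, rcx = -(len/8), sum in rdx
	clc
1:	movq	(%rsi,%rcx,8), %rax
	adcq	%rax, %rdx
	movq	%rax, (%rdi,%rcx,8)
	incq	%rcx			# inc sets ZF but leaves CF alone,
	jnz	1b			# so the adcq carry chain survives
	adcq	$0, %rdx		# fold in the final carry

Unroll the read/adcq/write group by two and that is the '2 clock loop'
above, retiring an adcq every clock; the stores and the loop control are
off the adcq dependency chain and just run in parallel with it.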

But is this function even worth having at all?
The fast checksum routine does 1.5 to 2 'adcq' per clock.
On modern cpus 'rep movsb' will (usually) copy memory at (IIRC) 32
bytes/clock (IIRC 64 on intel if the destination is aligned).
Put together that is faster than the 1 adcq per clock maximum
of the 'copy and checksum' loop.
(Roughly: even at 12 bytes/clock for the checksum pass and 32 bytes/clock
for the copy, the two passes combine to ~8.7 bytes/clock, above the
8 bytes/clock ceiling of the fused loop.)
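
i.e. (hand-waving, untested, and the user-copy variants would still need
the uaccess fault handling) something like:

	# Two-pass sketch: plain 'rep movsb' copy, then checksum the copy.
	#   in: rdi = src, rsi = dst, edx = len (csum_partial_copy order)
	movq	%rsi, %r8		# remember dst
	xchgq	%rdi, %rsi		# rep movsb copies [rsi] -> [rdi]
	movl	%edx, %ecx
	rep movsb
	movq	%r8, %rdi		# then csum_partial(dst, len, 0)
	movl	%edx, %esi
	xorl	%edx, %edx
	# ... and fall into the existing csum_partial() adcq loop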

The only issue will be buffers over 2k, which are likely to generate
extra reads into a 4k L1 data cache.

But it is worse than that.
This code (or something very similar) gets used to checksum data
during copy_to/from_user for sockets.
This goes back a long way and I suspect the 'killer app' was nfsd
running over UDP (with 8k+ UDP datagrams).

Modern NICs all (well, all anyone cares about) do IP checksum offload.
So you don't need to checksum on send() - I'm sure that path is still
enabled even though you pretty much never want it.
The checksum on recv() can only happen for UDP, but it massively
complicates the code paths and will normally not be needed.

David
>
> Signed-off-by: Chang S. Bae <chang.seok.bae@...el.com>
> ---
> Caveat: This is primarily an illustrative example. I have not fully
> audited all call sites or large-buffer use cases (yet). The goal is to
> demonstrate the potential of the extended register set.
> ---
> arch/x86/Kconfig | 6 +++
> arch/x86/Kconfig.assembler | 6 +++
> arch/x86/include/asm/checksum_64.h | 24 +++++++++++-
> arch/x86/lib/csum-copy_64.S | 59 ++++++++++++++++++++++++++++--
> 4 files changed, 90 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index fa3b616af03a..e6d969376bf2 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -1890,6 +1890,12 @@ config X86_USER_SHADOW_STACK
>
> If unsure, say N.
>
> +config X86_APX
> + bool "In-kernel APX use"
> + depends on AS_APX
> + help
> + Experimental: enable in-kernel use of APX
> +
> config INTEL_TDX_HOST
> bool "Intel Trust Domain Extensions (TDX) host support"
> depends on CPU_SUP_INTEL
> diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
> index b1c59fb0a4c9..d208ac540609 100644
> --- a/arch/x86/Kconfig.assembler
> +++ b/arch/x86/Kconfig.assembler
> @@ -5,3 +5,9 @@ config AS_WRUSS
> def_bool $(as-instr64,wrussq %rax$(comma)(%rbx))
> help
> Supported by binutils >= 2.31 and LLVM integrated assembler
> +
> +config AS_APX
> + def_bool $(as-instr64,mov %r16$(comma)%r17)
> + help
> + Assembler supports the extended registers.
> + Supported by binutils >= 2.43 (LLVM version TBD)
> diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
> index 4d4a47a3a8ab..4cbd9e71f8c3 100644
> --- a/arch/x86/include/asm/checksum_64.h
> +++ b/arch/x86/include/asm/checksum_64.h
> @@ -10,6 +10,7 @@
>
> #include <linux/compiler.h>
> #include <asm/byteorder.h>
> +#include <asm/fpu/api.h>
>
> /**
> * csum_fold - Fold and invert a 32bit checksum.
> @@ -129,7 +130,28 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
> extern __wsum csum_partial(const void *buff, int len, __wsum sum);
>
> /* Do not call this directly. Use the wrappers below */
> -extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len);
> +extern __visible __wsum csum_partial_copy(const void *src, void *dst, int len);
> +#ifndef CONFIG_X86_APX
> +static inline __wsum csum_partial_copy_generic(const void *src, void *dst, int len)
> +{
> + return csum_partial_copy(src, dst, len);
> +}
> +#else
> +extern __visible __wsum csum_partial_copy_apx(const void *src, void *dst, int len);
> +static inline __wsum csum_partial_copy_generic(const void *src, void *dst, int len)
> +{
> + __wsum sum;
> +
> + if (!cpu_has_xfeatures(XFEATURE_MASK_APX, NULL) || !irq_fpu_usable())
> + return csum_partial_copy(src, dst, len);
> +
> + kernel_fpu_begin();
> + sum = csum_partial_copy_apx(src, dst, len);
> + kernel_fpu_end();
> +
> + return sum;
> +}
> +#endif
>
> extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len);
> extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len);
> diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
> index 5526bdfac041..dc99227af94f 100644
> --- a/arch/x86/lib/csum-copy_64.S
> +++ b/arch/x86/lib/csum-copy_64.S
> @@ -119,11 +119,54 @@
>
> shrl $6, LEN64B
> jz .Lhandle_tail\@ /* < 64 */
> +.if USE_APX
> + cmpl $3, LEN64B
> + jb .Lloop_64\@ /* < 192 */
> + clc
> + .p2align 4
> +.Lloop_192\@:
> + .set TMP9, %r16
> + .set TMP10, %r17
> + .set TMP11, %r18
> + .set TMP12, %r19
> + .set TMP13, %r20
> + .set TMP14, %r21
> + .set TMP15, %r22
> + .set TMP16, %r23
> + .set TMP17, %r24
> + .set TMP18, %r25
> + .set TMP19, %r26
> + .set TMP20, %r27
> + .set TMP21, %r28
> + .set TMP22, %r29
> + .set TMP23, %r30
> + .set TMP24, %r31
> +
> + .p2align 4
> + loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
> + loadregs 8, INP, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
> + loadregs 16, INP, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
> +
> + sumregs SUM, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
> + sumregs SUM, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
> + sumregs SUM, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
> +
> + storeregs 0, OUTP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
> + storeregs 8, OUTP, TMP9, TMP10, TMP11, TMP12, TMP13, TMP14, TMP15, TMP16
> + storeregs 16, OUTP, TMP17, TMP18, TMP19, TMP20, TMP21, TMP22, TMP23, TMP24
> +
> + incr INP, 24
> + incr OUTP, 24
>
> + sub $3, LEN64B
> + cmp $3, LEN64B
> + jnb .Lloop_192\@
> +.else
> clc
>
> .p2align 4
> -.Lloop\@:
> +.endif
> +.Lloop_64\@:
> loadregs 0, INP, TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8
>
> prefetch
> @@ -137,7 +180,7 @@
> incr INP, 8
> incr OUTP, 8
>
> - jnz .Lloop\@
> + jnz .Lloop_64\@
>
> adcq ZERO, SUM
>
> @@ -260,6 +303,14 @@
> xorl %eax, %eax
> restore_regs_and_ret
>
> -SYM_FUNC_START(csum_partial_copy_generic)
> +.set USE_APX, 0
> +SYM_FUNC_START(csum_partial_copy)
> _csum_partial_copy
> -SYM_FUNC_END(csum_partial_copy_generic)
> +SYM_FUNC_END(csum_partial_copy)
> +
> +#ifdef CONFIG_X86_APX
> +.set USE_APX, 1
> +SYM_FUNC_START(csum_partial_copy_apx)
> + _csum_partial_copy
> +SYM_FUNC_END(csum_partial_copy_apx)
> +#endif