Date:	Wed, 30 Jun 2010 16:59:18 +0800
From:	"Ma, Ling" <ling.ma@...el.com>
To:	"mingo@...e.hu" <mingo@...e.hu>
CC:	"hpa@...or.com" <hpa@...or.com>,
	"tglx@...utronix.de" <tglx@...utronix.de>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: RE: [PATCH RFC] [X86] Optimize memcpy by avoiding memory false
 dependence

Hi Ingo

We collected some comparison results with the attached micro-benchmark on
Core i7 (built with gcc -O2 -o memcpy-kernel memcpy-kernel.c); a rough
sketch of such a timing loop is included after the results.

LAT: Len  127, alignment  4/16: improvement: 2X
LAT: Len  127, alignment  0/16: improvement: 2X
LAT: Len 1024, alignment  4/16: improvement: 1.5X
LAT: Len 1024, alignment  0/ 0: no change
LAT: Len 4096, alignment  4/16: improvement: 1.6X
LAT: Len 4096, alignment  0/ 8: improvement: 1.37X
LAT: Len 8192, alignment 16/ 0: no change
LAT: Len 8192, alignment  0/16: improvement: 1.45X
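
For reference, here is a minimal sketch of the kind of timing loop such a
benchmark runs. The attached memcpy-kernel.c is authoritative; the buffer
layout, iteration count, function names and RDTSC timing below are
simplified assumptions:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define BUF_SIZE   (1 << 16)
#define ITERATIONS 100000

static unsigned char pool[2 * BUF_SIZE + 64];

static inline uint64_t rdtsc(void)
{
	unsigned int lo, hi;
	__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
	return ((uint64_t)hi << 32) | lo;
}

/*
 * Time ITERATIONS back-to-back copies of len bytes with the requested
 * source/destination misalignments (0..15) and print cycles per copy.
 */
static void lat_test(size_t len, int src_align, int dst_align)
{
	unsigned char *src = pool + src_align;
	unsigned char *dst = pool + BUF_SIZE + 32 + dst_align;
	uint64_t start, end;
	int i;

	memset(pool, 1, sizeof(pool));
	start = rdtsc();
	for (i = 0; i < ITERATIONS; i++) {
		memcpy(dst, src, len);
		/* keep the copy from being optimized away */
		__asm__ __volatile__("" : : "r"(dst) : "memory");
	}
	end = rdtsc();
	printf("LAT: Len %4zu, alignment %2d/%2d: %8.1f cycles/copy\n",
	       len, src_align, dst_align,
	       (double)(end - start) / ITERATIONS);
}

int main(void)
{
	lat_test(127,  4, 16);
	lat_test(127,  0, 16);
	lat_test(1024, 4, 16);
	lat_test(4096, 4, 16);
	lat_test(8192, 0, 16);
	return 0;
}

Built with the same gcc -O2 command line, this prints cycles per copy for
a few of the Len/alignment combinations above.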

Any comments from you?

Thanks
Ling

> -----Original Message-----
> From: Ma, Ling
> Sent: Tuesday, June 29, 2010 3:24 AM
> To: mingo@...e.hu
> Cc: hpa@...or.com; tglx@...utronix.de; linux-kernel@...r.kernel.org; Ma,
> Ling
> Subject: [PATCH RFC] [X86] Optimize memcpy by avoiding memory false
> dependence
> 
> From: Ma Ling <ling.ma@...el.com>
> 
> All read operations after the allocation stage can run speculatively,
> while all write operations run in program order; a read may run ahead
> of an older write when their addresses differ, otherwise it waits
> until the write commits. However, the CPU does not check every address
> bit, so a read can fail to recognize that an address is different even
> when the two addresses lie in different pages. For example, if %rsi is
> 0xf004 and %rdi is 0xe008, the following sequence generates a large
> latency penalty:
> 1. movq (%rsi),	%rax
> 2. movq %rax,	(%rdi)
> 3. movq 8(%rsi), %rax
> 4. movq %rax,	8(%rdi)
> 
> If %rsi and %rdi really were in the same memory page, there would be
> a true read-after-write dependence: instruction 2 writes at offset
> 0x008 and instruction 3 reads at offset 0x00c, so the two accesses
> partially overlap. In fact they are in different pages and there is no
> real dependence, but because the CPU does not check every address bit
> it may assume they are in the same page. Instruction 3 then has to
> wait for instruction 2 to drain its data from the write buffer into
> the cache and load it back from there; the time the read spends
> waiting is about the same as an mfence instruction. We can avoid this
> by reordering the operations as follows:
> 
> 1. movq 8(%rsi), %rax
> 2. movq %rax,	8(%rdi)
> 3. movq (%rsi),	%rax
> 4. movq %rax,	(%rdi)
> 
> Now instruction 3 reads at offset 0x004 while instruction 2 writes at
> offset 0x010, so there is no apparent dependence. With this reordering
> we gain a 1.83x speedup on Core2 compared with the original
> instruction sequence.
> 
> In this patch we first handle small sizes (less than 0x20 bytes) and
> then jump to the appropriate copy mode. Based on our micro-benchmark
> we got up to a 2X improvement for small copies of 1 to 127 bytes, and
> up to a 1.5X improvement for 1024 bytes on Core i7. (We used our own
> micro-benchmark and will do further testing according to your
> requirements.)
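> 
> Roughly, in C, the flow of the new routine looks like the sketch
> below. This is only an illustration of the strategy (memcpy_sketch is
> a made-up name); the assembly in the patch is the actual
> implementation, and the low-bits test here merely stands in for the
> cmp %dil, %sil check in the assembly.
> 
> #include <stdint.h>
> #include <string.h>
> 
> /* Illustrative C model of the patched copy flow. */
> void *memcpy_sketch(void *dest, const void *src, size_t len)
> {
> 	unsigned char *d = dest;
> 	const unsigned char *s = src;
> 	size_t n = len;
> 
> 	if (n >= 0x20) {
> 		/*
> 		 * If the source's low address bits trail the destination's,
> 		 * a forward copy could make later loads appear to partially
> 		 * overlap earlier stores, so walk backward from the end.
> 		 */
> 		if (((uintptr_t)s & 0xff) < ((uintptr_t)d & 0xff)) {
> 			s += n;
> 			d += n;
> 			while (n >= 0x20) {
> 				uint64_t r[4];	/* one 4x8-byte block */
> 				memcpy(r, s - 32, 32);
> 				memcpy(d - 32, r, 32);
> 				s -= 32;
> 				d -= 32;
> 				n -= 32;
> 			}
> 			s -= n;	/* back to the uncopied head */
> 			d -= n;
> 		} else {
> 			while (n >= 0x20) {
> 				uint64_t r[4];	/* one 4x8-byte block */
> 				memcpy(r, s, 32);
> 				memcpy(d, r, 32);
> 				s += 32;
> 				d += 32;
> 				n -= 32;
> 			}
> 		}
> 	}
> 
> 	/* Tail: overlapping head/tail accesses for each size class. */
> 	if (n >= 16) {
> 		uint64_t a, b, c, e;
> 		memcpy(&a, s, 8);
> 		memcpy(&b, s + 8, 8);
> 		memcpy(&c, s + n - 16, 8);
> 		memcpy(&e, s + n - 8, 8);
> 		memcpy(d, &a, 8);
> 		memcpy(d + 8, &b, 8);
> 		memcpy(d + n - 16, &c, 8);
> 		memcpy(d + n - 8, &e, 8);
> 	} else if (n >= 8) {
> 		uint64_t a, b;
> 		memcpy(&a, s, 8);
> 		memcpy(&b, s + n - 8, 8);
> 		memcpy(d, &a, 8);
> 		memcpy(d + n - 8, &b, 8);
> 	} else if (n >= 4) {
> 		uint32_t a, b;
> 		memcpy(&a, s, 4);
> 		memcpy(&b, s + n - 4, 4);
> 		memcpy(d, &a, 4);
> 		memcpy(d + n - 4, &b, 4);
> 	} else {
> 		while (n--)
> 			*d++ = *s++;
> 	}
> 	return dest;
> }
> 
> The tail branches correspond to .Lhandle_tail, .Lless_16bytes,
> .Lless_8bytes and .Lless_3bytes in the assembly.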
> 
> Thanks
> Ling
> 
> ---
>  arch/x86/lib/memcpy_64.S |  158 ++++++++++++++++++++++++++++++----------------
>  1 files changed, 103 insertions(+), 55 deletions(-)
> 
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index f82e884..5902438 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -40,84 +40,132 @@
>  ENTRY(__memcpy)
>  ENTRY(memcpy)
>  	CFI_STARTPROC
> +	movq %rdi, %rax
> 
>  	/*
> -	 * Put the number of full 64-byte blocks into %ecx.
> -	 * Tail portion is handled at the end:
> +	 * Use 32bit CMP here to avoid long NOP padding.
>  	 */
> -	movq %rdi, %rax
> -	movl %edx, %ecx
> -	shrl   $6, %ecx
> -	jz .Lhandle_tail
> +	cmp  $0x20, %edx
> +	jb .Lhandle_tail
> 
> -	.p2align 4
> -.Lloop_64:
>  	/*
> -	 * We decrement the loop index here - and the zero-flag is
> -	 * checked at the end of the loop (instructions inbetween do
> -	 * not change the zero flag):
> +	 * We check whether memory false dependence could occur,
> +	 * then jump to corresponding copy mode.
>  	 */
> -	decl %ecx
> +	cmp  %dil, %sil
> +	jl .Lcopy_backward
> +	subl $0x20, %edx
> +.Lcopy_forward_loop:
> +	subq $0x20,	%rdx
> 
>  	/*
> -	 * Move in blocks of 4x16 bytes:
> +	 * Move in blocks of 4x8 bytes:
>  	 */
> -	movq 0*8(%rsi),		%r11
> -	movq 1*8(%rsi),		%r8
> -	movq %r11,		0*8(%rdi)
> -	movq %r8,		1*8(%rdi)
> -
> -	movq 2*8(%rsi),		%r9
> -	movq 3*8(%rsi),		%r10
> -	movq %r9,		2*8(%rdi)
> -	movq %r10,		3*8(%rdi)
> -
> -	movq 4*8(%rsi),		%r11
> -	movq 5*8(%rsi),		%r8
> -	movq %r11,		4*8(%rdi)
> -	movq %r8,		5*8(%rdi)
> -
> -	movq 6*8(%rsi),		%r9
> -	movq 7*8(%rsi),		%r10
> -	movq %r9,		6*8(%rdi)
> -	movq %r10,		7*8(%rdi)
> -
> -	leaq 64(%rsi), %rsi
> -	leaq 64(%rdi), %rdi
> -
> -	jnz  .Lloop_64
> +	movq 0*8(%rsi),	%r8
> +	movq 1*8(%rsi),	%r9
> +	movq 2*8(%rsi),	%r10
> +	movq 3*8(%rsi),	%r11
> +	leaq 4*8(%rsi),	%rsi
> +
> +	movq %r8,	0*8(%rdi)
> +	movq %r9,	1*8(%rdi)
> +	movq %r10,	2*8(%rdi)
> +	movq %r11,	3*8(%rdi)
> +	leaq 4*8(%rdi),	%rdi
> +	jae  .Lcopy_forward_loop
> +	addq $0x20,	%rdx
> +	jmp  .Lhandle_tail
> +
> +.Lcopy_backward:
> +	/*
> +	 * Calculate copy position to tail.
> +	 */
> +	addq %rdx,	%rsi
> +	addq %rdx,	%rdi
> +	subq $0x20,	%rdx
> +	/*
> +	 * At most 3 ALU operations in one cycle,
> +	 * so append NOPs in the same 16-byte chunk.
> +	 */
> +	.p2align 4
> +.Lcopy_backward_loop:
> +	subq $0x20,	%rdx
> +	movq -1*8(%rsi),	%r8
> +	movq -2*8(%rsi),	%r9
> +	movq -3*8(%rsi),	%r10
> +	movq -4*8(%rsi),	%r11
> +	leaq -4*8(%rsi),	%rsi
> +	movq %r8,		-1*8(%rdi)
> +	movq %r9,		-2*8(%rdi)
> +	movq %r10,		-3*8(%rdi)
> +	movq %r11,		-4*8(%rdi)
> +	leaq -4*8(%rdi),	%rdi
> +	jae  .Lcopy_backward_loop
> 
> +	/*
> +	 * Calculate copy position to head.
> +	 */
> +	addq $0x20,	%rdx
> +	subq %rdx,	%rsi
> +	subq %rdx,	%rdi
>  .Lhandle_tail:
> -	movl %edx, %ecx
> -	andl  $63, %ecx
> -	shrl   $3, %ecx
> -	jz   .Lhandle_7
> +	cmpq $16,	%rdx
> +	jb   .Lless_16bytes
> 
> +	/*
> +	 * Move data from 16 bytes to 31 bytes.
> +	 */
> +	movq 0*8(%rsi), %r8
> +	movq 1*8(%rsi),	%r9
> +	movq -2*8(%rsi, %rdx),	%r10
> +	movq -1*8(%rsi, %rdx),	%r11
> +	movq %r8,	0*8(%rdi)
> +	movq %r9,	1*8(%rdi)
> +	movq %r10,	-2*8(%rdi, %rdx)
> +	movq %r11,	-1*8(%rdi, %rdx)
> +	retq
>  	.p2align 4
> -.Lloop_8:
> -	decl %ecx
> -	movq (%rsi),		%r8
> -	movq %r8,		(%rdi)
> -	leaq 8(%rdi),		%rdi
> -	leaq 8(%rsi),		%rsi
> -	jnz  .Lloop_8
> -
> -.Lhandle_7:
> -	movl %edx, %ecx
> -	andl $7, %ecx
> -	jz .Lend
> +.Lless_16bytes:
> +	cmpq $8,	%rdx
> +	jb   .Lless_8bytes
> +	/*
> +	 * Move data from 8 bytes to 15 bytes.
> +	 */
> +	movq 0*8(%rsi),	%r8
> +	movq -1*8(%rsi, %rdx),	%r9
> +	movq %r8,	0*8(%rdi)
> +	movq %r9,	-1*8(%rdi, %rdx)
> +	retq
> +	.p2align 4
> +.Lless_8bytes:
> +	cmpq $4,	%rdx
> +	jb   .Lless_3bytes
> 
> +	/*
> +	 * Move data from 4 bytes to 7 bytes.
> +	 */
> +	movl (%rsi), %ecx
> +	movl -4(%rsi, %rdx), %r8d
> +	movl %ecx, (%rdi)
> +	movl %r8d, -4(%rdi, %rdx)
> +	retq
>  	.p2align 4
> +.Lless_3bytes:
> + 	cmpl $0, %edx
> +	je .Lend
> +	/*
> +	 * Move data from 1 bytes to 3 bytes.
> +	 */
>  .Lloop_1:
>  	movb (%rsi), %r8b
>  	movb %r8b, (%rdi)
>  	incq %rdi
>  	incq %rsi
> -	decl %ecx
> +	decl %edx
>  	jnz .Lloop_1
> 
>  .Lend:
> -	ret
> +	retq
>  	CFI_ENDPROC
>  ENDPROC(memcpy)
>  ENDPROC(__memcpy)
> --
> 1.6.5.2


[Attachment: memcpy-kernel.c (text/plain, 7630 bytes)]
