linux-kernel - RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for unaligned copy

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <C10D3FB0CD45994C8A51FEC1227CE22F15C916E769@shsmsx502.ccr.corp.intel.com>
Date:	Fri, 8 Oct 2010 15:42:45 +0800
From:	"Ma, Ling" <ling.ma@...el.com>
To:	"miaox@...fujitsu.com" <miaox@...fujitsu.com>,
	Ingo Molnar <mingo@...hat.com>,
	Andi Kleen <andi@...stfloor.org>,
	"H. Peter Anvin" <hpa@...or.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	"Zhao, Yakui" <yakui.zhao@...el.com>
CC:	Linux Kernel <linux-kernel@...r.kernel.org>
Subject: RE: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy()
 for unaligned copy

Could you please give us the full addresses used for each comparison result? We will run some tests on my machine.
For unaligned cases, older CPUs cross cache lines and are slowed down by the resulting loads and stores, but on NHM (Nehalem) there is no need to care about it.
By the way, in 64-bit kernel mode, our accesses should mostly be 8-byte aligned.

Thanks
Ling 

> -----Original Message-----
> From: Miao Xie [mailto:miaox@...fujitsu.com]
> Sent: Friday, October 08, 2010 3:28 PM
> To: Ingo Molnar; Andi Kleen; Ma, Ling; H. Peter Anvin; Thomas Gleixner; Zhao,
> Yakui
> Cc: Linux Kernel
> Subject: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for
> unaligned copy
> 
> memcpy on x86_64 hasn't been optimized for unaligned copies as it has on other
> architectures; this patch fixes this problem.
> 
> I have tested this patch with my benchmark tool (doing 500 bytes memory copy
> for 5,000,000 times) with various alignments and buffer sizes on my Core2
> box.
> 
> Len	Src/Dst	Old memcpy	New memcpy
> 	align
> ---	-------	-------------	-------------
> 1	0/0	0s 47015us	0s 28265us
> 1	0/4	0s 28201us	0s 28199us
> 1	4/0	0s 28200us	0s 28199us
> 1	4/4	0s 28199us	0s 28206us
> 7	0/0	0s 24441us	0s 24438us
> 7	0/4	0s 24439us	0s 24438us
> 7	4/0	0s 24439us	0s 24438us
> 7	4/4	0s 24439us	0s 24439us
> 8	0/0	0s 20699us	0s 20687us
> 8	0/4	0s 20689us	0s 20901us
> 8	4/0	0s 20692us	0s 20679us
> 8	4/4	0s 20679us	0s 20679us
> 16	0/0	0s 18807us	0s 18802us
> 16	0/4	0s 26319us	0s 18800us
> 16	4/0	0s 18800us	0s 18806us
> 16	4/4	0s 26317us	0s 18803us
> 32	0/0	0s 35728us	0s 18800us
> 32	0/4	0s 35716us	0s 18800us
> 32	4/0	0s 35717us	0s 18800us
> 32	4/4	0s 35724us	0s 18803us
> 48	0/0	0s 26897us	0s 30080us
> 48	0/4	0s 33837us	0s 33838us
> 48	4/0	0s 27600us	0s 30079us
> 48	4/4	0s 30087us	0s 33854us
> 64	0/0	0s 41369us	0s 45115us
> 64	0/4	0s 62042us	0s 65800us
> 64	4/0	0s 56400us	0s 58278us
> 64	4/4	0s 84596us	0s 84606us
> 80	0/0	0s 35877us	0s 37611us
> 80	0/4	0s 77083us	0s 56404us
> 80	4/0	0s 52652us	0s 55611us
> 80	4/4	0s 75200us	0s 78968us
> 128	0/0	0s 52642us	0s 56403us
> 128	0/4	0s 95883us	0s 95891us
> 128	4/0	0s 114683us	0s 108511us
> 128	4/4	0s 144780us	0s 110927us
> 256	0/0	0s 80832us	0s 86489us
> 256	0/4	0s 178586us	0s 163562us
> 256	4/0	0s 208670us	0s 181719us
> 256	4/4	0s 270705us	0s 148525us
> 512	0/0	0s 156049us	0s 148348us
> 512	0/4	0s 313933us	0s 298908us
> 512	4/0	0s 411671us	0s 329025us
> 512	4/4	0s 516971us	0s 208746us
> 1024	0/0	0s 297067us	0s 274019us
> 1024	0/4	0s 584703us	0s 569604us
> 1024	4/0	0s 818104us	0s 616419us
> 1024	4/4	1s 22839us	0s 328953us
> 2048	0/0	0s 577077us	0s 524148us
> 2048	0/4	1s 125953us	1s 111258us
> 2048	4/0	1s 894000us	1s 202724us
> 2048	4/4	2s 331807us	0s 822437us
> 4096	0/0	1s 25881us	1s 34128us
> 4096	0/4	2s 619273us	2s 606489us
> 4096	4/0	3s 553989us	2s 390272us
> 4096	4/4	4s 737789us	1s 433213us
> 
> Signed-off-by: Miao Xie <miaox@...fujitsu.com>
> ---
>  arch/x86/lib/memcpy_64.S |  135
> +++++++++++++++++++++++++++++++++++++++++++++-
>  1 files changed, 134 insertions(+), 1 deletions(-)
> 
> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
> index 75ef61e..b0224f8 100644
> --- a/arch/x86/lib/memcpy_64.S
> +++ b/arch/x86/lib/memcpy_64.S
> @@ -46,9 +46,39 @@ ENTRY(memcpy)
>  	 * Use 32bit CMP here to avoid long NOP padding.
>  	 */
>  	cmp  $0x20, %edx
> -	jb .Lhandle_tail
> +	jbe .Lhandle_tail
> 
>  	/*
> +	 * The unaligned-copy code only pays off for large copies (>= 100 bytes),
> +	 * so if the size is small, we needn't check whether dst and src are
> +	 * aligned.
> +	 */
> +	cmp $100, %edx
> +	jb .Lboth_aligned
> +
> +	/*
> +	 * unaligned access always leads to bad performance, so in order to
> +	 * avoid unaligned access, we align the address(both src and dest)
> +	 * first, and then copy from an aligned src to an aligned dst by using
> +	 * shifts.
> +	 * But we found if src is aligned, although dest is unaligned, the
> +	 * performance of generic memory copy (That is reading data aligned
> +	 * from the source and writing data unaligned to the dest) is better
> +	 * than the one that uses shifts to avoid unaligned access.
> +	 * So if src is aligned, we needn't check dest is aligned or not, just
> +	 * goto .Lboth_aligned
> +	 */
> +	test $7, %esi		/* src align check */
> +	jz .Lboth_aligned
> +
> +	/* if dest and src both are unaligned, goto unaligned copy */
> +	test $7, %edi
> +	jnz .Ldst_unaligned
> +
> +	jmp .Lsrc_unaligned_dst_aligned
> +
> +.Lboth_aligned:
> +	/*
>  	 * We check whether memory false dependece could occur,
>  	 * then jump to corresponding copy mode.
>  	 */
> @@ -166,6 +196,109 @@ ENTRY(memcpy)
> 
>  .Lend:
>  	retq
> +
> +	.p2align 4
> +.Ldst_unaligned:
> +	movq %rdi, %rcx
> +	andq $7, %rcx		/* Align the destination */
> +	negq %rcx
> +	andq $7, %rcx
> +	subq %rcx, %rdx
> +
> +	/* tune dst address */
> +	movq (%rsi), %r8
> +	movq %r8, (%rdi)
> +	addq %rcx, %rdi
> +	addq %rcx, %rsi
> +
> +	test $7, %esi		/* src align check */
> +	jz .Lboth_aligned
> +
> +	.p2align 4
> +.Lsrc_unaligned_dst_aligned:
> +	push %rbx
> +	push %r12
> +	push %r13
> +	push %r14
> +	push %r15
> +	/*
> +	 * Calculate the shift counts needed to realign each word that is
> +	 * read from the rounded-down (8-byte aligned) src address.
> +	 */
> +	movq %rsi, %r14
> +	andq $7, %r14
> +	shlq $3, %r14
> +
> +	movq $64, %r15
> +	subq %r14, %r15
> +
> +	andq $-8, %rsi		/* src aligned */
> +	movq 0*8(%rsi), %r8
> +
> +	movq %rdx, %rbx
> +	shrq $5, %rbx
> +	jz .Lsrc_unaligned_less32
> +
> +	/*
> +	 * %r8 : store src[0]
> +	 * %r9 : store src[1]
> +	 * %r10: store src[2]
> +	 * %r11: store src[3]
> +	 * %r12: store src[4]
> +	 * %r13: store the tmp data
> +	 */
> +	.p2align 4
> +.Lsrc_unaligned_loop32:
> +	movq 1*8(%rsi), %r9
> +	movq 2*8(%rsi), %r10
> +	movq 3*8(%rsi), %r11
> +	movq 4*8(%rsi), %r12
> +
> +	movq %r9, %r13
> +	movb %r14b, %cl
> +	shrq %cl, %r8
> +	shrq %cl, %r13
> +	movb %r15b, %cl
> +	shlq  %cl, %r9
> +	orq %r8, %r9
> +	movq %r10, %r8
> +	shlq  %cl, %r10
> +	orq %r13, %r10
> +
> +	movq %r11, %r13
> +	movb %r14b, %cl
> +	shrq %cl, %r8
> +	shrq %cl, %r13
> +	movb %r15b, %cl
> +	shlq  %cl, %r11
> +	orq %r8, %r11
> +	movq %r12, %r8
> +	shlq  %cl, %r12
> +	orq %r13, %r12
> +
> +	movq %r9, 0*8(%rdi)
> +	movq %r10, 1*8(%rdi)
> +	movq %r11, 2*8(%rdi)
> +	movq %r12, 3*8(%rdi)
> +
> +	leaq 4*8(%rdi), %rdi
> +	leaq 4*8(%rsi), %rsi
> +	decq %rbx
> +	jnz .Lsrc_unaligned_loop32
> +
> +	.p2align 4
> +.Lsrc_unaligned_less32:
> +	shrq $3, %r14
> +	addq %r14, %rsi
> +	pop %r15
> +	pop %r14
> +	pop %r13
> +	pop %r12
> +	pop %rbx
> +	andq $31, %rdx
> +	jnz .Lhandle_tail
> +	retq
> +
>  	CFI_ENDPROC
>  ENDPROC(memcpy)
>  ENDPROC(__memcpy)
> --
> 1.7.0.1
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/