Date:	Fri, 08 Oct 2010 17:02:05 +0800
From:	Miao Xie <miaox@...fujitsu.com>
To:	"Ma, Ling" <ling.ma@...el.com>
CC:	Ingo Molnar <mingo@...hat.com>, Andi Kleen <andi@...stfloor.org>,
	"H. Peter Anvin" <hpa@...or.com>,
	Thomas Gleixner <tglx@...utronix.de>,
	"Zhao, Yakui" <yakui.zhao@...el.com>,
	Linux Kernel <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy()
 for unaligned copy

On Fri, 8 Oct 2010 15:42:45 +0800, Ma, Ling wrote:
> Could you please give us the full addresses for each comparison result? We will run some tests on my machine.
> In the unaligned cases, older CPUs cross cache lines and are slowed down by the loads and stores, but for NHM (Nehalem) there is no need to care about it.
> By the way, in 64-bit kernel mode our accesses should mostly be 8-byte aligned.
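
As a rough illustration of the cache-line point above (assuming 64-byte
cache lines; the helper below is made up for this sketch, not kernel code):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* An access of `width` bytes starting at `addr` is split across two
 * cache lines exactly when it straddles a 64-byte boundary. */
static inline bool crosses_cache_line(uintptr_t addr, size_t width)
{
	return (addr % 64) + width > 64;
}

An 8-byte access at an 8-byte-aligned address can never straddle a line,
which is the point about kernel accesses being mostly 8-byte aligned; the
split cases are what slow older CPUs down, while Nehalem is said above to
handle them well.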

Do you need my benchmark tool? I think it would be helpful for your tests.

Thanks
Miao

> Thanks
> Ling
>
>> -----Original Message-----
>> From: Miao Xie [mailto:miaox@...fujitsu.com]
>> Sent: Friday, October 08, 2010 3:28 PM
>> To: Ingo Molnar; Andi Kleen; Ma, Ling; H. Peter Anvin; Thomas Gleixner; Zhao,
>> Yakui
>> Cc: Linux Kernel
>> Subject: [PATCH V2 -tip] lib,x86_64: improve the performance of memcpy() for
>> unaligned copy
>>
>> Unlike other architectures, memcpy on x86_64 has not been optimized for
>> unaligned copies; this patch fixes that.
>>
>> I have tested this patch with my benchmark tool (doing a 500-byte memory
>> copy 5,000,000 times) at various alignments and buffer sizes on my Core2
>> box.
>>
>> Len	Src/Dst	Old memcpy	New memcpy
>> 	align
>> ---	-------	-------------	-------------
>> 1	0/0	0s 47015us	0s 28265us
>> 1	0/4	0s 28201us	0s 28199us
>> 1	4/0	0s 28200us	0s 28199us
>> 1	4/4	0s 28199us	0s 28206us
>> 7	0/0	0s 24441us	0s 24438us
>> 7	0/4	0s 24439us	0s 24438us
>> 7	4/0	0s 24439us	0s 24438us
>> 7	4/4	0s 24439us	0s 24439us
>> 8	0/0	0s 20699us	0s 20687us
>> 8	0/4	0s 20689us	0s 20901us
>> 8	4/0	0s 20692us	0s 20679us
>> 8	4/4	0s 20679us	0s 20679us
>> 16	0/0	0s 18807us	0s 18802us
>> 16	0/4	0s 26319us	0s 18800us
>> 16	4/0	0s 18800us	0s 18806us
>> 16	4/4	0s 26317us	0s 18803us
>> 32	0/0	0s 35728us	0s 18800us
>> 32	0/4	0s 35716us	0s 18800us
>> 32	4/0	0s 35717us	0s 18800us
>> 32	4/4	0s 35724us	0s 18803us
>> 48	0/0	0s 26897us	0s 30080us
>> 48	0/4	0s 33837us	0s 33838us
>> 48	4/0	0s 27600us	0s 30079us
>> 48	4/4	0s 30087us	0s 33854us
>> 64	0/0	0s 41369us	0s 45115us
>> 64	0/4	0s 62042us	0s 65800us
>> 64	4/0	0s 56400us	0s 58278us
>> 64	4/4	0s 84596us	0s 84606us
>> 80	0/0	0s 35877us	0s 37611us
>> 80	0/4	0s 77083us	0s 56404us
>> 80	4/0	0s 52652us	0s 55611us
>> 80	4/4	0s 75200us	0s 78968us
>> 128	0/0	0s 52642us	0s 56403us
>> 128	0/4	0s 95883us	0s 95891us
>> 128	4/0	0s 114683us	0s 108511us
>> 128	4/4	0s 144780us	0s 110927us
>> 256	0/0	0s 80832us	0s 86489us
>> 256	0/4	0s 178586us	0s 163562us
>> 256	4/0	0s 208670us	0s 181719us
>> 256	4/4	0s 270705us	0s 148525us
>> 512	0/0	0s 156049us	0s 148348us
>> 512	0/4	0s 313933us	0s 298908us
>> 512	4/0	0s 411671us	0s 329025us
>> 512	4/4	0s 516971us	0s 208746us
>> 1024	0/0	0s 297067us	0s 274019us
>> 1024	0/4	0s 584703us	0s 569604us
>> 1024	4/0	0s 818104us	0s 616419us
>> 1024	4/4	1s 22839us	0s 328953us
>> 2048	0/0	0s 577077us	0s 524148us
>> 2048	0/4	1s 125953us	1s 111258us
>> 2048	4/0	1s 894000us	1s 202724us
>> 2048	4/4	2s 331807us	0s 822437us
>> 4096	0/0	1s 25881us	1s 34128us
>> 4096	0/4	2s 619273us	2s 606489us
>> 4096	4/0	3s 553989us	2s 390272us
>> 4096	4/4	4s 737789us	1s 433213us
>>
>> Signed-off-by: Miao Xie <miaox@...fujitsu.com>
>> ---
>>   arch/x86/lib/memcpy_64.S |  135
>> +++++++++++++++++++++++++++++++++++++++++++++-
>>   1 files changed, 134 insertions(+), 1 deletions(-)
>>
>> diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
>> index 75ef61e..b0224f8 100644
>> --- a/arch/x86/lib/memcpy_64.S
>> +++ b/arch/x86/lib/memcpy_64.S
>> @@ -46,9 +46,39 @@ ENTRY(memcpy)
>>   	 * Use 32bit CMP here to avoid long NOP padding.
>>   	 */
>>   	cmp  $0x20, %edx
>> -	jb .Lhandle_tail
>> +	jbe .Lhandle_tail
>>
>>   	/*
>> +	 * The unaligned-copy code below only pays off for large copies
>> +	 * (> 100 bytes), so for small sizes we need not check whether dst
>> +	 * and src are aligned.
>> +	 */
>> +	cmp $100, %edx
>> +	jb .Lboth_aligned
>> +
>> +	/*
>> +	 * Unaligned access always hurts performance, so in order to avoid
>> +	 * it we align the addresses (both src and dst) first, and then
>> +	 * copy from an aligned src to an aligned dst by using shifts.
>> +	 * However, we found that when src is aligned and dst is not, the
>> +	 * generic memory copy (that is, reading aligned data from the
>> +	 * source and writing unaligned data to the destination) performs
>> +	 * better than the variant that uses shifts to avoid unaligned
>> +	 * access.
>> +	 * So if src is aligned, we need not check whether dst is aligned;
>> +	 * just go to .Lboth_aligned.
>> +	 */
>> +	test $7, %esi		/* src align check */
>> +	jz .Lboth_aligned
>> +
>> +	/* if both dst and src are unaligned, go to the unaligned copy */
>> +	test $7, %edi
>> +	jnz .Ldst_unaligned
>> +
>> +	jmp .Lsrc_unaligned_dst_aligned
>> +
>> +.Lboth_aligned:
>> +	/*
>>   	 * We check whether memory false dependece could occur,
>>   	 * then jump to corresponding copy mode.
>>   	 */
>> @@ -166,6 +196,109 @@ ENTRY(memcpy)
>>
>>   .Lend:
>>   	retq
>> +
>> +	.p2align 4
>> +.Ldst_unaligned:
>> +	movq %rdi, %rcx
>> +	andq $7, %rcx		/* Align the destination */
>> +	negq %rcx
>> +	andq $7, %rcx
>> +	subq %rcx, %rdx
>> +
>> +	/* copy one quadword and advance both pointers to align dst */
>> +	movq (%rsi), %r8
>> +	movq %r8, (%rdi)
>> +	addq %rcx, %rdi
>> +	addq %rcx, %rsi
>> +
>> +	test $7, %esi		/* src align check */
>> +	jz .Lboth_aligned
>> +
>> +	.p2align 4
>> +.Lsrc_unaligned_dst_aligned:
>> +	push %rbx
>> +	push %r12
>> +	push %r13
>> +	push %r14
>> +	push %r15
>> +	/*
>> +	 * Calculate the shift counts used to combine words read from the
>> +	 * aligned-down src into the words stored to the aligned dst.
>> +	 */
>> +	movq %rsi, %r14
>> +	andq $7, %r14
>> +	shlq $3, %r14
>> +
>> +	movq $64, %r15
>> +	subq %r14, %r15
>> +
>> +	andq $-8, %rsi		/* src aligned */
>> +	movq 0*8(%rsi), %r8
>> +
>> +	movq %rdx, %rbx
>> +	shrq $5, %rbx
>> +	jz .Lsrc_unaligned_less32
>> +
>> +	/*
>> +	 * %r8 : store src[0]
>> +	 * %r9 : store src[1]
>> +	 * %r10: store src[2]
>> +	 * %r11: store src[3]
>> +	 * %r12: store src[4]
>> +	 * %r13: store the tmp data
>> +	 */
>> +	.p2align 4
>> +.Lsrc_unaligned_loop32:
>> +	movq 1*8(%rsi), %r9
>> +	movq 2*8(%rsi), %r10
>> +	movq 3*8(%rsi), %r11
>> +	movq 4*8(%rsi), %r12
>> +
>> +	movq %r9, %r13
>> +	movb %r14b, %cl
>> +	shrq %cl, %r8
>> +	shrq %cl, %r13
>> +	movb %r15b, %cl
>> +	shlq  %cl, %r9
>> +	orq %r8, %r9
>> +	movq %r10, %r8
>> +	shlq  %cl, %r10
>> +	orq %r13, %r10
>> +
>> +	movq %r11, %r13
>> +	movb %r14b, %cl
>> +	shrq %cl, %r8
>> +	shrq %cl, %r13
>> +	movb %r15b, %cl
>> +	shlq  %cl, %r11
>> +	orq %r8, %r11
>> +	movq %r12, %r8
>> +	shlq  %cl, %r12
>> +	orq %r13, %r12
>> +
>> +	movq %r9, 0*8(%rdi)
>> +	movq %r10, 1*8(%rdi)
>> +	movq %r11, 2*8(%rdi)
>> +	movq %r12, 3*8(%rdi)
>> +
>> +	leaq 4*8(%rdi), %rdi
>> +	leaq 4*8(%rsi), %rsi
>> +	decq %rbx
>> +	jnz .Lsrc_unaligned_loop32
>> +
>> +	.p2align 4
>> +.Lsrc_unaligned_less32:
>> +	shrq $3, %r14
>> +	addq %r14, %rsi
>> +	pop %r15
>> +	pop %r14
>> +	pop %r13
>> +	pop %r12
>> +	pop %rbx
>> +	andq $31, %rdx
>> +	jnz .Lhandle_tail
>> +	retq
>> +
>>   	CFI_ENDPROC
>>   ENDPROC(memcpy)
>>   ENDPROC(__memcpy)
>> --
>> 1.7.0.1
>
>
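
The new .Ldst_unaligned / .Lsrc_unaligned_dst_aligned path above can be
summarized in C. This is only a rough sketch of the idea for a
little-endian machine, not the kernel code: it copies 8 bytes per loop
iteration instead of 32, uses made-up names, and leans on memcpy() for
the head and tail.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Align dst first; then, if src is still misaligned, read only aligned
 * 64-bit words and merge neighbouring words with shifts, so every load
 * and store in the main loop is aligned.
 */
void *memcpy_shift_sketch(void *dst, const void *src, size_t len)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	if (len > 100 && ((uintptr_t)s & 7)) {
		/* Copy one quadword, then step past the 0..7 bytes that
		 * dst needs to reach an 8-byte boundary. */
		size_t head = (0 - (uintptr_t)d) & 7;

		memcpy(d, s, 8);
		d += head;
		s += head;
		len -= head;

		if ((uintptr_t)s & 7) {
			/* src is `k` bytes past an 8-byte boundary, so each
			 * output word takes its low 8-k bytes from the
			 * current aligned word and its high k bytes from
			 * the next one. */
			unsigned int rshift = ((uintptr_t)s & 7) * 8;
			unsigned int lshift = 64 - rshift;
			const unsigned char *sa =
			    (const unsigned char *)((uintptr_t)s & ~(uintptr_t)7);
			uint64_t lo, hi, word;

			memcpy(&lo, sa, 8);		/* aligned load */
			while (len >= 16) {		/* stay inside src */
				sa += 8;
				memcpy(&hi, sa, 8);	/* aligned load */
				word = (lo >> rshift) | (hi << lshift);
				memcpy(d, &word, 8);	/* aligned store */
				lo = hi;
				d += 8;
				s += 8;
				len -= 8;
			}
		}
	}
	/* Small copies, aligned src, and the remaining tail. */
	memcpy(d, s, len);
	return dst;
}

For example, with the source 4 bytes past an 8-byte boundary (the 4/0 and
4/4 rows in the table), rshift is 32 and lshift is 32: each stored word is
the high half of one aligned source word glued to the low half of the
next. Those are also the rows that improve most at large sizes.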


Download attachment "benchmark.tar.gz" of type "application/x-gzip" (3132 bytes)
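
The attached tool itself is not reproduced here. As a rough idea of what
such a measurement loop can look like (a user-space sketch timing the
libc memcpy(); the real tool presumably exercises the kernel routine,
and all names below are illustrative):

#include <stdio.h>
#include <string.h>
#include <sys/time.h>

#define ITERATIONS 5000000L

static char src_buf[4096 + 8];
static char dst_buf[4096 + 8];

/* Time ITERATIONS copies of `len` bytes at the given src/dst offsets
 * and print one line in the same shape as the table above. */
static void bench(size_t len, int src_off, int dst_off)
{
	struct timeval start, end;
	char *src = src_buf + src_off;
	char *dst = dst_buf + dst_off;
	long i, sec, usec;

	gettimeofday(&start, NULL);
	for (i = 0; i < ITERATIONS; i++)
		memcpy(dst, src, len);
	gettimeofday(&end, NULL);

	sec = end.tv_sec - start.tv_sec;
	usec = end.tv_usec - start.tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	printf("%zu\t%d/%d\t%lds %ldus\n", len, src_off, dst_off, sec, usec);
}

int main(void)
{
	static const size_t lens[] = {
		1, 7, 8, 16, 32, 48, 64, 80, 128, 256, 512, 1024, 2048, 4096
	};
	size_t i;

	for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
		bench(lens[i], 0, 0);
		bench(lens[i], 0, 4);
		bench(lens[i], 4, 0);
		bench(lens[i], 4, 4);
	}
	/* Keep dst_buf live so the copies are not optimized away. */
	return dst_buf[0];
}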
