Date:	Thu, 23 Sep 2010 20:01:43 +0800
From:	Miao Xie <miaox@...fujitsu.com>
To:	Ingo Molnar <mingo@...hat.com>, Andi Kleen <andi@...stfloor.org>,
	Ma Ling <ling.ma@...el.com>
CC:	Linux Kernel <linux-kernel@...r.kernel.org>
Subject: [PATCH -tip] lib,x86_64: improve the performance of memcpy() for
 unaligned copy

memcpy() on x86_64 has not been optimized for unaligned copies the way it
has been on other architectures; this patch fixes that.

I tested this patch by performing each copy 5,000,000 times, for the buffer
sizes and src/dest alignments listed below, on my x86_64 box (a sketch of the
timing loop follows the table):

Len	Src/Dest	Old memcpy	New memcpy
(bytes)	alignment
---	---------	-------------	-------------
32	0/0		0s 59553us	0s 39597us
32	0/4		0s 37675us	0s 39583us
32	4/0		0s 35720us	0s 39568us
32	4/4		0s 35721us	0s 39564us
256	0/0		0s 88783us	0s 86759us
256	0/4		0s 182896us	0s 166298us
256	4/0		0s 209244us	0s 191853us
256	4/4		0s 262847us	0s 165768us
512	0/0		0s 156486us	0s 148370us
512	0/4		0s 318856us	0s 302365us
512	4/0		0s 412763us	0s 338468us
512	4/4		0s 518688us	0s 218969us
1024	0/0		0s 298076us	0s 268443us
1024	0/4		0s 592114us	0s 575168us
1024	4/0		0s 819969us	0s 654752us
1024	4/4		1s 16405us	0s 343318us
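
For reference, the timing loop looks roughly like the sketch below. This is
only an illustration of the measurement shape, not the exact harness: it runs
in user space against the libc memcpy(), the bench() helper and buffer sizes
are made up, and only the gettimeofday()-based seconds/microseconds accounting
matches the figures above.

#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <sys/time.h>

#define LOOPS 5000000UL

/* 8-byte-aligned buffers; the "4" cases simply add a 4-byte offset. */
static char src_buf[2048] __attribute__((aligned(8)));
static char dst_buf[2048] __attribute__((aligned(8)));

static void bench(size_t len, size_t src_off, size_t dst_off)
{
	struct timeval start, end;
	unsigned long i;
	long sec, usec;

	gettimeofday(&start, NULL);
	for (i = 0; i < LOOPS; i++)
		memcpy(dst_buf + dst_off, src_buf + src_off, len);
	gettimeofday(&end, NULL);

	sec  = end.tv_sec - start.tv_sec;
	usec = end.tv_usec - start.tv_usec;
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	printf("%zu\t%zu/%zu\t\t%lds %ldus\n", len, src_off, dst_off, sec, usec);
}

int main(void)
{
	static const size_t lens[] = { 32, 256, 512, 1024 };
	static const size_t offs[] = { 0, 4 };
	size_t i, j, k;

	for (i = 0; i < 4; i++)
		for (j = 0; j < 2; j++)
			for (k = 0; k < 2; k++)
				bench(lens[i], offs[j], offs[k]);
	return 0;
}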

Signed-off-by: Miao Xie <miaox@...fujitsu.com>
---
 arch/x86/lib/memcpy_64.S |  134 ++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 134 insertions(+), 0 deletions(-)
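
In C terms, the dispatch this patch adds at the top of memcpy() picks one of
three paths, roughly as sketched below. The function and enum names are
invented for illustration; the authoritative logic is the assembly in the
diff that follows.

#include <stddef.h>
#include <stdint.h>

enum copy_path {
	BOTH_ALIGNED,			/* existing word-by-word copy */
	DST_UNALIGNED,			/* .Ldst_unaligned: align dest first */
	SRC_UNALIGNED_DST_ALIGNED	/* .Lsrc_unaligned_dst_aligned */
};

static enum copy_path pick_path(const void *dst, const void *src, size_t len)
{
	if (len <= 80)			/* small copy: checks not worth it */
		return BOTH_ALIGNED;
	if (((uintptr_t)src & 7) == 0)	/* src aligned: both-aligned copy wins */
		return BOTH_ALIGNED;
	if (((uintptr_t)dst & 7) != 0)	/* dest unaligned: fix up dest first */
		return DST_UNALIGNED;
	return SRC_UNALIGNED_DST_ALIGNED;	/* shifted copy */
}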

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..32dade1 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -49,6 +49,35 @@ ENTRY(memcpy)
 	jb .Lhandle_tail
 
 	/*
+	 * The unaligned-copy code only pays off for large copies (> 80
+	 * bytes), so for small sizes skip the src/dst alignment checks
+	 * and go straight to the both-aligned copy.
+	 */
+	cmp $80, %edx
+	jbe .Lboth_aligned
+
+	/*
+	 * Testing showed that when src is aligned, the both-aligned copy
+	 * beats the unaligned copy even if dest is unaligned.  So if src
+	 * is aligned, skip the dest alignment check and use the
+	 * both-aligned copy directly.
+	 */
+	movq %rsi, %rcx
+	andq $7, %rcx		/* src align check */
+	jz .Lboth_aligned
+
+	/* src is unaligned; if dest is too, go align dest first */
+	movq %rdi, %rcx
+	andq $7, %rcx		/* dst align check */
+	jnz .Ldst_unaligned
+
+	/* src is unaligned and dest is aligned: use the shifted copy */
+	movq %rsi, %rcx
+	andq $7, %rcx		/* src align check */
+	jnz .Lsrc_unaligned_dst_aligned
+
+.Lboth_aligned:
+	/*
 	 * We check whether memory false dependece could occur,
 	 * then jump to corresponding copy mode.
 	 */
@@ -166,6 +195,111 @@ ENTRY(memcpy)
 
 .Lend:
 	retq
+
+	.p2align 4
+.Ldst_unaligned:
+	negq %rcx
+	andq $7, %rcx
+	subq %rcx, %rdx
+
+	/* align dest: copy a whole word, then advance by the misalignment */
+	movq (%rsi), %r8
+	movq %r8, (%rdi)
+	addq %rcx, %rdi
+	addq %rcx, %rsi
+
+	cmp $0x20, %rdx
+	jb .Lhandle_tail
+
+	movq %rsi, %rcx
+	andq $7, %rcx		/* src align check */
+	jz .Lboth_aligned
+
+	.p2align 4
+.Lsrc_unaligned_dst_aligned:
+	push %rbx
+	push %r12
+	push %r13
+	push %r14
+	push %r15
+	/*
+	 * Compute the bit shift needed to combine two aligned reads
+	 * from src into each aligned word stored to dest.
+	 */
+	movq %rsi, %r14
+	andq $7, %r14
+	shlq $3, %r14
+
+	movq $64, %r15
+	subq %r14, %r15
+
+	andq $-8, %rsi		/* src aligned */
+	movq 0*8(%rsi), %r8
+
+	movq %rdx, %rbx
+	shrq $5, %rbx
+	jz .Lsrc_unaligned_less32
+
+	/*
+	 * Register usage in the loop below:
+	 * %r8 : src word 0, carried into the next iteration
+	 * %r9 - %r12: src words 1-4
+	 * %r13: scratch for intermediate shifted values
+	 * %r14: shift count in bits, %r15: 64 - shift
+	 * %cl : shift count loaded from %r14b/%r15b
+	 */
+	.p2align 4
+.Lsrc_unaligned_loop32:
+	movq 1*8(%rsi), %r9
+	movq 2*8(%rsi), %r10
+	movq 3*8(%rsi), %r11
+	movq 4*8(%rsi), %r12
+
+	movq %r9, %r13
+	movb %r14b, %cl
+	shrq %cl, %r8
+	shrq %cl, %r13
+	movb %r15b, %cl
+	shlq  %cl, %r9
+	orq %r8, %r9
+	movq %r10, %r8
+	shlq  %cl, %r10
+	orq %r13, %r10
+
+	movq %r11, %r13
+	movb %r14b, %cl
+	shrq %cl, %r8
+	shrq %cl, %r13
+	movb %r15b, %cl
+	shlq  %cl, %r11
+	orq %r8, %r11	
+	movq %r12, %r8
+	shlq  %cl, %r12
+	orq %r13, %r12
+
+	movq %r9, 0*8(%rdi)
+	movq %r10, 1*8(%rdi)
+	movq %r11, 2*8(%rdi)
+	movq %r12, 3*8(%rdi)
+
+	leaq 4*8(%rdi), %rdi
+	leaq 4*8(%rsi), %rsi
+	decq %rbx
+	jnz .Lsrc_unaligned_loop32
+
+	.p2align 4
+.Lsrc_unaligned_less32:
+	shrq $3, %r14
+	addq %r14, %rsi
+	pop %r15
+	pop %r14
+	pop %r13
+	pop %r12
+	pop %rbx
+	andq $31, %rdx
+	jnz .Lhandle_tail
+	retq
+
 	CFI_ENDPROC
 ENDPROC(memcpy)
 ENDPROC(__memcpy)
-- 
1.7.0.1
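
For completeness, the .Lsrc_unaligned_dst_aligned loop above corresponds
roughly to the C sketch below (illustration only; copy_shifted() is an
invented name). With dest 8-byte aligned and src unaligned, each destination
word is stitched together from two aligned source words with shifts, so every
load and store stays aligned. The assembly unrolls this four words per
iteration and hands the remaining 0-31 tail bytes back to .Lhandle_tail.

#include <stddef.h>
#include <stdint.h>

/* Assumes src is NOT 8-byte aligned (shift != 0), as on the asm path. */
static void copy_shifted(uint64_t *dst, const unsigned char *src, size_t words)
{
	unsigned int shift = ((uintptr_t)src & 7) * 8;	/* bits, like %r14 */
	unsigned int rev   = 64 - shift;		/* like %r15 */
	const uint64_t *s  = (const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
	uint64_t cur = s[0];	/* like %r8, carried across iterations */
	size_t i;

	for (i = 0; i < words; i++) {
		uint64_t next = s[i + 1];
		/* little-endian: the low bytes of the result come from 'cur' */
		dst[i] = (cur >> shift) | (next << rev);
		cur = next;
	}
}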