/*-----------------------------------------------------------------------
 * void *ym_memcpy(void *dst, const void *src, size_t n)
 *
 * AVX (ymm) memcpy.  ABI: SysV AMD64.
 *
 * In:     rdi = destination
 *         rsi = source
 *         rdx = byte count
 * Out:    rax = original destination
 * Clobb:  rcx, rdx, rsi, rdi, ymm0-ymm15, flags
 *
 * Strategy: byte-copy up to 31 head bytes so the destination becomes
 * 32-byte aligned, stream 512-byte chunks through ymm0-ymm15 (aligned
 * or unaligned loads depending on source alignment; stores are always
 * aligned), then finish the <512-byte tail with rep movsq / rep movsb.
 * Regions must not overlap (memcpy semantics).
 *---------------------------------------------------------------------*/
        .globl  ym_memcpy
        .type   ym_memcpy, @function
ym_memcpy:
        movq    %rdi, %rax              /* return value: original dst */

        /* Head: copy (-dst) & 31 bytes so dst is 32-byte aligned.
         * Clamp to the total count so a tiny copy cannot underflow
         * rdx (the unclamped subtraction would wrap and rep movsb
         * would then overrun both buffers). */
        movq    %rdi, %rcx
        negq    %rcx
        andq    $0x1f, %rcx             /* rcx = bytes to 32B boundary */
        cmpq    %rdx, %rcx
        cmovaq  %rdx, %rcx              /* rcx = min(head, count)      */
        subq    %rcx, %rdx              /* count -= head bytes         */
        rep movsb                       /* advances rsi/rdi by rcx     */

        /* Split the remaining count into 512-byte chunks + tail. */
        movq    %rdx, %rcx
        andq    $0x1ff, %rdx            /* rdx = tail bytes (< 512)    */
        shrq    $9, %rcx                /* rcx = number of 512B chunks */
        jz      .Ltrailer

        /* Choose a loop on SOURCE alignment; dst is already aligned. */
        testb   $0x1f, %sil
        jz      .Lrepeat_a

        .align  32
.Lrepeat_ua:                            /* unaligned src: vmovups loads */
        vmovups 0x0(%rsi),   %ymm0
        vmovups 0x20(%rsi),  %ymm1
        vmovups 0x40(%rsi),  %ymm2
        vmovups 0x60(%rsi),  %ymm3
        vmovups 0x80(%rsi),  %ymm4
        vmovups 0xa0(%rsi),  %ymm5
        vmovups 0xc0(%rsi),  %ymm6
        vmovups 0xe0(%rsi),  %ymm7
        vmovups 0x100(%rsi), %ymm8
        vmovups 0x120(%rsi), %ymm9
        vmovups 0x140(%rsi), %ymm10
        vmovups 0x160(%rsi), %ymm11
        vmovups 0x180(%rsi), %ymm12
        vmovups 0x1a0(%rsi), %ymm13
        vmovups 0x1c0(%rsi), %ymm14
        vmovups 0x1e0(%rsi), %ymm15
        vmovaps %ymm0,  0x0(%rdi)       /* dst stores are 32B aligned  */
        vmovaps %ymm1,  0x20(%rdi)
        vmovaps %ymm2,  0x40(%rdi)
        vmovaps %ymm3,  0x60(%rdi)
        vmovaps %ymm4,  0x80(%rdi)
        vmovaps %ymm5,  0xa0(%rdi)
        vmovaps %ymm6,  0xc0(%rdi)
        vmovaps %ymm7,  0xe0(%rdi)
        vmovaps %ymm8,  0x100(%rdi)
        vmovaps %ymm9,  0x120(%rdi)
        vmovaps %ymm10, 0x140(%rdi)
        vmovaps %ymm11, 0x160(%rdi)
        vmovaps %ymm12, 0x180(%rdi)
        vmovaps %ymm13, 0x1a0(%rdi)
        vmovaps %ymm14, 0x1c0(%rdi)
        vmovaps %ymm15, 0x1e0(%rdi)
        addq    $0x200, %rsi            /* advance to next 512B chunk  */
        addq    $0x200, %rdi
        subq    $1, %rcx
        jnz     .Lrepeat_ua
        jmp     .Ltrailer               /* skip the aligned-src loop   */

        .align  32
.Lrepeat_a:                             /* 32B-aligned src: vmovaps    */
        /* Hint the NEXT chunk (the original prefetched addresses that
         * are loaded immediately below, which is too late to help).
         * Distance is a heuristic — tune on the target uarch.        */
        prefetchnta 0x200(%rsi)
        prefetchnta 0x280(%rsi)
        prefetchnta 0x300(%rsi)
        prefetchnta 0x380(%rsi)
        vmovaps 0x0(%rsi),   %ymm0
        vmovaps 0x20(%rsi),  %ymm1
        vmovaps 0x40(%rsi),  %ymm2
        vmovaps 0x60(%rsi),  %ymm3
        vmovaps 0x80(%rsi),  %ymm4
        vmovaps 0xa0(%rsi),  %ymm5
        vmovaps 0xc0(%rsi),  %ymm6
        vmovaps 0xe0(%rsi),  %ymm7
        vmovaps 0x100(%rsi), %ymm8
        vmovaps 0x120(%rsi), %ymm9
        vmovaps 0x140(%rsi), %ymm10
        vmovaps 0x160(%rsi), %ymm11
        vmovaps 0x180(%rsi), %ymm12
        vmovaps 0x1a0(%rsi), %ymm13
        vmovaps 0x1c0(%rsi), %ymm14
        vmovaps 0x1e0(%rsi), %ymm15
        vmovaps %ymm0,  0x0(%rdi)
        vmovaps %ymm1,  0x20(%rdi)
        vmovaps %ymm2,  0x40(%rdi)
        vmovaps %ymm3,  0x60(%rdi)
        vmovaps %ymm4,  0x80(%rdi)
        vmovaps %ymm5,  0xa0(%rdi)
        vmovaps %ymm6,  0xc0(%rdi)
        vmovaps %ymm7,  0xe0(%rdi)
        vmovaps %ymm8,  0x100(%rdi)
        vmovaps %ymm9,  0x120(%rdi)
        vmovaps %ymm10, 0x140(%rdi)
        vmovaps %ymm11, 0x160(%rdi)
        vmovaps %ymm12, 0x180(%rdi)
        vmovaps %ymm13, 0x1a0(%rdi)
        vmovaps %ymm14, 0x1c0(%rdi)
        vmovaps %ymm15, 0x1e0(%rdi)
        addq    $0x200, %rsi            /* advance to next 512B chunk  */
        addq    $0x200, %rdi
        subq    $1, %rcx
        jnz     .Lrepeat_a

        .align  32
.Ltrailer:
        /* Tail: rdx < 512 bytes remain; qwords first, then bytes. */
        movq    %rdx, %rcx
        shrq    $3, %rcx
        rep movsq                       /* whole qwords                */
        movq    %rdx, %rcx
        andq    $0x7, %rcx
        rep movsb                       /* leftover 0-7 bytes          */
        vzeroupper                      /* clear ymm upper state before
                                           returning to SSE/C code
                                           (SysV AVX transition rule)  */
        retq
        .size   ym_memcpy, .-ym_memcpy