[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <C10D3FB0CD45994C8A51FEC1227CE22F239E58D6FA@shsmsx502.ccr.corp.intel.com>
Date: Mon, 20 Jun 2011 11:42:42 +0800
From: "Ma, Ling" <ling.ma@...el.com>
To: "Ma, Ling" <ling.ma@...el.com>, "mingo@...e.hu" <mingo@...e.hu>
CC: "hpa@...or.com" <hpa@...or.com>,
"tglx@...utronix.de" <tglx@...utronix.de>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: RE: [PATCH RFC V2] [x86] Optimize copy-page by reducing impact from
HW prefetch
A new experiment with 4096-byte copies shows no improvement on snb,
a 10~15% improvement on Core2, and an 11.6% improvement on 64-bit Atom.
Thanks
Ling
> -----Original Message-----
> From: Ma, Ling
> Sent: Saturday, June 18, 2011 7:24 AM
> To: mingo@...e.hu
> Cc: hpa@...or.com; tglx@...utronix.de; linux-kernel@...r.kernel.org; Ma,
> Ling
> Subject: [PATCH RFC V2] [x86] Optimize copy-page by reducing impact
> from HW prefetch
>
> From: Ma Ling <ling.ma@...el.com>
>
> Caches exploit a program's temporal and spatial locality to bridge the
> processor-memory performance gap, and hardware prefetch is very important
> for improving performance by reducing cache misses. Modern CPU
> micro-architectures mainly support two kinds of prefetch mechanisms in
> the L1 data cache:
>
> a. Data cache unit (DCU) prefetcher. Data spatial locality asks us to
> provide adjacent data while handling the current data. A larger cache
> line size is one choice, but it would cause more cached data to be
> evicted and increase load latency, so we simply prefetch the next line
> when accessing the current data. This mode only prefetches data at
> ascending addresses.
>
> b. Instruction pointer (IP)-based strided prefetcher. Based on the
> load/write instruction address, the mechanism predicts which data to
> prefetch with an adaptive stride, covering both ascending and
> descending addresses.
>
> DCU mode works well when the time the program spends operating on the
> data is longer than the time needed to prefetch the next line. However,
> the copy-page function breaks this assumption, so DCU mode is hardly
> helpful, especially when we add software prefetch and the data is
> already in cache: bus traffic becomes busier, which seriously impacts
> performance.
>
> In this patch we introduce a backward copy to avoid the impact of HW
> prefetch (the DCU prefetcher), and simplify the original code.
> Performance improves by about 15% on Core2 and about 36% on snb.
> (We used our own micro-benchmark, and will do further testing according
> to your requirements.)
>
> Thanks
> Ling
>
> ---
> arch/x86/lib/copy_page_64.S | 124 +++++++++++++++++++----------------
> -------
> 1 files changed, 56 insertions(+), 68 deletions(-)
>
> diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
> index 6fec2d1..0a60705 100644
> --- a/arch/x86/lib/copy_page_64.S
> +++ b/arch/x86/lib/copy_page_64.S
> @@ -1,4 +1,5 @@
> /* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
> +/* Updated 2011 by Ma Ling to introduce backward copy */
>
> #include <linux/linkage.h>
> #include <asm/dwarf2.h>
> @@ -17,83 +18,70 @@ ENDPROC(copy_page_c)
>
> /* Could vary the prefetch distance based on SMP/UP */
>
> +/*
> + * By backward copy we manage to reduce impact from HW prefetch
> + * when data is in L1 cache, and get benefit when data is not in L1
> cache.
> + */
> ENTRY(copy_page)
> CFI_STARTPROC
> - subq $3*8,%rsp
> - CFI_ADJUST_CFA_OFFSET 3*8
> - movq %rbx,(%rsp)
> - CFI_REL_OFFSET rbx, 0
> - movq %r12,1*8(%rsp)
> - CFI_REL_OFFSET r12, 1*8
> - movq %r13,2*8(%rsp)
> - CFI_REL_OFFSET r13, 2*8
> -
> - movl $(4096/64)-5,%ecx
> - .p2align 4
> + lea 4096(%rsi), %rsi
> + lea 4096(%rdi), %rdi
> + mov $(4096/64)-5, %cl
> + mov $5, %dl
> + /*
> + * Nop force following instruction to be 16 bytes aligned.
> + */
> + nop
> .Loop64:
> - dec %rcx
> -
> - movq (%rsi), %rax
> - movq 8 (%rsi), %rbx
> - movq 16 (%rsi), %rdx
> - movq 24 (%rsi), %r8
> - movq 32 (%rsi), %r9
> - movq 40 (%rsi), %r10
> - movq 48 (%rsi), %r11
> - movq 56 (%rsi), %r12
> -
> - prefetcht0 5*64(%rsi)
> -
> - movq %rax, (%rdi)
> - movq %rbx, 8 (%rdi)
> - movq %rdx, 16 (%rdi)
> - movq %r8, 24 (%rdi)
> - movq %r9, 32 (%rdi)
> - movq %r10, 40 (%rdi)
> - movq %r11, 48 (%rdi)
> - movq %r12, 56 (%rdi)
> -
> - leaq 64 (%rsi), %rsi
> - leaq 64 (%rdi), %rdi
> + prefetchnta -5*64(%rsi)
> + dec %cl
> +
> + movq -0x8*1(%rsi), %rax
> + movq -0x8*2(%rsi), %r8
> + movq -0x8*3(%rsi), %r9
> + movq -0x8*4(%rsi), %r10
> + movq %rax, -0x8*1(%rdi)
> + movq %r8, -0x8*2(%rdi)
> + movq %r9, -0x8*3(%rdi)
> + movq %r10, -0x8*4(%rdi)
> +
> + movq -0x8*5(%rsi), %rax
> + movq -0x8*6(%rsi), %r8
> + movq -0x8*7(%rsi), %r9
> + movq -0x8*8(%rsi), %r10
> + leaq -64(%rsi), %rsi
> + movq %rax, -0x8*5(%rdi)
> + movq %r8, -0x8*6(%rdi)
> + movq %r9, -0x8*7(%rdi)
> + movq %r10, -0x8*8(%rdi)
> + leaq -64(%rdi), %rdi
>
> jnz .Loop64
>
> - movl $5,%ecx
> - .p2align 4
> .Loop2:
> - decl %ecx
> -
> - movq (%rsi), %rax
> - movq 8 (%rsi), %rbx
> - movq 16 (%rsi), %rdx
> - movq 24 (%rsi), %r8
> - movq 32 (%rsi), %r9
> - movq 40 (%rsi), %r10
> - movq 48 (%rsi), %r11
> - movq 56 (%rsi), %r12
> -
> - movq %rax, (%rdi)
> - movq %rbx, 8 (%rdi)
> - movq %rdx, 16 (%rdi)
> - movq %r8, 24 (%rdi)
> - movq %r9, 32 (%rdi)
> - movq %r10, 40 (%rdi)
> - movq %r11, 48 (%rdi)
> - movq %r12, 56 (%rdi)
> -
> - leaq 64(%rdi),%rdi
> - leaq 64(%rsi),%rsi
> -
> + dec %dl
> +
> + movq -0x8*1(%rsi), %rax
> + movq -0x8*2(%rsi), %r8
> + movq -0x8*3(%rsi), %r9
> + movq -0x8*4(%rsi), %r10
> + movq %rax, -0x8*1(%rdi)
> + movq %r8, -0x8*2(%rdi)
> + movq %r9, -0x8*3(%rdi)
> + movq %r10, -0x8*4(%rdi)
> +
> + movq -0x8*5(%rsi), %rax
> + movq -0x8*6(%rsi), %r8
> + movq -0x8*7(%rsi), %r9
> + movq -0x8*8(%rsi), %r10
> + leaq -64(%rsi), %rsi
> + movq %rax, -0x8*5(%rdi)
> + movq %r8, -0x8*6(%rdi)
> + movq %r9, -0x8*7(%rdi)
> + movq %r10, -0x8*8(%rdi)
> + leaq -64(%rdi), %rdi
> jnz .Loop2
>
> - movq (%rsp),%rbx
> - CFI_RESTORE rbx
> - movq 1*8(%rsp),%r12
> - CFI_RESTORE r12
> - movq 2*8(%rsp),%r13
> - CFI_RESTORE r13
> - addq $3*8,%rsp
> - CFI_ADJUST_CFA_OFFSET -3*8
> ret
> .Lcopy_page_end:
> CFI_ENDPROC
> --
> 1.6.5.2
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists