Message-Id: <20201014083300.19077-4-ankur.a.arora@oracle.com>
Date: Wed, 14 Oct 2020 01:32:54 -0700
From: Ankur Arora <ankur.a.arora@...cle.com>
To: linux-kernel@...r.kernel.org, linux-mm@...ck.org
Cc: kirill@...temov.name, mhocko@...nel.org,
boris.ostrovsky@...cle.com, konrad.wilk@...cle.com,
Ankur Arora <ankur.a.arora@...cle.com>,
Peter Zijlstra <peterz@...radead.org>,
Ingo Molnar <mingo@...hat.com>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...hat.com>,
Namhyung Kim <namhyung@...nel.org>
Subject: [PATCH 3/8] perf bench: add memset_movnti()
Clone memset_movnti() from arch/x86/lib/memset_64.S.
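
For background, movnti is a non-temporal store: the data is written
through write-combining buffers rather than the cache hierarchy, which
is why this variant ends in an sfence to order the stores. A minimal C
sketch of the idea (illustrative only, not code from this patch; the
function name is made up, and the alignment and tail handling of the
real routine are elided):

#include <emmintrin.h>	/* _mm_stream_si64(), _mm_sfence() */
#include <stddef.h>
#include <stdint.h>

/* Fill an 8-byte-aligned buffer whose length is a multiple of 8
 * using non-temporal 64-bit stores, roughly what memset_movnti()
 * does in assembly. */
static void memset_movnt_sketch(void *dst, unsigned char c, size_t len)
{
	uint64_t v = 0x0101010101010101ULL * c;	/* expand byte value */
	long long *p = dst;
	size_t i;

	for (i = 0; i < len / 8; i++)
		_mm_stream_si64(p + i, (long long)v);	/* movnti */

	_mm_sfence();	/* order the NT stores against later accesses */
}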
perf bench mem memset -f x86-64-movnt on Intel Broadwellx, Skylakex
and AMD Rome:
Intel Broadwellx:
$ for i in 2 8 32 128 512; do
      perf bench mem memset -f x86-64-movnt -s ${i}MB
  done
# Output pruned.
# Running 'mem/memset' benchmark:
# function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
# Copying 2MB bytes ...
11.837121 GB/sec
# Copying 8MB bytes ...
11.783560 GB/sec
# Copying 32MB bytes ...
11.868591 GB/sec
# Copying 128MB bytes ...
11.865211 GB/sec
# Copying 512MB bytes ...
11.864085 GB/sec
Intel Skylakex:
$ for i in 2 8 32 128 512; do
      perf bench mem memset -f x86-64-movnt -s ${i}MB
  done
# Running 'mem/memset' benchmark:
# function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
# Copying 2MB bytes ...
6.361971 GB/sec
# Copying 8MB bytes ...
6.300403 GB/sec
# Copying 32MB bytes ...
6.288992 GB/sec
# Copying 128MB bytes ...
6.328793 GB/sec
# Copying 512MB bytes ...
6.324471 GB/sec
AMD Rome:
$ for i in 2 8 32 128 512; do
      perf bench mem memset -f x86-64-movnt -s ${i}MB
  done
# Running 'mem/memset' benchmark:
# function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
# Copying 2MB bytes ...
10.993199 GB/sec
# Copying 8MB bytes ...
14.221784 GB/sec
# Copying 32MB bytes ...
14.293337 GB/sec
# Copying 128MB bytes ...
15.238947 GB/sec
# Copying 512MB bytes ...
16.476093 GB/sec
Signed-off-by: Ankur Arora <ankur.a.arora@...cle.com>
---
tools/arch/x86/lib/memset_64.S | 68 ++++++++++++++++------------
tools/perf/bench/mem-memset-x86-64-asm-def.h | 6 ++-
2 files changed, 43 insertions(+), 31 deletions(-)
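
The memset_64.S change below rewrites memset_orig as an assembler macro,
MEMSET_MOV, instantiated once with movq (cached stores, no fence) and
once with movnti (non-temporal stores, trailing sfence). \@ expands to a
value unique to each macro instantiation, which keeps the local labels
of the two expansions distinct. A loose C analogue of the one-body,
two-instantiations pattern (hypothetical sketch, not code from this
patch):

#include <emmintrin.h>	/* _mm_stream_si64(), _mm_sfence() */
#include <stddef.h>

/* One body, stamped out once per store flavour, with an optional
 * trailing fence: the same shape as the MEMSET_MOV macro. */
#define DEFINE_MEMSET_8B(name, STORE8, FENCE)			\
static void name(long long *p, long long v, size_t qwords)	\
{								\
	size_t i;						\
	for (i = 0; i < qwords; i++)				\
		STORE8(p + i, v);				\
	if (FENCE)						\
		_mm_sfence();					\
}

static void store8_movq(long long *p, long long v)
{
	*p = v;		/* plain cached 64-bit store (movq) */
}

DEFINE_MEMSET_8B(memset_movq_c, store8_movq, 0)
DEFINE_MEMSET_8B(memset_movnti_c, _mm_stream_si64, 1)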
diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S
index fd5d25a474b7..bfbf6d06f81e 100644
--- a/tools/arch/x86/lib/memset_64.S
+++ b/tools/arch/x86/lib/memset_64.S
@@ -26,7 +26,7 @@ SYM_FUNC_START(__memset)
*
* Otherwise, use original memset function.
*/
- ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS
movq %rdi,%r9
@@ -65,7 +65,8 @@ SYM_FUNC_START(memset_erms)
ret
SYM_FUNC_END(memset_erms)
-SYM_FUNC_START(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START(memset_\OP)
movq %rdi,%r10
/* expand byte value */
@@ -76,64 +77,71 @@ SYM_FUNC_START(memset_orig)
/* align dst */
movl %edi,%r9d
andl $7,%r9d
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
+ jnz .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:
movq %rdx,%rcx
shrq $6,%rcx
- jz .Lhandle_tail
+ jz .Lhandle_tail_\@
.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
decq %rcx
- movq %rax,(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
- movq %rax,24(%rdi)
- movq %rax,32(%rdi)
- movq %rax,40(%rdi)
- movq %rax,48(%rdi)
- movq %rax,56(%rdi)
+ \OP %rax,(%rdi)
+ \OP %rax,8(%rdi)
+ \OP %rax,16(%rdi)
+ \OP %rax,24(%rdi)
+ \OP %rax,32(%rdi)
+ \OP %rax,40(%rdi)
+ \OP %rax,48(%rdi)
+ \OP %rax,56(%rdi)
leaq 64(%rdi),%rdi
- jnz .Lloop_64
+ jnz .Lloop_64_\@
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
movl %edx,%ecx
andl $63&(~7),%ecx
- jz .Lhandle_7
+ jz .Lhandle_7_\@
shrl $3,%ecx
.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
decl %ecx
- movq %rax,(%rdi)
+ \OP %rax,(%rdi)
leaq 8(%rdi),%rdi
- jnz .Lloop_8
+ jnz .Lloop_8_\@
-.Lhandle_7:
+.Lhandle_7_\@:
andl $7,%edx
- jz .Lende
+ jz .Lende_\@
.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
- jnz .Lloop_1
+ jnz .Lloop_1_\@
-.Lende:
+.Lende_\@:
+ .if \fence
+ sfence
+ .endif
movq %r10,%rax
ret
-.Lbad_alignment:
+.Lbad_alignment_\@:
cmpq $7,%rdx
- jbe .Lhandle_7
+ jbe .Lhandle_7_\@
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
- jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+ jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index dac6d2b7c39b..53ead7f91313 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
-MEMSET_FN(memset_orig,
+MEMSET_FN(memset_movq,
"x86-64-unrolled",
"unrolled memset() in arch/x86/lib/memset_64.S")
@@ -11,3 +11,7 @@ MEMSET_FN(__memset,
MEMSET_FN(memset_erms,
"x86-64-stosb",
"movsb-based memset() in arch/x86/lib/memset_64.S")
+
+MEMSET_FN(memset_movnti,
+ "x86-64-movnt",
+ "movnt-based memset() in arch/x86/lib/memset_64.S")
--
2.9.3