[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20211020170305.376118-2-ankur.a.arora@oracle.com>
Date: Wed, 20 Oct 2021 10:02:52 -0700
From: Ankur Arora <ankur.a.arora@...cle.com>
To: linux-kernel@...r.kernel.org, linux-mm@...ck.org, x86@...nel.org
Cc: mingo@...nel.org, bp@...en8.de, luto@...nel.org,
akpm@...ux-foundation.org, mike.kravetz@...cle.com,
jon.grimm@....com, kvm@...r.kernel.org, konrad.wilk@...cle.com,
boris.ostrovsky@...cle.com, Ankur Arora <ankur.a.arora@...cle.com>
Subject: [PATCH v2 01/14] x86/asm: add memset_movnti()
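Add an uncached (non-temporal, based on MOVNTI) implementation of memset().
memset_movnti() only needs to differ from memset_orig() in the store opcode
used in the 64-byte and 8-byte loops, and in the SFENCE issued before
returning, so move the memset_orig() logic into a macro, MEMSET_MOV,
and use that to generate memset_movq() (which replaces memset_orig())
and memset_movnti(). The unaligned head store and the sub-8-byte tail
keep using regular MOVQ/MOVB stores in both variants.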
Signed-off-by: Ankur Arora <ankur.a.arora@...cle.com>
---
arch/x86/lib/memset_64.S | 68 ++++++++++++++++++++++------------------
1 file changed, 38 insertions(+), 30 deletions(-)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 9827ae267f96..ef2a091563d9 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -25,7 +25,7 @@ SYM_FUNC_START(__memset)
*
* Otherwise, use original memset function.
*/
- ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS
movq %rdi,%r9
@@ -66,7 +66,8 @@ SYM_FUNC_START_LOCAL(memset_erms)
ret
SYM_FUNC_END(memset_erms)
-SYM_FUNC_START_LOCAL(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START_LOCAL(memset_\OP)
movq %rdi,%r10
/* expand byte value */
@@ -77,64 +78,71 @@ SYM_FUNC_START_LOCAL(memset_orig)
/* align dst */
movl %edi,%r9d
andl $7,%r9d
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
+ jnz .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:
movq %rdx,%rcx
shrq $6,%rcx
- jz .Lhandle_tail
+ jz .Lhandle_tail_\@
.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
decq %rcx
- movq %rax,(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
- movq %rax,24(%rdi)
- movq %rax,32(%rdi)
- movq %rax,40(%rdi)
- movq %rax,48(%rdi)
- movq %rax,56(%rdi)
+ \OP %rax,(%rdi)
+ \OP %rax,8(%rdi)
+ \OP %rax,16(%rdi)
+ \OP %rax,24(%rdi)
+ \OP %rax,32(%rdi)
+ \OP %rax,40(%rdi)
+ \OP %rax,48(%rdi)
+ \OP %rax,56(%rdi)
leaq 64(%rdi),%rdi
- jnz .Lloop_64
+ jnz .Lloop_64_\@
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
movl %edx,%ecx
andl $63&(~7),%ecx
- jz .Lhandle_7
+ jz .Lhandle_7_\@
shrl $3,%ecx
.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
decl %ecx
- movq %rax,(%rdi)
+ \OP %rax,(%rdi)
leaq 8(%rdi),%rdi
- jnz .Lloop_8
+ jnz .Lloop_8_\@
-.Lhandle_7:
+.Lhandle_7_\@:
andl $7,%edx
- jz .Lende
+ jz .Lende_\@
.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
- jnz .Lloop_1
+ jnz .Lloop_1_\@
-.Lende:
+.Lende_\@:
+ .if \fence
+ sfence
+ .endif
movq %r10,%rax
ret
-.Lbad_alignment:
+.Lbad_alignment_\@:
cmpq $7,%rdx
- jbe .Lhandle_7
+ jbe .Lhandle_7_\@
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
- jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+ jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1
--
2.29.2
Powered by blists - more mailing lists