Message-ID: <CAJmZWFHTG8cR77zrUpKF81tphcTQ1fyDO6vqnY57ptcs2yM=-A@mail.gmail.com>
Date: Tue, 25 Mar 2025 19:42:09 -0300
From: Herton Krzesinski <hkrzesin@...hat.com>
To: David Laight <david.laight.linux@...il.com>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>, Mateusz Guzik <mjguzik@...il.com>, x86@...nel.org,
tglx@...utronix.de, mingo@...hat.com, bp@...en8.de,
dave.hansen@...ux.intel.com, hpa@...or.com, olichtne@...hat.com,
atomasov@...hat.com, aokuliar@...hat.com, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] x86: handle the tail in rep_movs_alternative() with an
overlapping store
On Fri, Mar 21, 2025 at 5:47 PM David Laight
<david.laight.linux@...il.com> wrote:
>
> On Thu, 20 Mar 2025 16:53:32 -0700
> Linus Torvalds <torvalds@...ux-foundation.org> wrote:
>
> > On Thu, 20 Mar 2025 at 14:17, Linus Torvalds
> > <torvalds@...ux-foundation.org> wrote:
> > >
> > > On Thu, 20 Mar 2025 at 12:33, Mateusz Guzik <mjguzik@...il.com> wrote:
> > > >
> > > > I have a recollection that handling the tail after rep movsq with an
> > > > overlapping store was suffering a penalty big enough to warrant a
> > > > "normal" copy instead, avoiding the just written to area.
> > >
> > > Ahh. Good point. The rep movsq might indeed end up having odd effects
> > > with subsequent aliasing memory operations.
> > >
> > > Consider myself convinced.
> >
> > Actually, I think there's a solution for this.
> >
> > Do not do the last 0-7 bytes as a word that overlaps with the tail of
> > the 'rep movs'
> >
> > Do the last 8-15 bytes *non-overlapping* (well, they overlap each
> > other, but not the 'rep movs')
> >
> > Something UNTESTED like the appended, in other words. The large case
> > then ends up without any conditionals, looking something like this:
> >
> > mov %rcx,%rax
> > shr $0x3,%rcx
> > dec %rcx
> > and $0x7,%eax
> > rep movsq %ds:(%rsi),%es:(%rdi)
> > mov (%rsi),%rcx
> > mov %rcx,(%rdi)
> > mov (%rsi,%rax,1),%rcx
> > mov %rcx,(%rdi,%rax,1)
> > xor %ecx,%ecx
> > ret
>
> I think you can save the 'tail end' copying the same 8 bytes twice by doing:
> sub $9,%rcx
> mov %rcx,%rax
> shr $3,%rcx
> and $7,%rax
> inc %rax
> before the 'rep movsq'.
I'm not sure how the above would handle the remaining bytes kept in %rax?
Anyway, maybe another version like the one below avoids the rep movs
penalty? I'm not sure whether doing the tail copy before the rep movsq is OK:
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index fc9fb5d06174..a0f9655e364c 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -62,10 +62,15 @@ SYM_FUNC_START(rep_movs_alternative)
je .Lexit
cmp $8,%ecx
jae .Lword
- jmp .Lcopy_user_tail
+4: movq -8(%rsi,%rcx),%rax
+5: movq %rax,-8(%rdi,%rcx)
+ xorl %ecx,%ecx
+ RET
_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 4b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 5b, .Lcopy_user_tail)
.Llarge:
0: ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS
@@ -74,18 +79,20 @@ SYM_FUNC_START(rep_movs_alternative)
_ASM_EXTABLE_UA( 0b, 1b)
.Llarge_movsq:
+ /* copy the tail 8 bytes first, to avoid the overlapping
+    store penalty after rep movsq */
+0: movq -8(%rsi,%rcx),%rax
+1: movq %rax,-8(%rdi,%rcx)
movq %rcx,%rax
shrq $3,%rcx
- andl $7,%eax
-0: rep movsq
- movl %eax,%ecx
- testl %ecx,%ecx
- jne .Lcopy_user_tail
+2: rep movsq
+ xorl %ecx,%ecx
RET
-
-1: leaq (%rax,%rcx,8),%rcx
+3: movq %rax,%rcx
jmp .Lcopy_user_tail
- _ASM_EXTABLE_UA( 0b, 1b)
+ _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 2b, 3b)
SYM_FUNC_END(rep_movs_alternative)
EXPORT_SYMBOL(rep_movs_alternative)
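For anyone who finds the asm hard to follow, a rough userspace C model of
the .Llarge_movsq path above could look like the sketch below (model only:
hypothetical helper name, no STAC/CLAC or fault handling, assumes len >= 8).
The point is just the ordering: the possibly unaligned 8-byte tail copy is
done before the word copy, not after it, so the small accesses never touch
an area the string copy has just written:

#include <stdint.h>
#include <string.h>

/*
 * Rough model of the .Llarge_movsq path in the patch above:
 * copy the last, possibly unaligned, 8 bytes first, then copy
 * len/8 aligned words from the start (the "rep movsq" part).
 * The tail and the word copy can overlap by up to 8 bytes; the
 * overlapping bytes are simply written twice with the same data.
 */
static void large_movsq_model(void *to, const void *from, unsigned long len)
{
	uint64_t tail;

	/* 0:/1: in the patch - the last 8 bytes */
	memcpy(&tail, (const char *)from + len - 8, sizeof(tail));
	memcpy((char *)to + len - 8, &tail, sizeof(tail));

	/* 2: in the patch - rep movsq of len/8 words */
	for (unsigned long i = 0; i < (len >> 3); i++)
		memcpy((char *)to + 8 * i, (const char *)from + 8 * i, 8);
}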
I have also been trying to measure the impact of changes like the above;
however, it seems I either get no improvement or the result is obscured by
the overhead of the profiling itself. To measure it, I tried to
uninline/move copy_user_generic() like this:
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c52f0133425b..2ae442c8a4b5 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -115,25 +115,8 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size)
__must_check unsigned long
rep_movs_alternative(void *to, const void *from, unsigned len);
-static __always_inline __must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned long len)
-{
- stac();
- /*
- * If CPU has FSRM feature, use 'rep movs'.
- * Otherwise, use rep_movs_alternative.
- */
- asm volatile(
- "1:\n\t"
- ALTERNATIVE("rep movsb",
- "call rep_movs_alternative",
ALT_NOT(X86_FEATURE_FSRM))
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
- : : "memory", "rax");
- clac();
- return len;
-}
+__must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned long len);
static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index e9251b89a9e9..4585349f8f33 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -142,3 +142,24 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
}
EXPORT_SYMBOL_GPL(__memcpy_flushcache);
#endif
+
+__must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned long len)
+{
+ stac();
+ /*
+ * If CPU has FSRM feature, use 'rep movs'.
+ * Otherwise, use rep_movs_alternative.
+ */
+ asm volatile(
+ "1:\n\t"
+ ALTERNATIVE("rep movsb",
+ "call rep_movs_alternative",
ALT_NOT(X86_FEATURE_FSRM))
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+ : : "memory", "rax");
+ clac();
+ return len;
+}
+EXPORT_SYMBOL_GPL(copy_user_generic);
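(The only point of uninlining copy_user_generic() above is that fentry/fexit
need a real symbol to attach to, which the always-inline asm wrapper doesn't
provide. Assuming a bpftrace recent enough to accept the fentry alias for
kfunc probes, something like

  # bpftrace -l 'fentry:copy_user_generic'

should confirm the probe is visible before running the script below.)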
I then used bpftrace with the following script to try to measure execution times:
#############################
fentry:copy_user_generic
/strcontains(comm,"iperf3")/
{
	/*printf("start %ul %p\n", args.len, kptr(args.to));*/
	@start[kptr(args.to),args.len] = nsecs;
}

fexit:copy_user_generic
/strcontains(comm,"iperf3") && @start[kptr(args.to)-args.len,args.len]/
{
	/*printf("end %ul %p\n", args.len, kptr(args.to)-args.len);*/
	$len = args.len;
	$len >>= 1;
	$log_len = 0;
	while ($len) {
		$len >>= 1;
		$log_len++;
	}
	$log1 = 1;
	$log1 <<= $log_len;
	$log2 = $log1;
	$log2 <<= 1;
	$dalign = (uint64)(kptr(args.to) - args.len);
	$dalign &= 0x7;
	@us[$dalign,$log1,$log2] = hist((nsecs - @start[kptr(args.to)-args.len,args.len]));
	delete(@start, (kptr(args.to)-args.len,args.len))
}

END
{
	clear(@start);
}
#############################
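(For reference, this was run roughly as follows, with iperf3 generating the
traffic in another terminal; the script file name here is just an example:

  # bpftrace ./copy_user_hist.bt
  # iperf3 -c <server>

The @us histograms end up keyed on the destination alignment and on the
power-of-two size bucket [$log1, $log2) of each copy.)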
But at least for this change the results are mixed; I can't prove an
improvement with it.
>
> David
>
> >
> > with some added complexity - but not a lot - in the exception fixup cases.
> >
> > This is once again intentionally whitespace-damaged, because I don't
> > want people applying this mindlessly. Somebody needs to double-check
> > my logic, and verify that this also avoids the cost from the aliasing
> > with the rep movs.
> >
> > Linus
> ...
>