Message-ID: <CAJmZWFHTG8cR77zrUpKF81tphcTQ1fyDO6vqnY57ptcs2yM=-A@mail.gmail.com>
Date: Tue, 25 Mar 2025 19:42:09 -0300
From: Herton Krzesinski <hkrzesin@...hat.com>
To: David Laight <david.laight.linux@...il.com>
Cc: Linus Torvalds <torvalds@...ux-foundation.org>, Mateusz Guzik <mjguzik@...il.com>, x86@...nel.org,
tglx@...utronix.de, mingo@...hat.com, bp@...en8.de,
dave.hansen@...ux.intel.com, hpa@...or.com, olichtne@...hat.com,
atomasov@...hat.com, aokuliar@...hat.com, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] x86: handle the tail in rep_movs_alternative() with an
overlapping store
On Fri, Mar 21, 2025 at 5:47 PM David Laight
<david.laight.linux@...il.com> wrote:
>
> On Thu, 20 Mar 2025 16:53:32 -0700
> Linus Torvalds <torvalds@...ux-foundation.org> wrote:
>
> > On Thu, 20 Mar 2025 at 14:17, Linus Torvalds
> > <torvalds@...ux-foundation.org> wrote:
> > >
> > > On Thu, 20 Mar 2025 at 12:33, Mateusz Guzik <mjguzik@...il.com> wrote:
> > > >
> > > > I have a recollection that handling the tail after rep movsq with an
> > > > overlapping store was suffering a penalty big enough to warrant a
> > > > "normal" copy instead, avoiding the just written to area.
> > >
> > > Ahh. Good point. The rep movsq might indeed end up having odd effects
> > > with subsequent aliasing memory operations.
> > >
> > > Consider myself convinced.
> >
> > Actually, I think there's a solution for this.
> >
> > Do not do the last 0-7 bytes as a word that overlaps with the tail of
> > the 'rep movs'
> >
> > Do the last 8-15 bytes *non-overlapping* (well, they overlap each
> > other, but not the 'rep movs')
> >
> > Something UNTESTED like the appended, in other words. The large case
> > then ends up without any conditionals, looking something like this:
> >
> > mov %rcx,%rax
> > shr $0x3,%rcx
> > dec %rcx
> > and $0x7,%eax
> > rep movsq %ds:(%rsi),%es:(%rdi)
> > mov (%rsi),%rcx
> > mov %rcx,(%rdi)
> > mov (%rsi,%rax,1),%rcx
> > mov %rcx,(%rdi,%rax,1)
> > xor %ecx,%ecx
> > ret
>
> I think you can save the 'tail end' copying the same 8 bytes twice by doing:
> sub $9,%rcx
> mov %rcx,%rax
> shr $3,%rcx
> and $7,%rax
> inc %rax
> before the 'rep movsq'.
I'm not sure how the above would handle the remaining bytes kept in %rax?
Anyway, maybe another version like the one below avoids the rep movs
penalty? I'm not sure whether doing the tail copy before the rep movsq is OK:
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index fc9fb5d06174..a0f9655e364c 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -62,10 +62,15 @@ SYM_FUNC_START(rep_movs_alternative)
je .Lexit
cmp $8,%ecx
jae .Lword
- jmp .Lcopy_user_tail
+4: movq -8(%rsi,%rcx),%rax
+5: movq %rax,-8(%rdi,%rcx)
+ xorl %ecx,%ecx
+ RET
_ASM_EXTABLE_UA( 2b, .Lcopy_user_tail)
_ASM_EXTABLE_UA( 3b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 4b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 5b, .Lcopy_user_tail)
.Llarge:
0: ALTERNATIVE "jmp .Llarge_movsq", "rep movsb", X86_FEATURE_ERMS
@@ -74,18 +79,20 @@ SYM_FUNC_START(rep_movs_alternative)
_ASM_EXTABLE_UA( 0b, 1b)
.Llarge_movsq:
+ /* copy the tail 8 bytes first, to avoid the overlapping
+    store penalty after rep movsq */
+0: movq -8(%rsi,%rcx),%rax
+1: movq %rax,-8(%rdi,%rcx)
movq %rcx,%rax
shrq $3,%rcx
- andl $7,%eax
-0: rep movsq
- movl %eax,%ecx
- testl %ecx,%ecx
- jne .Lcopy_user_tail
+2: rep movsq
+ xorl %ecx,%ecx
RET
-
-1: leaq (%rax,%rcx,8),%rcx
+3: movq %rax,%rcx
jmp .Lcopy_user_tail
- _ASM_EXTABLE_UA( 0b, 1b)
+ _ASM_EXTABLE_UA( 0b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 1b, .Lcopy_user_tail)
+ _ASM_EXTABLE_UA( 2b, 3b)
SYM_FUNC_END(rep_movs_alternative)
EXPORT_SYMBOL(rep_movs_alternative)
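For anyone who finds the asm hard to follow, a rough userspace C model of
the .Llarge_movsq path above could look like the sketch below (model only:
hypothetical helper name, no STAC/CLAC or fault handling, assumes len >= 8).
The point is just the ordering: the possibly unaligned 8-byte tail copy is
done before the word copy, not after it, so the small accesses never touch
an area the string copy has just written:

#include <stdint.h>
#include <string.h>

/*
 * Rough model of the .Llarge_movsq path in the patch above:
 * copy the last, possibly unaligned, 8 bytes first, then copy
 * len/8 aligned words from the start (the "rep movsq" part).
 * The tail and the word copy can overlap by up to 8 bytes; the
 * overlapping bytes are simply written twice with the same data.
 */
static void large_movsq_model(void *to, const void *from, unsigned long len)
{
	uint64_t tail;

	/* 0:/1: in the patch - the last 8 bytes */
	memcpy(&tail, (const char *)from + len - 8, sizeof(tail));
	memcpy((char *)to + len - 8, &tail, sizeof(tail));

	/* 2: in the patch - rep movsq of len/8 words */
	for (unsigned long i = 0; i < (len >> 3); i++)
		memcpy((char *)to + 8 * i, (const char *)from + 8 * i, 8);
}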
I have also been trying to measure the impact of changes like the above;
however, it seems I either get no improvement or the result is obscured by
the overhead of the profiling itself. To measure it, I tried to
uninline/move copy_user_generic() like this:
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index c52f0133425b..2ae442c8a4b5 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -115,25 +115,8 @@ static inline bool __access_ok(const void __user *ptr, unsigned long size)
__must_check unsigned long
rep_movs_alternative(void *to, const void *from, unsigned len);
-static __always_inline __must_check unsigned long
-copy_user_generic(void *to, const void *from, unsigned long len)
-{
- stac();
- /*
- * If CPU has FSRM feature, use 'rep movs'.
- * Otherwise, use rep_movs_alternative.
- */
- asm volatile(
- "1:\n\t"
- ALTERNATIVE("rep movsb",
- "call rep_movs_alternative",
ALT_NOT(X86_FEATURE_FSRM))
- "2:\n"
- _ASM_EXTABLE_UA(1b, 2b)
- :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
- : : "memory", "rax");
- clac();
- return len;
-}
+__must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned long len);
static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index e9251b89a9e9..4585349f8f33 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -142,3 +142,24 @@ void __memcpy_flushcache(void *_dst, const void *_src, size_t size)
}
EXPORT_SYMBOL_GPL(__memcpy_flushcache);
#endif
+
+__must_check unsigned long
+copy_user_generic(void *to, const void *from, unsigned long len)
+{
+ stac();
+ /*
+ * If CPU has FSRM feature, use 'rep movs'.
+ * Otherwise, use rep_movs_alternative.
+ */
+ asm volatile(
+ "1:\n\t"
+ ALTERNATIVE("rep movsb",
+ "call rep_movs_alternative",
ALT_NOT(X86_FEATURE_FSRM))
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ :"+c" (len), "+D" (to), "+S" (from), ASM_CALL_CONSTRAINT
+ : : "memory", "rax");
+ clac();
+ return len;
+}
+EXPORT_SYMBOL_GPL(copy_user_generic);
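(The only point of uninlining copy_user_generic() above is that fentry/fexit
need a real symbol to attach to, which the always-inline asm wrapper doesn't
provide. Assuming a bpftrace recent enough to accept the fentry alias for
kfunc probes, something like

  # bpftrace -l 'fentry:copy_user_generic'

should confirm the probe is visible before running the script below.)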
I then used bpftrace with the following script to try to measure execution times:
#############################
fentry:copy_user_generic
/strcontains(comm,"iperf3")/
{
	/*printf("start %ul %p\n", args.len, kptr(args.to));*/
	@start[kptr(args.to),args.len] = nsecs;
}

fexit:copy_user_generic
/strcontains(comm,"iperf3") && @start[kptr(args.to)-args.len,args.len]/
{
	/*printf("end %ul %p\n", args.len, kptr(args.to)-args.len);*/
	$len = args.len;
	$len >>= 1;
	$log_len = 0;
	while ($len) {
		$len >>= 1;
		$log_len++;
	}
	$log1 = 1;
	$log1 <<= $log_len;
	$log2 = $log1;
	$log2 <<= 1;
	$dalign = (uint64)(kptr(args.to) - args.len);
	$dalign &= 0x7;
	@us[$dalign,$log1,$log2] = hist((nsecs - @start[kptr(args.to)-args.len,args.len]));
	delete(@start, (kptr(args.to)-args.len,args.len))
}

END
{
	clear(@start);
}
#############################
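(For reference, this was run roughly as follows, with iperf3 generating the
traffic in another terminal; the script file name here is just an example:

  # bpftrace ./copy_user_hist.bt
  # iperf3 -c <server>

The @us histograms end up keyed on the destination alignment and on the
power-of-two size bucket [$log1, $log2) of each copy.)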
But at least for this change the results are mixed; I can't prove an
improvement with it.
>
> David
>
> >
> > with some added complexity - but not a lot - in the exception fixup cases.
> >
> > This is once again intentionally whitespace-damaged, because I don't
> > want people applying this mindlessly. Somebody needs to double-check
> > my logic, and verify that this also avoids the cost from the aliasing
> > with the rep movs.
> >
> > Linus
> ...
>