lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CACuRN0Pd8VFTz55qzXvJeqOEt2ZGi--j1wDyqnAt=q_42ES++w@mail.gmail.com>
Date:   Fri, 4 Jun 2021 18:56:54 +0900
From:   Akira Tsukamoto <akira.tsukamoto@...il.com>
To:     Paul Walmsley <paul.walmsley@...ive.com>,
        Palmer Dabbelt <palmer@...belt.com>,
        Albert Ou <aou@...s.berkeley.edu>, Gary Guo <gary@...yguo.net>,
        Nick Hu <nickhu@...estech.com>,
        Nylon Chen <nylon7@...estech.com>,
        Akira Tsukamoto <akira.tsukamoto@...il.com>,
        linux-riscv@...ts.infradead.org,
        Linux kernel mailing list <linux-kernel@...r.kernel.org>
Subject: [PATCH 1/1] riscv: prevent pipeline stall in __asm_to/copy_from_user

Reduce pipeline stalls caused by read-after-write (RAW) hazards.

These are the results from combining this speedup with
Gary's misalignment fix. Throughput improves from about 680 Mbps to about 900 Mbps.

Before applying these two patches.
---
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.1.153, port 45972
[  5] local 192.168.1.112 port 5201 connected to 192.168.1.153 port 45974
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec  80.8 MBytes   678 Mbits/sec
[  5]   1.00-2.00   sec  82.1 MBytes   689 Mbits/sec
[  5]   2.00-3.00   sec  82.1 MBytes   688 Mbits/sec
[  5]   3.00-4.00   sec  81.7 MBytes   685 Mbits/sec
[  5]   4.00-5.00   sec  82.1 MBytes   689 Mbits/sec
[  5]   5.00-6.00   sec  82.0 MBytes   687 Mbits/sec
[  5]   6.00-7.00   sec  82.4 MBytes   691 Mbits/sec
[  5]   7.00-8.00   sec  82.2 MBytes   689 Mbits/sec
[  5]   8.00-9.00   sec  82.2 MBytes   690 Mbits/sec
[  5]   9.00-10.00  sec  82.2 MBytes   690 Mbits/sec
[  5]  10.00-10.01  sec   486 KBytes   682 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-10.01  sec   820 MBytes   688 Mbits/sec                  receiver
-----------------------------------------------------------
---

After.
---
-----------------------------------------------------------
Server listening on 5201
-----------------------------------------------------------
Accepted connection from 192.168.1.153, port 44612
[  5] local 192.168.1.112 port 5201 connected to 192.168.1.153 port 44614
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-1.00   sec   105 MBytes   879 Mbits/sec
[  5]   1.00-2.00   sec   108 MBytes   904 Mbits/sec
[  5]   2.00-3.00   sec   107 MBytes   901 Mbits/sec
[  5]   3.00-4.00   sec   108 MBytes   902 Mbits/sec
[  5]   4.00-5.00   sec   108 MBytes   906 Mbits/sec
[  5]   5.00-6.00   sec   107 MBytes   900 Mbits/sec
[  5]   6.00-7.00   sec   108 MBytes   906 Mbits/sec
[  5]   7.00-8.00   sec   108 MBytes   904 Mbits/sec
[  5]   8.00-9.00   sec   108 MBytes   902 Mbits/sec
[  5]   9.00-10.00  sec   108 MBytes   905 Mbits/sec
[  5]  10.00-10.01  sec   612 KBytes   902 Mbits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate
[  5]   0.00-10.01  sec  1.05 GBytes   901 Mbits/sec                  receiver
-----------------------------------------------------------
---

Signed-off-by: Akira Tsukamoto <akira.tsukamoto@...il.com>
---
 arch/riscv/lib/uaccess.S | 106 +++++++++++++++++++++++++++------------
 1 file changed, 73 insertions(+), 33 deletions(-)

diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index fceaeb18cc64..2528a77709e1 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -19,50 +19,90 @@ ENTRY(__asm_copy_from_user)
     li t6, SR_SUM
     csrs CSR_STATUS, t6

-    add a3, a1, a2
+    move t5, a0  /* Preserve return value */
+
+    /* Defer to byte-oriented copy for small sizes */
+    sltiu a3, a2, 64
+    bnez a3, 4f
     /* Use word-oriented copy only if low-order bits match */
-    andi t0, a0, SZREG-1
-    andi t1, a1, SZREG-1
-    bne t0, t1, 2f
+    andi a3, t5, SZREG-1
+    andi a4, a1, SZREG-1
+    bne a3, a4, 4f

-    addi t0, a1, SZREG-1
-    andi t1, a3, ~(SZREG-1)
-    andi t0, t0, ~(SZREG-1)
+    beqz a3, 2f  /* Skip if already aligned */
     /*
-     * a3: terminal address of source region
-     * t0: lowest XLEN-aligned address in source
-     * t1: highest XLEN-aligned address in source
+     * Round to nearest double word-aligned address
+     * greater than or equal to start address
      */
-    bgeu t0, t1, 2f
-    bltu a1, t0, 4f
+    andi a3, a1, ~(SZREG-1)
+    addi a3, a3, SZREG
+    /* Handle initial misalignment */
+    sub a4, a3, a1
 1:
-    fixup REG_L, t2, (a1), 10f
-    fixup REG_S, t2, (a0), 10f
-    addi a1, a1, SZREG
-    addi a0, a0, SZREG
-    bltu a1, t1, 1b
-2:
-    bltu a1, a3, 5f
+    lb a5, 0(a1)
+    addi a1, a1, 1
+    sb a5, 0(t5)
+    addi t5, t5, 1
+    bltu a1, a3, 1b
+    sub a2, a2, a4  /* Update count */

+2:
+    andi a4, a2, ~((8*SZREG)-1)
+    beqz a4, 4f
+    add a3, a1, a4
 3:
+    fixup REG_L a4,       0(a1), 10f
+    fixup REG_L a5,   SZREG(a1), 10f
+    fixup REG_L a6, 2*SZREG(a1), 10f
+    fixup REG_L a7, 3*SZREG(a1), 10f
+    fixup REG_L t0, 4*SZREG(a1), 10f
+    fixup REG_L t1, 5*SZREG(a1), 10f
+    fixup REG_L t2, 6*SZREG(a1), 10f
+    fixup REG_L t3, 7*SZREG(a1), 10f
+    fixup REG_S a4,       0(t5), 10f
+    fixup REG_S a5,   SZREG(t5), 10f
+    fixup REG_S a6, 2*SZREG(t5), 10f
+    fixup REG_S a7, 3*SZREG(t5), 10f
+    fixup REG_S t0, 4*SZREG(t5), 10f
+    fixup REG_S t1, 5*SZREG(t5), 10f
+    fixup REG_S t2, 6*SZREG(t5), 10f
+    fixup REG_S t3, 7*SZREG(t5), 10f
+    addi a1, a1, 8*SZREG
+    addi t5, t5, 8*SZREG
+    bltu a1, a3, 3b
+    andi a2, a2, (8*SZREG)-1  /* Update count */
+
+4:
+    /* Handle trailing misalignment */
+    beqz a2, 6f
+    add a3, a1, a2
+
+    /* Use word-oriented copy if co-aligned to word boundary */
+    or a5, a1, t5
+    or a5, a5, a3
+    andi a5, a5, 3
+    bnez a5, 5f
+7:
+    fixup lw a4, 0(a1), 10f
+    addi a1, a1, 4
+    fixup sw a4, 0(t5), 10f
+    addi t5, t5, 4
+    bltu a1, a3, 7b
+
+    j 6f
+
+5:
+    fixup lb a4, 0(a1), 10f
+    addi a1, a1, 1
+    fixup sb a4, 0(t5), 10f
+    addi t5, t5, 1
+    bltu a1, a3, 5b
+
+6:
     /* Disable access to user memory */
     csrc CSR_STATUS, t6
     li a0, 0
     ret
-4: /* Edge case: unalignment */
-    fixup lbu, t2, (a1), 10f
-    fixup sb, t2, (a0), 10f
-    addi a1, a1, 1
-    addi a0, a0, 1
-    bltu a1, t0, 4b
-    j 1b
-5: /* Edge case: remainder */
-    fixup lbu, t2, (a1), 10f
-    fixup sb, t2, (a0), 10f
-    addi a1, a1, 1
-    addi a0, a0, 1
-    bltu a1, a3, 5b
-    j 3b
 ENDPROC(__asm_copy_to_user)
 ENDPROC(__asm_copy_from_user)
 EXPORT_SYMBOL(__asm_copy_to_user)
-- 
2.17.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ