Message-Id: <20251202074121.81364-2-maohan4761@gmail.com>
Date: Tue, 2 Dec 2025 15:41:21 +0800
From: maohan4761@...il.com
To: pjw@...nel.org,
palmer@...belt.com
Cc: guoren@...nel.org,
linux-riscv@...ts.infradead.org,
linux-kernel@...r.kernel.org,
Mao Han <han_mao@...ux.alibaba.com>
Subject: [PATCH 1/1] riscv: Optimize user copy with efficient unaligned access support
From: Mao Han <han_mao@...ux.alibaba.com>
Introduce an optimized path in fallback_scalar_usercopy_sum_enabled for
systems that support efficient unaligned memory accesses (i.e., when
CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS is enabled).
This eliminates the overhead of bit-shifting and OR-ing partial words to
reconstruct misaligned values, which the existing path needs in order to
handle potentially misaligned buffers. Medium-sized buffers between 8
bytes and 9*SZREG also see a noticeable improvement, as the original
path falls back to byte-by-byte copying for them.
Signed-off-by: Mao Han <han_mao@...ux.alibaba.com>
---
arch/riscv/lib/uaccess.S | 113 +++++++++++++++++++++++++++++++++++++++
1 file changed, 113 insertions(+)
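
For readability, here is a plain C sketch of the control flow of the new
fast path. It is illustrative only and not part of the patch:
copy_sketch() is a placeholder name, each memcpy() of a chunk stands in
for the corresponding unrolled REG_L/REG_S pairs (relying on cheap
unaligned accesses), and the uaccess fixup/fault handling is omitted.

#include <stddef.h>
#include <string.h>

static void copy_sketch(unsigned char *dst, const unsigned char *src,
                        size_t len)
{
        const size_t szreg = sizeof(unsigned long); /* SZREG: 8 on rv64, 4 on rv32 */
        unsigned char *end = dst + len;

        if (len < 8)                            /* too small: byte copy only */
                goto byte_tail;

        if (len >= 128) {
                do {                            /* .L_loop_16x_reg */
                        memcpy(dst, src, 16 * szreg);
                        dst += 16 * szreg;
                        src += 16 * szreg;
                } while ((size_t)(end - dst) >= 16 * szreg);
        }

        while ((size_t)(end - dst) >= 4 * szreg) {      /* .L_loop_4x_reg */
                memcpy(dst, src, 4 * szreg);
                dst += 4 * szreg;
                src += 4 * szreg;
        }

        while ((size_t)(end - dst) >= szreg) {  /* .L_loop_reg */
                memcpy(dst, src, szreg);
                dst += szreg;
                src += szreg;
        }

        if (szreg == 8 && (size_t)(end - dst) >= 4) {   /* .Lbyte_copy_word */
                memcpy(dst, src, 4);
                dst += 4;
                src += 4;
        }

byte_tail:
        while (dst < end)                       /* .Lbyte_copy_tail */
                *dst++ = *src++;
}

The 16x-unrolled loop is entered only for lengths of at least 128 bytes
and keeps iterating while a full 16*SZREG block remains; the code then
drops through progressively smaller chunk sizes before finishing with
the shared byte-copy tail.
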
diff --git a/arch/riscv/lib/uaccess.S b/arch/riscv/lib/uaccess.S
index 4efea1b..bf124a1 100644
--- a/arch/riscv/lib/uaccess.S
+++ b/arch/riscv/lib/uaccess.S
@@ -54,6 +54,118 @@ EXPORT_SYMBOL(__asm_copy_from_user_sum_enabled)
EXPORT_SYMBOL(__asm_copy_to_user_sum_enabled)
SYM_FUNC_START(fallback_scalar_usercopy_sum_enabled)
+ /*
+ * Save the terminal address which will be used to compute the number
+ * of bytes copied in case of a fixup exception.
+ */
+ add t5, a0, a2
+
+ /*
+ * Register allocation for code below:
+ * a0 - start of uncopied dst
+ * a1 - start of uncopied src
+ * a2 - size
+ * t0 - end of uncopied dst
+ */
+ add t0, a0, a2
+#ifdef CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS
+ /* If length < 8, go to byte copy */
+ li a3, 8
+ bltu a2, a3, .Lbyte_copy_tail
+
+ /* Only use the 16*SZREG unrolled loop if length >= 128 */
+ li t1, 128
+ bltu a2, t1, .L_len_less_16x_szreg
+
+.L_loop_16x_reg:
+ fixup REG_L a4, 0(a1), 10f
+ fixup REG_L a5, SZREG(a1), 10f
+ fixup REG_L a6, 2*SZREG(a1), 10f
+ fixup REG_L a7, 3*SZREG(a1), 10f
+ fixup REG_S a4, 0(a0), 10f
+ fixup REG_S a5, SZREG(a0), 10f
+ fixup REG_S a6, 2*SZREG(a0), 10f
+ fixup REG_S a7, 3*SZREG(a0), 10f
+
+ fixup REG_L t1, 4*SZREG(a1), 10f
+ fixup REG_L t2, 5*SZREG(a1), 10f
+ fixup REG_L t3, 6*SZREG(a1), 10f
+ fixup REG_L t4, 7*SZREG(a1), 10f
+ fixup REG_S t1, 4*SZREG(a0), 10f
+ fixup REG_S t2, 5*SZREG(a0), 10f
+ fixup REG_S t3, 6*SZREG(a0), 10f
+ fixup REG_S t4, 7*SZREG(a0), 10f
+
+ fixup REG_L a4, 8*SZREG(a1), 10f
+ fixup REG_L a5, 9*SZREG(a1), 10f
+ fixup REG_L a6, 10*SZREG(a1), 10f
+ fixup REG_L a7, 11*SZREG(a1), 10f
+ fixup REG_S a4, 8*SZREG(a0), 10f
+ fixup REG_S a5, 9*SZREG(a0), 10f
+ fixup REG_S a6, 10*SZREG(a0), 10f
+ fixup REG_S a7, 11*SZREG(a0), 10f
+
+ fixup REG_L t1, 12*SZREG(a1), 10f
+ fixup REG_L t2, 13*SZREG(a1), 10f
+ fixup REG_L t3, 14*SZREG(a1), 10f
+ fixup REG_L t4, 15*SZREG(a1), 10f
+ fixup REG_S t1, 12*SZREG(a0), 10f
+ fixup REG_S t2, 13*SZREG(a0), 10f
+ fixup REG_S t3, 14*SZREG(a0), 10f
+ fixup REG_S t4, 15*SZREG(a0), 10f
+
+ addi a1, a1, 16*SZREG
+ addi a0, a0, 16*SZREG
+
+ addi t1, a0, 16*SZREG
+ bleu t1, t0, .L_loop_16x_reg
+
+.L_len_less_16x_szreg:
+ /* Pre-check: ensure at least one 4*SZREG block copy is possible */
+ addi t1, a0, 4*SZREG
+ bgtu t1, t0, .L_len_less_4x_szreg
+
+.L_loop_4x_reg:
+ fixup REG_L a4, 0(a1), 10f
+ fixup REG_L a5, SZREG(a1), 10f
+ fixup REG_L a6, 2*SZREG(a1), 10f
+ fixup REG_L a7, 3*SZREG(a1), 10f
+ fixup REG_S a4, 0(a0), 10f
+ fixup REG_S a5, SZREG(a0), 10f
+ fixup REG_S a6, 2*SZREG(a0), 10f
+ fixup REG_S a7, 3*SZREG(a0), 10f
+ addi a1, a1, 4*SZREG
+ addi a0, a0, 4*SZREG
+
+ /* Check if another 4*SZREG block copy is safe */
+ addi t1, a0, 4*SZREG
+ bleu t1, t0, .L_loop_4x_reg
+
+.L_len_less_4x_szreg:
+ /* Pre-check: ensure at least one register copy is possible */
+ addi t1, a0, SZREG
+ bgtu t1, t0, .Lbyte_copy_word
+
+.L_loop_reg:
+ fixup REG_L a4, 0(a1), 10f
+ addi a1, a1, SZREG
+ fixup REG_S a4, 0(a0), 10f
+ addi a0, a0, SZREG
+
+ /* Check if another register copy is safe */
+ addi t1, a0, SZREG
+ bleu t1, t0, .L_loop_reg
+.Lbyte_copy_word:
+#if __riscv_xlen == 64
+ addi t1, a0, 4
+ bgtu t1, t0, .Lbyte_copy_tail
+
+ fixup lw a4, 0(a1), 10f
+ addi a1, a1, 4
+ fixup sw a4, 0(a0), 10f
+ addi a0, a0, 4
+#endif
+#else
/*
* Save the terminal address which will be used to compute the number
* of bytes copied in case of a fixup exception.
@@ -190,6 +302,7 @@ SYM_FUNC_START(fallback_scalar_usercopy_sum_enabled)
/* Revert src to original unaligned value */
add a1, a1, a3
+#endif /* CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS */
.Lbyte_copy_tail:
/*
* Byte copy anything left.
--
2.25.1