Message-ID: <20260129010211.103615-1-teknoraver@meta.com>
Date: Thu, 29 Jan 2026 02:02:11 +0100
From: Matteo Croce <technoboy85@...il.com>
To: linux-riscv@...ts.infradead.org,
Paul Walmsley <pjw@...nel.org>,
Palmer Dabbelt <palmer@...belt.com>,
Albert Ou <aou@...s.berkeley.edu>
Cc: linux-kernel@...r.kernel.org
Subject: [PATCH] riscv: memcpy: fast copy for unaligned buffers
The RISC-V memcpy() does an 8 byte wide copy when the two buffers have
the same alignment, and falls back to a single byte copy otherwise.

Implement an unalignment-aware copy for buffers with different
alignments, which still copies 8 bytes at a time by doing the proper
shifting.

Benchmarks show that the aligned code path is unaffected, while the
unaligned one gets a ~2.3x boost.
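For clarity, here is a rough standalone C model of the shifted copy
(not part of the patch; the identifiers are made up, the aligned fast
path is omitted, and the bounds handling is more conservative than the
assembly, which rounds the source pointer down and tolerates the
resulting aligned over-read):

/*
 * Rough C model of the unaligned path (little-endian, 64-bit words).
 * Illustrative only: the real implementation is the assembly below.
 */
#include <stdint.h>
#include <stddef.h>
#include <string.h>

static void *shifted_memcpy(void *dst, const void *src, size_t n)
{
	unsigned char *d = dst;
	const unsigned char *s = src;

	/* Byte-copy until the destination is 8-byte aligned. */
	while (n && ((uintptr_t)d & 7)) {
		*d++ = *s++;
		n--;
	}

	size_t dist = (uintptr_t)s & 7;	/* remaining source misalignment */

	if (dist && n >= 16) {
		unsigned int rsh = 8 * dist;	/* bits taken from word i     */
		unsigned int lsh = 64 - rsh;	/* bits taken from word i + 1 */
		const uint64_t *sw = (const uint64_t *)(s + (8 - dist));
		uint64_t cur, next, out;

		/*
		 * Seed "cur" so that (cur >> rsh) yields s[0..7-dist], the
		 * bytes the aligned word containing s would contribute.
		 * (The asm instead aligns the source pointer backwards and
		 * loads that word directly.)
		 */
		memcpy(&cur, s, 8);
		cur <<= rsh;

		while (n >= 16) {	/* keeps the lookahead load in bounds */
			next = *sw++;	/* aligned load of the next word      */
			out = (cur >> rsh) | (next << lsh);
			memcpy(d, &out, 8);	/* d is 8-byte aligned here   */
			cur = next;
			d += 8;
			s += 8;
			n -= 8;
		}
	}

	/* Byte-copy whatever is left. */
	while (n--)
		*d++ = *s++;

	return dst;
}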
Benchmark with the current implementation:
memcpy: aligned copy of 400 MBytes in 429 msecs (931 MB/s)
memcpy: unaligned copy of 400 MBytes in 1202 msecs (332 MB/s)
Benchmark with the new unaligned copy:
memcpy: aligned copy of 400 MBytes in 428 msecs (933 MB/s)
memcpy: unaligned copy of 400 MBytes in 519 msecs (770 MB/s)
These numbers were measured on a 1.8 GHz SiFive P550 CPU
with this custom unit test:
https://lore.kernel.org/lkml/20260129004328.102770-1-teknoraver@meta.com/T/
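As a sanity check on the figures above: 1202 msecs / 519 msecs ~= 2.32,
which is where the ~2.3x claim comes from, and 400 MBytes / 0.519 s
~= 770 MB/s.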
Signed-off-by: Matteo Croce <teknoraver@...a.com>
---
arch/riscv/lib/memcpy.S | 84 ++++++++++++++++++++++++++++++++++++++---
1 file changed, 79 insertions(+), 5 deletions(-)
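
Not part of the patch: for reviewers without the unit test handy, a very
rough userspace approximation of the benchmark loop is sketched below.
It exercises the libc memcpy(), not this routine, so it only illustrates
the shape of the measurement; buffer size, iteration count and the
1-byte offset are arbitrary choices.

/* Rough userspace approximation of the misaligned-copy benchmark.
 * The numbers in the commit message come from the kernel unit test
 * linked above, not from this program.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define BUF_SIZE	(4UL * 1024 * 1024)	/* 4 MB per copy   */
#define ITERATIONS	100			/* 400 MB in total */

static double now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000.0 + ts.tv_nsec / 1e6;
}

int main(void)
{
	/* +8 leaves room for the deliberate 1-byte misalignment. */
	unsigned char *src = malloc(BUF_SIZE + 8);
	unsigned char *dst = malloc(BUF_SIZE + 8);
	double t0, t1;
	int i;

	if (!src || !dst)
		return 1;
	memset(src, 0x5a, BUF_SIZE + 8);

	/* Aligned copy: both pointers as returned by malloc(). */
	t0 = now_ms();
	for (i = 0; i < ITERATIONS; i++)
		memcpy(dst, src, BUF_SIZE);
	t1 = now_ms();
	printf("aligned:   %.0f ms\n", t1 - t0);

	/* Unaligned copy: shift the source by one byte. */
	t0 = now_ms();
	for (i = 0; i < ITERATIONS; i++)
		memcpy(dst, src + 1, BUF_SIZE);
	t1 = now_ms();
	printf("unaligned: %.0f ms\n", t1 - t0);

	free(src);
	free(dst);
	return 0;
}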
diff --git a/arch/riscv/lib/memcpy.S b/arch/riscv/lib/memcpy.S
index 44e009ec5fef..293f8a348cfd 100644
--- a/arch/riscv/lib/memcpy.S
+++ b/arch/riscv/lib/memcpy.S
@@ -10,13 +10,14 @@
SYM_FUNC_START(__memcpy)
move t6, a0 /* Preserve return value */
- /* Defer to byte-oriented copy for small sizes */
- sltiu a3, a2, 128
- bnez a3, 4f
- /* Use word-oriented copy only if low-order bits match */
+ /* Check alignment first */
andi a3, t6, SZREG-1
andi a4, a1, SZREG-1
- bne a3, a4, 4f
+ bne a3, a4, .Lshifted_copy
+
+ /* Aligned path: defer to byte-oriented copy for small sizes */
+ sltiu a5, a2, 128
+ bnez a5, 4f
beqz a3, 2f /* Skip if already aligned */
/*
@@ -76,6 +77,79 @@ SYM_FUNC_START(__memcpy)
addi t6, t6, 16*SZREG
bltu a1, a3, 3b
andi a2, a2, (16*SZREG)-1 /* Update count */
+ j 4f /* Skip shifted copy section */
+
+.Lshifted_copy:
+ /*
+ * Source and dest have different alignments.
+ * a3 = dest & (SZREG-1), a4 = src & (SZREG-1)
+ * Align destination first, then use shifted word copy.
+ */
+
+ /* For small sizes, just use byte copy */
+ sltiu a5, a2, 16
+ bnez a5, 4f
+
+ /* If dest is already aligned, skip to shifted loop setup */
+ beqz a3, .Ldest_aligned
+
+ /* Calculate bytes needed to align dest: SZREG - a3 */
+ neg a5, a3
+ addi a5, a5, SZREG
+ sub a2, a2, a5 /* Update count */
+
+.Lalign_dest_loop:
+ lb a4, 0(a1)
+ addi a1, a1, 1
+ sb a4, 0(t6)
+ addi t6, t6, 1
+ addi a5, a5, -1
+ bnez a5, .Lalign_dest_loop
+
+.Ldest_aligned:
+ /*
+ * Dest is now aligned. Check if we have enough bytes
+ * remaining for word-oriented copy.
+ */
+ sltiu a3, a2, SZREG
+ bnez a3, 4f
+
+ /*
+ * Calculate shift amounts based on source alignment (distance).
+ * distance = src & (SZREG-1), guaranteed non-zero since we only
+ * reach here when src and dest had different alignments.
+ */
+ andi a3, a1, SZREG-1 /* a3 = distance */
+ slli a4, a3, 3 /* a4 = distance * 8 (right shift amount) */
+ li a5, SZREG*8
+ sub a5, a5, a4 /* a5 = SZREG*8 - distance*8 (left shift) */
+
+ /* Align src backwards to word boundary */
+ sub a1, a1, a3
+
+ /* Calculate end address: dest + (count rounded down to words) */
+ andi a6, a2, ~(SZREG-1)
+ add a6, t6, a6 /* a6 = loop end address for dest */
+
+ /* Load first aligned word from source */
+ REG_L t0, 0(a1)
+
+.Lshifted_loop:
+ REG_L t1, SZREG(a1) /* Load next aligned word */
+ srl t2, t0, a4 /* Shift right: low part from current word */
+ mv t0, t1 /* Current = next for next iteration */
+ addi a1, a1, SZREG
+ addi t6, t6, SZREG
+ sll t3, t0, a5 /* Shift left: high part from next word */
+ or t2, t2, t3 /* Combine to form output word */
+ REG_S t2, -SZREG(t6) /* Store to aligned dest */
+ bltu t6, a6, .Lshifted_loop
+
+ /* Restore src to correct unaligned position */
+ add a1, a1, a3
+ /* Calculate remaining byte count */
+ andi a2, a2, SZREG-1
+ /* Fall through to label 4 for remaining bytes */
4:
/* Handle trailing misalignment */
--
2.52.0