diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 7ff00ea64e4f..e42bf35b9b62 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -39,23 +39,19 @@ SYM_FUNC_START(__memmove)
 	cmp %rdi, %r8
 	jg 2f
 
-	/* FSRM implies ERMS => no length checks, do the copy directly */
+	/*
+	 * Three rep-string alternatives:
+	 *  - go to "movsq" for large regions where source and dest are
+	 *    mutually aligned (same in low 8 bits). "label 4"
+	 *  - plain rep-movsb for FSRM
+	 *  - rep-movs for > 32 byte for ERMS.
+	 */
 .Lmemmove_begin_forward:
-	ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
-	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+	ALTERNATIVE_2 \
+		"cmp $680, %rdx ; jb 3f ; cmpb %dil, %sil; je 4f", \
+		"movq %rdx, %rcx ; rep movsb; retq", X86_FEATURE_FSRM, \
+		"cmp $0x20, %rdx; jb 1f; movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
 
-	/*
-	 * movsq instruction have many startup latency
-	 * so we handle small size by general register.
-	 */
-	cmp $680, %rdx
-	jb 3f
-	/*
-	 * movsq instruction is only good for aligned case.
-	 */
-
-	cmpb %dil, %sil
-	je 4f
 3:
 	sub $0x20, %rdx
 	/*