diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 7ff00ea64e4f..e42bf35b9b62 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -39,23 +39,19 @@ SYM_FUNC_START(__memmove)
 	cmp %rdi, %r8
 	jg 2f
 
-	/* FSRM implies ERMS => no length checks, do the copy directly */
+	/*
+	 * Three rep-string alternatives:
+	 *  - go to "movsq" for large regions where source and dest are
+	 *    mutually aligned (same in low 8 bits). "label 4"
+	 *  - plain rep-movsb for FSRM
+	 *  - rep-movs for > 32 byte for ERMS.
+	 */
 .Lmemmove_begin_forward:
-	ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
-	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+	ALTERNATIVE_2 \
+		"cmp $680, %rdx ; jb 3f ; cmpb %dil, %sil; je 4f", \
+		"movq %rdx, %rcx ; rep movsb; retq", X86_FEATURE_FSRM, \
+		"cmp $0x20, %rdx; jb 1f; movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
 
-	/*
-	 * movsq instruction have many startup latency
-	 * so we handle small size by general register.
-	 */
-	cmp $680, %rdx
-	jb 3f
-	/*
-	 * movsq instruction is only good for aligned case.
-	 */
-
-	cmpb %dil, %sil
-	je 4f
 3:
 	sub $0x20, %rdx
 	/*