lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20190914113717.GA28054@zn.tnic>
Date:   Sat, 14 Sep 2019 13:37:17 +0200
From:   Borislav Petkov <bp@...en8.de>
To:     Alexey Dobriyan <adobriyan@...il.com>
Cc:     tglx@...utronix.de, mingo@...hat.com, hpa@...or.com,
        linux-kernel@...r.kernel.org, x86@...r.kernel.org,
        linux@...musvillemoes.dk, torvalds@...ux-foundation.org
Subject: Re: [PATCH] x86_64: new and improved memset()

On Sat, Sep 14, 2019 at 01:33:45PM +0300, Alexey Dobriyan wrote:
> --- a/arch/x86/include/asm/string_64.h
> +++ b/arch/x86/include/asm/string_64.h
> @@ -15,7 +15,111 @@ extern void *memcpy(void *to, const void *from, size_t len);
>  extern void *__memcpy(void *to, const void *from, size_t len);
>  
>  #define __HAVE_ARCH_MEMSET
> +#if defined(_ARCH_X86_BOOT) || defined(CONFIG_FORTIFY_SOURCE)
>  void *memset(void *s, int c, size_t n);
> +#else
> +#include <asm/alternative.h>
> +#include <asm/cpufeatures.h>
> +
> +/* Internal, do not use. */
> +static __always_inline void memset0(void *s, size_t n)
> +{
> +	/* Internal, do not use. */
> +	void _memset0_mov(void);
> +	void _memset0_rep_stosq(void);
> +	void memset0_mov(void);
> +	void memset0_rep_stosq(void);
> +	void memset0_rep_stosb(void);
> +
> +	if (__builtin_constant_p(n) && n == 0) {
> +	} else if (__builtin_constant_p(n) && n == 1) {
> +		*(uint8_t *)s = 0;
> +	} else if (__builtin_constant_p(n) && n == 2) {
> +		*(uint16_t *)s = 0;
> +	} else if (__builtin_constant_p(n) && n == 4) {
> +		*(uint32_t *)s = 0;
> +	} else if (__builtin_constant_p(n) && n == 6) {
> +		*(uint32_t *)s = 0;
> +		*(uint16_t *)(s + 4) = 0;
> +	} else if (__builtin_constant_p(n) && n == 8) {
> +		*(uint64_t *)s = 0;
> +	} else if (__builtin_constant_p(n) && (n & 7) == 0) {
> +		alternative_call_2(
> +			_memset0_mov,
> +			_memset0_rep_stosq, X86_FEATURE_REP_GOOD,
> +			memset0_rep_stosb, X86_FEATURE_ERMS,
> +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> +			"D" (s), "c" (n)
> +			: "rax", "cc", "memory"
> +		);
> +	} else {
> +		alternative_call_2(
> +			memset0_mov,
> +			memset0_rep_stosq, X86_FEATURE_REP_GOOD,
> +			memset0_rep_stosb, X86_FEATURE_ERMS,
> +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> +			"D" (s), "c" (n)
> +			: "rax", "rsi", "cc", "memory"
> +		);
> +	}
> +}
> +
> +/* Internal, do not use. */
> +static __always_inline void memsetx(void *s, int c, size_t n)
> +{
> +	/* Internal, do not use. */
> +	void _memsetx_mov(void);
> +	void _memsetx_rep_stosq(void);
> +	void memsetx_mov(void);
> +	void memsetx_rep_stosq(void);
> +	void memsetx_rep_stosb(void);
> +
> +	const uint64_t ccc = (uint8_t)c * 0x0101010101010101ULL;
> +
> +	if (__builtin_constant_p(n) && n == 0) {
> +	} else if (__builtin_constant_p(n) && n == 1) {
> +		*(uint8_t *)s = ccc;
> +	} else if (__builtin_constant_p(n) && n == 2) {
> +		*(uint16_t *)s = ccc;
> +	} else if (__builtin_constant_p(n) && n == 4) {
> +		*(uint32_t *)s = ccc;
> +	} else if (__builtin_constant_p(n) && n == 8) {
> +		*(uint64_t *)s = ccc;
> +	} else if (__builtin_constant_p(n) && (n & 7) == 0) {
> +		alternative_call_2(
> +			_memsetx_mov,
> +			_memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
> +			memsetx_rep_stosb, X86_FEATURE_ERMS,
> +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> +			"D" (s), "c" (n), "a" (ccc)
> +			: "cc", "memory"
> +		);
> +	} else {
> +		alternative_call_2(
> +			memsetx_mov,
> +			memsetx_rep_stosq, X86_FEATURE_REP_GOOD,
> +			memsetx_rep_stosb, X86_FEATURE_ERMS,
> +			ASM_OUTPUT2("=D" (s), "=c" (n)),
> +			"D" (s), "c" (n), "a" (ccc)
> +			: "rsi", "cc", "memory"
> +		);
> +	}
> +}
> +
> +static __always_inline void *memset(void *s, int c, size_t n)
> +{
> +	if (__builtin_constant_p(c)) {
> +		if (c == 0) {
> +			memset0(s, n);
> +		} else {
> +			memsetx(s, c, n);
> +		}
> +		return s;
> +	} else {
> +		return __builtin_memset(s, c, n);
> +	}
> +}

I'm willing to take something like that only when such complexity is
justified by numbers. I.e., I'm much more inclined to cap it under
32- and 64-byte sizes and keep it simple.

...

> +ENTRY(_memset0_mov)
> +	xor	eax, eax
> +.globl _memsetx_mov
> +_memsetx_mov:
> +	add	rcx, rdi
> +	cmp	rdi, rcx
> +	je	1f
> +2:
> +	mov	[rdi], rax
> +	add	rdi, 8
> +	cmp	rdi, rcx
> +	jne	2b
> +1:
> +	ret
> +ENDPROC(_memset0_mov)
> +ENDPROC(_memsetx_mov)
> +EXPORT_SYMBOL(_memset0_mov)
> +EXPORT_SYMBOL(_memsetx_mov)
> +
> +ENTRY(memset0_mov)
> +	xor	eax, eax
> +.globl memsetx_mov
> +memsetx_mov:
> +	lea	rsi, [rdi + rcx]
> +	cmp	rdi, rsi
> +	je	1f
> +2:
> +	mov	[rdi], al
> +	add	rdi, 1
> +	cmp	rdi, rsi
> +	jne	2b
> +1:
> +	ret

Say what now? Intel syntax? You must be joking...

> +ENDPROC(memset0_mov)
> +ENDPROC(memsetx_mov)
> +EXPORT_SYMBOL(memset0_mov)
> +EXPORT_SYMBOL(memsetx_mov)

Too many exported symbols. Again, I'd much prefer a cleaner, smaller
solution over one where readability suffers greatly for the sake of
*maybe* getting slightly better performance.

> --- a/drivers/firmware/efi/libstub/Makefile
> +++ b/drivers/firmware/efi/libstub/Makefile
> @@ -28,7 +28,7 @@ KBUILD_CFLAGS			:= $(cflags-y) -DDISABLE_BRANCH_PROFILING \
>  				   -D__NO_FORTIFY \
>  				   $(call cc-option,-ffreestanding) \
>  				   $(call cc-option,-fno-stack-protector) \
> -				   -D__DISABLE_EXPORTS
> +				   -D__DISABLE_EXPORTS -D_ARCH_X86_BOOT

Yeah, something like that is inevitable, I've come to realize too. ;-\

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ