lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <877dcpq3bx.fsf@mpe.ellerman.id.au>
Date:   Tue, 30 Nov 2021 22:40:18 +1100
From:   Michael Ellerman <mpe@...erman.id.au>
To:     LEROY Christophe <christophe.leroy@...roup.eu>
Cc:     "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        "linuxppc-dev@...ts.ozlabs.org" <linuxppc-dev@...ts.ozlabs.org>,
        Segher Boessenkool <segher@...nel.crashing.org>,
        Paul Mackerras <paulus@...ba.org>,
        Benjamin Herrenschmidt <benh@...nel.crashing.org>
Subject: Re: [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible

LEROY Christophe <christophe.leroy@...roup.eu> writes:
> Hi Michael,
>
> Any chance to get this series merged this cycle ?

Yeah. I hesitated a bit at the changes which make the atomic asm even
more complicated, but I guess it's worth it.

cheers

> Le 21/09/2021 à 17:09, Christophe Leroy a écrit :
>> Today we get the following code generation for bitops like
>> set or clear bit:
>> 
>> 	c0009fe0:	39 40 08 00 	li      r10,2048
>> 	c0009fe4:	7c e0 40 28 	lwarx   r7,0,r8
>> 	c0009fe8:	7c e7 53 78 	or      r7,r7,r10
>> 	c0009fec:	7c e0 41 2d 	stwcx.  r7,0,r8
>> 
>> 	c000d568:	39 00 18 00 	li      r8,6144
>> 	c000d56c:	7c c0 38 28 	lwarx   r6,0,r7
>> 	c000d570:	7c c6 40 78 	andc    r6,r6,r8
>> 	c000d574:	7c c0 39 2d 	stwcx.  r6,0,r7
>> 
>> Most set bits are constant on lower 16 bits, so it can easily
>> be replaced by the "immediate" version of the operation. Allow
>> GCC to choose between the normal or immediate form.
>> 
>> For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' for
>> when all bits to be cleared are consecutive.
>> 
>> On 64 bits we don't have any equivalent single operation for clearing,
>> single bits or a few bits, we'd need two 'rldicl' so it is not
>> worth it, the li/andc sequence is doing the same.
>> 
>> With this patch we get:
>> 
>> 	c0009fe0:	7d 00 50 28 	lwarx   r8,0,r10
>> 	c0009fe4:	61 08 08 00 	ori     r8,r8,2048
>> 	c0009fe8:	7d 00 51 2d 	stwcx.  r8,0,r10
>> 
>> 	c000d558:	7c e0 40 28 	lwarx   r7,0,r8
>> 	c000d55c:	54 e7 05 64 	rlwinm  r7,r7,0,21,18
>> 	c000d560:	7c e0 41 2d 	stwcx.  r7,0,r8
>> 
>> On pmac32_defconfig, it reduces the text by approx 10 kbytes.
>> 
>> Signed-off-by: Christophe Leroy <christophe.leroy@...roup.eu>
>> Reviewed-by: Segher Boessenkool <segher@...nel.crashing.org>
>> ---
>> v5: Fixed the argument of is_rlwinm_mask_valid() in test_and_clear_bits()
>> 
>> v4: Rebased
>> 
>> v3:
>> - Using the mask validation proposed by Segher
>> 
>> v2:
>> - Use "n" instead of "i" as constraint for the rlwinm mask
>> - Improve mask verification to handle more than single bit masks
>> 
>> Signed-off-by: Christophe Leroy <christophe.leroy@...roup.eu>
>> ---
>>   arch/powerpc/include/asm/bitops.h | 89 ++++++++++++++++++++++++++++---
>>   1 file changed, 81 insertions(+), 8 deletions(-)
>> 
>> diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
>> index 11847b6a244e..a05d8c62cbea 100644
>> --- a/arch/powerpc/include/asm/bitops.h
>> +++ b/arch/powerpc/include/asm/bitops.h
>> @@ -71,19 +71,61 @@ static inline void fn(unsigned long mask,	\
>>   	__asm__ __volatile__ (			\
>>   	prefix					\
>>   "1:"	PPC_LLARX "%0,0,%3,0\n"			\
>> -	stringify_in_c(op) "%0,%0,%2\n"		\
>> +	#op "%I2 %0,%0,%2\n"			\
>>   	PPC_STLCX "%0,0,%3\n"			\
>>   	"bne- 1b\n"				\
>>   	: "=&r" (old), "+m" (*p)		\
>> -	: "r" (mask), "r" (p)			\
>> +	: "rK" (mask), "r" (p)			\
>>   	: "cc", "memory");			\
>>   }
>>   
>>   DEFINE_BITOP(set_bits, or, "")
>> -DEFINE_BITOP(clear_bits, andc, "")
>> -DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER)
>>   DEFINE_BITOP(change_bits, xor, "")
>>   
>> +static __always_inline bool is_rlwinm_mask_valid(unsigned long x)
>> +{
>> +	if (!x)
>> +		return false;
>> +	if (x & 1)
>> +		x = ~x;	// make the mask non-wrapping
>> +	x += x & -x;	// adding the low set bit results in at most one bit set
>> +
>> +	return !(x & (x - 1));
>> +}
>> +
>> +#define DEFINE_CLROP(fn, prefix)					\
>> +static inline void fn(unsigned long mask, volatile unsigned long *_p)	\
>> +{									\
>> +	unsigned long old;						\
>> +	unsigned long *p = (unsigned long *)_p;				\
>> +									\
>> +	if (IS_ENABLED(CONFIG_PPC32) &&					\
>> +	    __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\
>> +		asm volatile (						\
>> +			prefix						\
>> +		"1:"	"lwarx	%0,0,%3\n"				\
>> +			"rlwinm	%0,%0,0,%2\n"				\
>> +			"stwcx.	%0,0,%3\n"				\
>> +			"bne- 1b\n"					\
>> +			: "=&r" (old), "+m" (*p)			\
>> +			: "n" (~mask), "r" (p)				\
>> +			: "cc", "memory");				\
>> +	} else {							\
>> +		asm volatile (						\
>> +			prefix						\
>> +		"1:"	PPC_LLARX "%0,0,%3,0\n"				\
>> +			"andc %0,%0,%2\n"				\
>> +			PPC_STLCX "%0,0,%3\n"				\
>> +			"bne- 1b\n"					\
>> +			: "=&r" (old), "+m" (*p)			\
>> +			: "r" (mask), "r" (p)				\
>> +			: "cc", "memory");				\
>> +	}								\
>> +}
>> +
>> +DEFINE_CLROP(clear_bits, "")
>> +DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER)
>> +
>>   static inline void arch_set_bit(int nr, volatile unsigned long *addr)
>>   {
>>   	set_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
>> @@ -116,12 +158,12 @@ static inline unsigned long fn(			\
>>   	__asm__ __volatile__ (				\
>>   	prefix						\
>>   "1:"	PPC_LLARX "%0,0,%3,%4\n"			\
>> -	stringify_in_c(op) "%1,%0,%2\n"			\
>> +	#op "%I2 %1,%0,%2\n"				\
>>   	PPC_STLCX "%1,0,%3\n"				\
>>   	"bne- 1b\n"					\
>>   	postfix						\
>>   	: "=&r" (old), "=&r" (t)			\
>> -	: "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0)	\
>> +	: "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0)	\
>>   	: "cc", "memory");				\
>>   	return (old & mask);				\
>>   }
>> @@ -130,11 +172,42 @@ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER,
>>   	      PPC_ATOMIC_EXIT_BARRIER, 0)
>>   DEFINE_TESTOP(test_and_set_bits_lock, or, "",
>>   	      PPC_ACQUIRE_BARRIER, 1)
>> -DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER,
>> -	      PPC_ATOMIC_EXIT_BARRIER, 0)
>>   DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER,
>>   	      PPC_ATOMIC_EXIT_BARRIER, 0)
>>   
>> +static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p)
>> +{
>> +	unsigned long old, t;
>> +	unsigned long *p = (unsigned long *)_p;
>> +
>> +	if (IS_ENABLED(CONFIG_PPC32) &&
>> +	    __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {
>> +		asm volatile (
>> +			PPC_ATOMIC_ENTRY_BARRIER
>> +		"1:"	"lwarx %0,0,%3\n"
>> +			"rlwinm	%1,%0,0,%2\n"
>> +			"stwcx. %1,0,%3\n"
>> +			"bne- 1b\n"
>> +			PPC_ATOMIC_EXIT_BARRIER
>> +			: "=&r" (old), "=&r" (t)
>> +			: "n" (~mask), "r" (p)
>> +			: "cc", "memory");
>> +	} else {
>> +		asm volatile (
>> +			PPC_ATOMIC_ENTRY_BARRIER
>> +		"1:"	PPC_LLARX "%0,0,%3,0\n"
>> +			"andc	%1,%0,%2\n"
>> +			PPC_STLCX "%1,0,%3\n"
>> +			"bne- 1b\n"
>> +			PPC_ATOMIC_EXIT_BARRIER
>> +			: "=&r" (old), "=&r" (t)
>> +			: "r" (mask), "r" (p)
>> +			: "cc", "memory");
>> +	}
>> +
>> +	return (old & mask);
>> +}
>> +
>>   static inline int arch_test_and_set_bit(unsigned long nr,
>>   					volatile unsigned long *addr)
>>   {
>> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ