Message-ID: <3e191e87-5b7f-49e9-b794-eb244d478c56@suse.com>
Date: Mon, 24 Nov 2025 15:24:40 +0200
From: Nikolay Borisov <nik.borisov@...e.com>
To: Borislav Petkov <bp@...en8.de>, Kuan-Wei Chiu <visitorckw@...il.com>
Cc: linux-edac@...r.kernel.org, linux-kernel@...r.kernel.org,
Yazen.Ghannam@....com
Subject: Re: [PATCH v2] RAS/AMD/ATL: Remove bitwise_xor_bits
On 11/24/25 14:52, Borislav Petkov wrote:
> On Mon, Nov 24, 2025 at 08:03:02PM +0800, Kuan-Wei Chiu wrote:
>> On Mon, Nov 24, 2025 at 12:05:26PM +0100, Borislav Petkov wrote:
>>> On Mon, Nov 24, 2025 at 04:57:51PM +0800, Kuan-Wei Chiu wrote:
>>>>> Both LLVM/GCC support a __builtin_parity function which is functionally
>>>>> equivalent to the custom bitwise_xor_bits() one. Let's simplify the code by
>>>>> relying on the built-in. No functional changes.
>>>>
>>>> IIRC in some cases,
>>>
>>> Which are those cases?
>>>
>>> Do you have a trigger scenario?
>>>
>> I did a quick search, and I believe it was this kernel test robot
>> report [1] that reminded me of this compiler behavior.
>>
>> [1]: https://lore.kernel.org/oe-kbuild-all/202501312159.l6jNRaYy-lkp@intel.com/
>
> Interesting, thanks for the pointer.
>
> @Nik, just use hweight16() but pls do check what asm the compiler generates
> before and after so that at least there's some palpable improvement or gcc is
> smart enough to replace the unrolled XORing with something slick.
>
> Also put in the commit message why we're not using the builtin parity thing
> and quote the link above.
>
> Thx.
>
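For context, the equivalence being discussed: the open-coded XOR loop, __builtin_parity() and the low bit of the population count all compute the parity of a value. A minimal userspace illustration (not from the patch; AFAIU the builtin is avoided because the compiler may lower it to a libgcc helper such as __paritysi2, which the kernel does not link against - presumably what the kbuild report above shows):

#include <stdio.h>

static unsigned int parity_loop(unsigned short val)
{
	unsigned int tmp = 0, i;

	for (i = 0; i < 16; i++)
		tmp ^= (val >> i) & 0x1;	/* XOR of all bits */

	return tmp;
}

int main(void)
{
	unsigned short v = 0xbeef;

	/* All three print the same bit. */
	printf("%u %u %u\n",
	       parity_loop(v),
	       (unsigned int)__builtin_parity(v),
	       (unsigned int)__builtin_popcount(v) & 1);

	return 0;
}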
So the custom function results in the following asm being inlined, i.e. 4 copies in total:
# drivers/ras/amd/atl/umc.c:59: tmp ^= (val >> i) & 0x1;
xorl %ecx, %ecx # ivtmp.157
# drivers/ras/amd/atl/umc.c:55: u16 tmp = 0;
xorl %esi, %esi # tmp
# drivers/ras/amd/atl/umc.c:253: temp = bitwise_xor_bits(col & addr_hash.bank[i].col_xor);
andl %r13d, %edx # col, tmp249
# drivers/ras/amd/atl/umc.c:59: tmp ^= (val >> i) & 0x1;
movzwl %dx, %edx # tmp249, _387
.L3:
movl %edx, %eax # _387, tmp250
sarl %cl, %eax # ivtmp.157, tmp250
# drivers/ras/amd/atl/umc.c:58: for (i = 0; i < 16; i++)
addl $1, %ecx #, ivtmp.157
# drivers/ras/amd/atl/umc.c:59: tmp ^= (val >> i) & 0x1;
andl $1, %eax #, tmp251
# drivers/ras/amd/atl/umc.c:59: tmp ^= (val >> i) & 0x1;
xorl %eax, %esi # tmp251, tmp
# drivers/ras/amd/atl/umc.c:58: for (i = 0; i < 16; i++)
cmpl $16, %ecx #, ivtmp.157
jne .L3 #,
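(For reference, here's the helper those annotations correspond to, reconstructed from the quoted source lines in drivers/ras/amd/atl/umc.c - a sketch; the signature beyond the "u16 tmp" shown at line 55 is my guess:)

static u16 bitwise_xor_bits(u16 val)
{
	u16 tmp = 0;
	u8 i;

	/* XOR all 16 bits together, i.e. compute the parity of val. */
	for (i = 0; i < 16; i++)
		tmp ^= (val >> i) & 0x1;

	return tmp;
}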
Whilst with hweight16(), if the CPU has POPCNT, it boils down to 2 instructions - a popcnt and an andl:
# drivers/ras/amd/atl/umc.c:243: temp ^= hweight16(row & addr_hash.bank[i].row_xor) & 1;
andl %ebp, %edi # _321, tmp221
# ./arch/x86/include/asm/arch_hweight.h:19: asm_inline (ALTERNATIVE("call __sw_hweight32",
#APP
# 19 "./arch/x86/include/asm/arch_hweight.h" 1
# ALT: oldinstr
771:
call __sw_hweight32
772:
# ALT: padding
.skip -(((775f-774f)-(772b-771b)) > 0) * ((775f-774f)-(772b-771b)),0x90
773:
.pushsection .altinstructions,"a"
.long 771b - .
.long 774f - .
.4byte ( 4*32+23)
.byte 773b-771b
.byte 775f-774f
.popsection
.pushsection .altinstr_replacement, "ax"
# ALT: replacement
774:
popcntl %edi, %eax # tmp221, res
775:
.popsection
# 0 "" 2
# drivers/ras/amd/atl/umc.c:243: temp ^= hweight16(row & addr_hash.bank[i].row_xor) & 1;
#NO_APP
xorl %eax, %edx # res, tmp222
andl $1, %edx #, temp
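(The replacement pattern at the source level, per the annotation above - parity is just the low bit of the population count:)

	temp ^= hweight16(row & addr_hash.bank[i].row_xor) & 1;
	/* ... and the same hweight16() & 1 form for the col_xor hash. */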
So yeah, much better IMHO, unless there is some hidden latency in the popcnt...