Message-ID: <723120f6-c2e5-4277-bcd7-daf95984877e@linaro.org>
Date: Fri, 30 Aug 2024 14:38:03 -0300
From: Adhemerval Zanella Netto <adhemerval.zanella@...aro.org>
To: Ard Biesheuvel <ardb@...nel.org>
Cc: "Jason A . Donenfeld" <Jason@...c4.com>, Theodore Ts'o <tytso@....edu>,
linux-kernel@...r.kernel.org, linux-crypto@...r.kernel.org,
linux-arm-kernel@...ts.infradead.org, linux-arch@...r.kernel.org,
Catalin Marinas <catalin.marinas@....com>, Will Deacon <will@...nel.org>,
Thomas Gleixner <tglx@...utronix.de>, Eric Biggers <ebiggers@...nel.org>,
Christophe Leroy <christophe.leroy@...roup.eu>
Subject: Re: [PATCH v2] aarch64: vdso: Wire up getrandom() vDSO implementation
On 30/08/24 11:11, Ard Biesheuvel wrote:
> On Thu, 29 Aug 2024 at 22:17, Adhemerval Zanella
> <adhemerval.zanella@...aro.org> wrote:
>>
>> Hook up the generic vDSO implementation to the aarch64 vDSO data page.
>> The required _vdso_rng_data is placed within the _vdso_data vvar
>> page, using an offset larger than vdso_data.
>>
>> The vDSO function requires a ChaCha20 implementation that does not
>> write to the stack, and that can do an entire ChaCha20 permutation.
>> The one provided is based on the current chacha-neon-core.S and uses
>> NEON for the permute operation. The fallback for chips that do not
>> support NEON issues the syscall.
>>
>> This also passes the vdso_test_chacha test along with
>> vdso_test_getrandom. The vdso_test_getrandom bench-single result on
>> Neoverse-N1 shows:
>>
>> vdso: 25000000 times in 0.746506464 seconds
>> libc: 25000000 times in 8.849179444 seconds
>> syscall: 25000000 times in 8.818726425 seconds
>>
>> Changes from v1:
>> - Fixed style issues and typos.
>> - Added fallback for systems without NEON support.
>> - Avoid use of non-volatile vector registers in neon chacha20.
>> - Use c-getrandom-y for vgetrandom.c.
>> - Fixed TIMENS vdso_rng_data access.
>>
>> Signed-off-by: Adhemerval Zanella <adhemerval.zanella@...aro.org>
>> ---
> ...
>> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
>> new file mode 100644
>> index 000000000000..9ebf12a09c65
>> --- /dev/null
>> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
>> @@ -0,0 +1,168 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/cache.h>
>> +#include <asm/assembler.h>
>> +
>> + .text
>> +
>> +#define state0 v0
>> +#define state1 v1
>> +#define state2 v2
>> +#define state3 v3
>> +#define copy0 v4
>> +#define copy1 v5
>> +#define copy2 v6
>> +#define copy3 v7
>> +#define copy3_d d7
>> +#define one_d d16
>> +#define one_q q16
>> +#define tmp v17
>> +#define rot8 v18
>> +
>
> Please make a note somewhere around here that you are deliberately
> avoiding d8-d15 because they are callee-save in user space.
Ack.
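Will add a note along these lines above the defines (sketch, wording is
mine):

	/*
	 * Deliberately avoid v8-v15 (d8-d15): they are callee-saved in the
	 * userspace AAPCS64, and this function must not save/restore them
	 * on the stack.
	 */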
>
>> +/*
>> + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
>> + * number of blocks of output with nonce 0, taking an input key and an
>> + * 8-byte counter. Importantly, it does not spill to the stack.
>> + *
>> + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
>> + * const uint8_t *key,
>> + * uint32_t *counter,
>> + * size_t nblocks)
>> + *
>> + * x0: output bytes
>> + * x1: 32-byte key input
>> + * x2: 8-byte counter input/output
>> + * x3: number of 64-byte blocks to write to output
>> + */
>> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
>> +
>> + /* copy0 = "expand 32-byte k" */
>> + adr_l x8, CTES
>> + ld1 {copy0.4s}, [x8]
>> + /* copy1,copy2 = key */
>> + ld1 { copy1.4s, copy2.4s }, [x1]
>> + /* copy3 = counter || zero nonce */
>> + ldr copy3_d, [x2]
>> +
>> + adr_l x8, ONE
>> + ldr one_q, [x8]
>> +
>> + adr_l x10, ROT8
>> + ld1 {rot8.4s}, [x10]
>
> These immediate loads are forcing the vDSO to have a .rodata section,
> which is best avoided, given that this is mapped into every user space
> program.
>
> Either use the existing mov_q macro and then move the values into SIMD
> registers, or compose the required vectors in a different way.
Ack, mov_q seems to suffice here.
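Something like this, perhaps (untested sketch; the 64-bit immediates are
just the CTES words packed little-endian, and mov_q comes from
<asm/assembler.h>):

	/* copy0 = "expand 32-byte k", without touching .rodata */
	mov_q	x8, 0x3320646e61707865	// "expa" "nd 3"
	mov_q	x9, 0x6b20657479622d32	// "2-by" "te k"
	mov	copy0.d[0], x8
	mov	copy0.d[1], x9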
>
> E.g., with one_v == v16,
>
> movi one_v.2s, #1
> uzp1 one_v.4s, one_v.4s, one_v.4s
>
> puts the correct value in one_d, uses 1 instruction and 16 bytes of
> rodata less, and avoids a memory access.
Ack.
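Right - lane by lane (sketch, with one_v == v16 as in your example):

	movi	one_v.2s, #1			// low half = {1, 1}, upper half zeroed
	uzp1	one_v.4s, one_v.4s, one_v.4s	// even lanes of {1,1,0,0}, twice -> {1, 0, 1, 0}

so one_d ends up holding the 64-bit value 1, which is all the counter
add needs.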
>
> The ROT8 + tbl can be replaced by shl/sri (see below)
>
>> +.Lblock:
>> + /* copy the saved state into the working vectors; the copies feed the final add after the permute. */
>> + mov state0.16b, copy0.16b
>> + mov state1.16b, copy1.16b
>> + mov state2.16b, copy2.16b
>> + mov state3.16b, copy3.16b
>> +
>> + mov w4, 20
>> +.Lpermute:
>> + /*
>> + * Permute one 64-byte block where the state matrix is stored in the four NEON
>> + * registers state0-state3. It performs matrix operations on four words in parallel,
>> + * but requires shuffling to rearrange the words after each round.
>> + */
>> +
>> +.Ldoubleround:
>> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
>> + add state0.4s, state0.4s, state1.4s
>> + eor state3.16b, state3.16b, state0.16b
>> + rev32 state3.8h, state3.8h
>> +
>> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
>> + add state2.4s, state2.4s, state3.4s
>> + eor tmp.16b, state1.16b, state2.16b
>> + shl state1.4s, tmp.4s, #12
>> + sri state1.4s, tmp.4s, #20
>> +
>> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
>> + add state0.4s, state0.4s, state1.4s
>> + eor state3.16b, state3.16b, state0.16b
>> + tbl state3.16b, {state3.16b}, rot8.16b
>> +
>
> This can be changed to the below, removing the need for the ROT8 vector
>
> eor tmp.16b, state3.16b, state0.16b
> shl state3.4s, tmp.4s, #8
> sri state3.4s, tmp.4s, #24
>
Ack.
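For reference, the same instructions annotated:

	eor	tmp.16b, state3.16b, state0.16b
	shl	state3.4s, tmp.4s, #8		// x << 8 in each 32-bit lane
	sri	state3.4s, tmp.4s, #24		// insert x >> 24 into the low bits

i.e. rotl32(x, 8) per lane, with no ROT8 vector and no tbl lookup.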
>> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
>> + add state2.4s, state2.4s, state3.4s
>> + eor tmp.16b, state1.16b, state2.16b
>> + shl state1.4s, tmp.4s, #7
>> + sri state1.4s, tmp.4s, #25
>> +
>> + /* state1[0,1,2,3] = state1[1,2,3,0] */
>> + ext state1.16b, state1.16b, state1.16b, #4
>> + /* state2[0,1,2,3] = state2[2,3,0,1] */
>> + ext state2.16b, state2.16b, state2.16b, #8
>> + /* state3[0,1,2,3] = state3[3,0,1,2] */
>> + ext state3.16b, state3.16b, state3.16b, #12
>> +
>> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
>> + add state0.4s, state0.4s, state1.4s
>> + eor state3.16b, state3.16b, state0.16b
>> + rev32 state3.8h, state3.8h
>> +
>> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
>> + add state2.4s, state2.4s, state3.4s
>> + eor tmp.16b, state1.16b, state2.16b
>> + shl state1.4s, tmp.4s, #12
>> + sri state1.4s, tmp.4s, #20
>> +
>> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
>> + add state0.4s, state0.4s, state1.4s
>> + eor state3.16b, state3.16b, state0.16b
>> + tbl state3.16b, {state3.16b}, rot8.16b
>> +
>> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
>> + add state2.4s, state2.4s, state3.4s
>> + eor tmp.16b, state1.16b, state2.16b
>> + shl state1.4s, tmp.4s, #7
>> + sri state1.4s, tmp.4s, #25
>> +
>> + /* state1[0,1,2,3] = state1[3,0,1,2] */
>> + ext state1.16b, state1.16b, state1.16b, #12
>> + /* state2[0,1,2,3] = state2[2,3,0,1] */
>> + ext state2.16b, state2.16b, state2.16b, #8
>> + /* state3[0,1,2,3] = state3[1,2,3,0] */
>> + ext state3.16b, state3.16b, state3.16b, #4
>> +
>> + subs w4, w4, #2
>> + b.ne .Ldoubleround
>> +
>> + /* output0 = state0 + copy0 */
>> + add state0.4s, state0.4s, copy0.4s
>> + /* output1 = state1 + copy1 */
>> + add state1.4s, state1.4s, copy1.4s
>> + /* output2 = state2 + copy2 */
>> + add state2.4s, state2.4s, copy2.4s
>> + /* output3 = state3 + copy3 */
>> + add state3.4s, state3.4s, copy3.4s
>> + st1 { state0.4s - state3.4s }, [x0]
>> +
>> + /* ++copy3.counter */
>> + add copy3_d, copy3_d, one_d
>> +
>
> This 'add' clears the upper half of the SIMD register, which is where
> the zero nonce lives. So this happens to be correct, but it is not
> very intuitive, so perhaps a comment would be in order here.
Ack, will do.
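Maybe something like (wording is mine):

	/*
	 * ++copy3.counter; the scalar D-form add also zeroes bits [127:64]
	 * of copy3, keeping the nonce half at 0 for the next block.
	 */
	add	copy3_d, copy3_d, one_d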
>
>> + /* output += 64, --nblocks */
>> + add x0, x0, 64
>> + subs x3, x3, #1
>> + b.ne .Lblock
>> +
>> + /* counter = copy3.counter */
>> + str copy3_d, [x2]
>> +
>> + /* Zero out the potentially sensitive regs, in case nothing uses these again. */
>> + eor state0.16b, state0.16b, state0.16b
>> + eor state1.16b, state1.16b, state1.16b
>> + eor state2.16b, state2.16b, state2.16b
>> + eor state3.16b, state3.16b, state3.16b
>> + eor copy1.16b, copy1.16b, copy1.16b
>> + eor copy2.16b, copy2.16b, copy2.16b
>
> This is not x86 - no need to use XOR to clear registers, you can just
> use 'movi reg.16b, #0' here.
Ack.
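I.e. (sketch of the suggested form):

	/* Zero out the potentially sensitive regs, in case nothing uses these again. */
	movi	state0.16b, #0
	movi	state1.16b, #0
	movi	state2.16b, #0
	movi	state3.16b, #0
	movi	copy1.16b, #0
	movi	copy2.16b, #0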
>
>> + ret
>> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
>> +
>> + .section ".rodata", "a", %progbits
>> + .align L1_CACHE_SHIFT
>> +
>> +CTES: .word 1634760805, 857760878, 2036477234, 1797285236
>> +ONE: .xword 1, 0
>> +ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
>> +
>> +emit_aarch64_feature_1_and
> ...