linux-kernel - Re: [PATCH] aarch64: vdso: Wire up getrandom() vDSO implementation

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAMj1kXFan+yKH_Z8ib=dDBiupRO1SpBn1EbkUC6pQ_k5+bjUvQ@mail.gmail.com>
Date: Tue, 27 Aug 2024 15:51:14 +0200
From: Ard Biesheuvel <ardb@...nel.org>
To: Adhemerval Zanella <adhemerval.zanella@...aro.org>
Cc: "Jason A . Donenfeld" <Jason@...c4.com>, "Theodore Ts'o" <tytso@....edu>, linux-kernel@...r.kernel.org, 
	linux-crypto@...r.kernel.org, linux-arm-kernel@...ts.infradead.org, 
	linux-arch@...r.kernel.org, Catalin Marinas <catalin.marinas@....com>, 
	Will Deacon <will@...nel.org>, Thomas Gleixner <tglx@...utronix.de>, Eric Biggers <ebiggers@...nel.org>, 
	Christophe Leroy <christophe.leroy@...roup.eu>
Subject: Re: [PATCH] aarch64: vdso: Wire up getrandom() vDSO implementation

Hi Adhemerval,

...

> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..3fb9715dd6f0
> --- /dev/null
> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,153 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/linkage.h>
> +#include <asm/cache.h>
> +
> +       .text
> +
> +/*
> + * ARM64 ChaCha20 implementation meant for vDSO.  Produces a given positive
> + * number of blocks of output with nonce 0, taking an input key and 8-byte
> + * counter.  Importantly does not spill to the stack.
> + *
> + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
> + *                                    const uint8_t *key,
> + *                                    uint32_t *counter,
> + *                                    size_t nblocks)
> + *
> + *     x0: output bytes
> + *     x1: 32-byte key input
> + *     x2: 8-byte counter input/output
> + *     x3: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +

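Shouldn't we preserve d8-d15 here? AAPCS64 makes the low 64 bits of
v8-v15 callee-saved, and the code below writes d8, v12 and v13 without
saving them.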

> +       /* v0 = "expand 32-byte k" */
> +       adr_l           x8, CTES
> +       ld1             {v5.4s}, [x8]
> +       /* v1,v2 = key */
> +       ld1             { v6.4s, v7.4s }, [x1]
> +       /* v3 = counter || zero nonce */
> +       ldr             d8, [x2]
> +
> +       adr_l           x8, ONE
> +       ldr             q13, [x8]
> +
> +       adr_l           x10, ROT8
> +       ld1             {v12.4s}, [x10]
> +.Lblock:
> +       /* copy state to auxiliary vectors for the final add after the permute.  */
> +       mov             v0.16b, v5.16b
> +       mov             v1.16b, v6.16b
> +       mov             v2.16b, v7.16b
> +       mov             v3.16b, v8.16b
> +
> +       mov             w4, 20
> +.Lpermute:
> +       /*
> +        * Permute one 64-byte block where the state matrix is stored in the four NEON
> +        * registers v0-v3.  It performs matrix operations on four words in parallel,
> +        * but requires shuffling to rearrange the words after each round.
> +        */
> +
> +.Ldoubleround:
> +       /* x0 += x1, x3 = rotl32(x3 ^ x0, 16) */
> +       add             v0.4s, v0.4s, v1.4s
> +       eor             v3.16b, v3.16b, v0.16b
> +       rev32           v3.8h, v3.8h
> +
> +       /* x2 += x3, x1 = rotl32(x1 ^ x2, 12) */
> +       add             v2.4s, v2.4s, v3.4s
> +       eor             v4.16b, v1.16b, v2.16b
> +       shl             v1.4s, v4.4s, #12
> +       sri             v1.4s, v4.4s, #20
> +
> +       /* x0 += x1, x3 = rotl32(x3 ^ x0, 8) */
> +       add             v0.4s, v0.4s, v1.4s
> +       eor             v3.16b, v3.16b, v0.16b
> +       tbl             v3.16b, {v3.16b}, v12.16b
> +
> +       /* x2 += x3, x1 = rotl32(x1 ^ x2, 7) */
> +       add             v2.4s, v2.4s, v3.4s
> +       eor             v4.16b, v1.16b, v2.16b
> +       shl             v1.4s, v4.4s, #7
> +       sri             v1.4s, v4.4s, #25
> +
> +       /* x1 = shuffle32(x1, MASK(0, 3, 2, 1)) */
> +       ext             v1.16b, v1.16b, v1.16b, #4
> +       /* x2 = shuffle32(x2, MASK(1, 0, 3, 2)) */
> +       ext             v2.16b, v2.16b, v2.16b, #8
> +       /* x3 = shuffle32(x3, MASK(2, 1, 0, 3)) */
> +       ext             v3.16b, v3.16b, v3.16b, #12
> +
> +       /* x0 += x1, x3 = rotl32(x3 ^ x0, 16) */
> +       add             v0.4s, v0.4s, v1.4s
> +       eor             v3.16b, v3.16b, v0.16b
> +       rev32           v3.8h, v3.8h
> +
> +       /* x2 += x3, x1 = rotl32(x1 ^ x2, 12) */
> +       add             v2.4s, v2.4s, v3.4s
> +       eor             v4.16b, v1.16b, v2.16b
> +       shl             v1.4s, v4.4s, #12
> +       sri             v1.4s, v4.4s, #20
> +
> +       /* x0 += x1, x3 = rotl32(x3 ^ x0, 8) */
> +       add             v0.4s, v0.4s, v1.4s
> +       eor             v3.16b, v3.16b, v0.16b
> +       tbl             v3.16b, {v3.16b}, v12.16b
> +
> +       /* x2 += x3, x1 = rotl32(x1 ^ x2, 7) */
> +       add             v2.4s, v2.4s, v3.4s
> +       eor             v4.16b, v1.16b, v2.16b
> +       shl             v1.4s, v4.4s, #7
> +       sri             v1.4s, v4.4s, #25
> +
> +       /* x1 = shuffle32(x1, MASK(2, 1, 0, 3)) */
> +       ext             v1.16b, v1.16b, v1.16b, #12
> +       /* x2 = shuffle32(x2, MASK(1, 0, 3, 2)) */
> +       ext             v2.16b, v2.16b, v2.16b, #8
> +       /* x3 = shuffle32(x3, MASK(0, 3, 2, 1)) */
> +       ext             v3.16b, v3.16b, v3.16b, #4
> +
> +       subs            w4, w4, #2
> +       b.ne            .Ldoubleround
> +
> +       /* output0 = state0 + v0 */
> +       add             v0.4s, v0.4s, v5.4s
> +       /* output1 = state1 + v1 */
> +       add             v1.4s, v1.4s, v6.4s
> +       /* output2 = state2 + v2 */
> +       add             v2.4s, v2.4s, v7.4s
> +       /* output3 = state3 + v3 */
> +       add             v3.4s, v3.4s, v8.4s
> +       st1             { v0.4s - v3.4s }, [x0]
> +
> +       /* ++copy3.counter */
> +       add             d8, d8, d13
> +
> +       /* output += 64, --nblocks */
> +       add             x0, x0, 64
> +       subs            x3, x3, #1
> +       b.ne            .Lblock
> +
> +       /* counter = copy3.counter */
> +       str             d8, [x2]
> +
> +       /* Zero out the potentially sensitive regs, in case nothing uses these again. */
> +       eor             v0.16b, v0.16b, v0.16b
> +       eor             v1.16b, v1.16b, v1.16b
> +       eor             v2.16b, v2.16b, v2.16b
> +       eor             v3.16b, v3.16b, v3.16b
> +       eor             v6.16b, v6.16b, v6.16b
> +       eor             v7.16b, v7.16b, v7.16b
> +       ret
> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
> +
> +        .section        ".rodata", "a", %progbits
> +        .align          L1_CACHE_SHIFT
> +
> +CTES:  .word           1634760805, 857760878,  2036477234, 1797285236
> +ONE:    .xword         1, 0
> +ROT8:  .word           0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
> +
> +emit_aarch64_feature_1_and