Message-ID: <CAMj1kXGddK5pkVUM0Gx5OZStpOhYEYQteRk-pPtZZZiDAn0wiw@mail.gmail.com>
Date: Mon, 2 Sep 2024 22:57:33 +0200
From: Ard Biesheuvel <ardb@...nel.org>
To: Adhemerval Zanella <adhemerval.zanella@...aro.org>
Cc: "Jason A . Donenfeld" <Jason@...c4.com>, "Theodore Ts'o" <tytso@....edu>, linux-kernel@...r.kernel.org,
linux-crypto@...r.kernel.org, linux-arm-kernel@...ts.infradead.org,
linux-arch@...r.kernel.org, Catalin Marinas <catalin.marinas@....com>,
Will Deacon <will@...nel.org>, Thomas Gleixner <tglx@...utronix.de>, Eric Biggers <ebiggers@...nel.org>,
Christophe Leroy <christophe.leroy@...roup.eu>
Subject: Re: [PATCH v4 2/2] arm64: vdso: wire up getrandom() vDSO implementation
Hi Adhemerval,
I have just a couple more points below, on the BE handling in the asm.
On Mon, 2 Sept 2024 at 18:19, Adhemerval Zanella
<adhemerval.zanella@...aro.org> wrote:
>
> Hook up the generic vDSO implementation to the aarch64 vDSO data page.
> The data required by _vdso_rng_data is placed within the _vdso_data vvar
> page, at an offset beyond the end of vdso_data.
>
> The vDSO function requires a ChaCha20 implementation that does not write
> to the stack and that can perform an entire ChaCha20 permutation. The one
> provided uses NEON for the permute operation, with a fallback to the
> syscall for chips that do not support AdvSIMD.
>
> This also passes the vdso_test_chacha test along with
> vdso_test_getrandom. The vdso_test_getrandom bench-single result on
> Neoverse-N1 shows:
>
> vdso: 25000000 times in 0.783884250 seconds
> libc: 25000000 times in 8.780275399 seconds
> syscall: 25000000 times in 8.786581518 seconds
>
> A small fixup to arch/arm64/include/asm/mman.h was required to avoid
> pulling kernel code into the vDSO, similar to what's already done in
> arch/arm64/include/asm/rwonce.h.
>
> Signed-off-by: Adhemerval Zanella <adhemerval.zanella@...aro.org>
> ---
> arch/arm64/Kconfig | 1 +
> arch/arm64/include/asm/mman.h | 6 +-
> arch/arm64/include/asm/vdso.h | 6 +
> arch/arm64/include/asm/vdso/getrandom.h | 50 ++++++
> arch/arm64/include/asm/vdso/vsyscall.h | 10 ++
> arch/arm64/kernel/vdso.c | 6 -
> arch/arm64/kernel/vdso/Makefile | 25 ++-
> arch/arm64/kernel/vdso/vdso | 1 +
> arch/arm64/kernel/vdso/vdso.lds.S | 4 +
> arch/arm64/kernel/vdso/vgetrandom-chacha.S | 178 +++++++++++++++++++++
> arch/arm64/kernel/vdso/vgetrandom.c | 15 ++
> tools/arch/arm64/vdso | 1 +
> tools/include/linux/compiler.h | 4 +
> tools/testing/selftests/vDSO/Makefile | 3 +-
> 14 files changed, 294 insertions(+), 16 deletions(-)
> create mode 100644 arch/arm64/include/asm/vdso/getrandom.h
> create mode 120000 arch/arm64/kernel/vdso/vdso
> create mode 100644 arch/arm64/kernel/vdso/vgetrandom-chacha.S
> create mode 100644 arch/arm64/kernel/vdso/vgetrandom.c
> create mode 120000 tools/arch/arm64/vdso
>
...
> diff --git a/arch/arm64/kernel/vdso/vgetrandom-chacha.S b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> new file mode 100644
> index 000000000000..4e5f9c349522
> --- /dev/null
> +++ b/arch/arm64/kernel/vdso/vgetrandom-chacha.S
> @@ -0,0 +1,178 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/linkage.h>
> +#include <asm/cache.h>
> +#include <asm/assembler.h>
> +
> + .text
> +
> +#define state0 v0
> +#define state1 v1
> +#define state2 v2
> +#define state3 v3
> +#define copy0 v4
> +#define copy0_q q4
> +#define copy1 v5
> +#define copy2 v6
> +#define copy3 v7
> +#define copy3_d d7
> +#define one_d d16
> +#define one_q q16
> +#define one_v v16
> +#define tmp v17
> +#define rot8 v18
> +
> +/*
> + * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
> + * number of blocks of output with nonce 0, taking an input key and an
> + * 8-byte counter. Importantly, it does not spill to the stack.
> + *
> + * This implementation avoids d8-d15 because they are callee-save in user
> + * space.
> + *
> + * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
> + * const uint8_t *key,
> + * uint32_t *counter,
> + * size_t nblocks)
> + *
> + * x0: output bytes
> + * x1: 32-byte key input
> + * x2: 8-byte counter input/output
> + * x3: number of 64-byte blocks to write to output
> + */
> +SYM_FUNC_START(__arch_chacha20_blocks_nostack)
> +
> + /* copy0 = "expand 32-byte k" */
> + mov_q x8, 0x3320646e61707865
> + mov_q x9, 0x6b20657479622d32
> + mov copy0.d[0], x8
> + mov copy0.d[1], x9
> +
> + /* copy1,copy2 = key */
> + ld1 { copy1.4s, copy2.4s }, [x1]
> + /* copy3 = counter || zero nonce */
> + ldr copy3_d, [x2]
> +CPU_BE( rev64 copy3.4s, copy3.4s)
> +
This loads 2 u32s as a single u64, and then swaps them if we are running on BE.
So better to just use
ld1 {copy3.2s}, [x2]
here, and drop the CPU_BE() special case.
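I.e., something like this (untested sketch, reusing the register names
from the patch):

        /* copy3 = counter || zero nonce */
        ld1     { copy3.2s }, [x2]

IIRC a load with a 64-bit arrangement still clears the upper half of
the register, so the nonce half remains zero, and each u32 element is
read in native byte order on LE and BE alike.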
> + movi one_v.2s, #1
> + uzp1 one_v.4s, one_v.4s, one_v.4s
> +
> +.Lblock:
> + /* copy state to auxiliary vectors for the final add after the permute. */
> + mov state0.16b, copy0.16b
> + mov state1.16b, copy1.16b
> + mov state2.16b, copy2.16b
> + mov state3.16b, copy3.16b
> +
> + mov w4, 20
> +.Lpermute:
> + /*
> + * Permute one 64-byte block where the state matrix is stored in the four NEON
> + * registers state0-state3. It performs matrix operations on four words in parallel,
> + * but requires shuffling to rearrange the words after each round.
> + */
> +
> +.Ldoubleround:
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> + add state0.4s, state0.4s, state1.4s
> + eor state3.16b, state3.16b, state0.16b
> + rev32 state3.8h, state3.8h
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> + add state2.4s, state2.4s, state3.4s
> + eor tmp.16b, state1.16b, state2.16b
> + shl state1.4s, tmp.4s, #12
> + sri state1.4s, tmp.4s, #20
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> + add state0.4s, state0.4s, state1.4s
> + eor tmp.16b, state3.16b, state0.16b
> + shl state3.4s, tmp.4s, #8
> + sri state3.4s, tmp.4s, #24
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> + add state2.4s, state2.4s, state3.4s
> + eor tmp.16b, state1.16b, state2.16b
> + shl state1.4s, tmp.4s, #7
> + sri state1.4s, tmp.4s, #25
> +
> + /* state1[0,1,2,3] = state1[1,2,3,0] */
> + ext state1.16b, state1.16b, state1.16b, #4
> + /* state2[0,1,2,3] = state2[2,3,0,1] */
> + ext state2.16b, state2.16b, state2.16b, #8
> + /* state3[0,1,2,3] = state3[1,2,3,0] */
> + ext state3.16b, state3.16b, state3.16b, #12
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
> + add state0.4s, state0.4s, state1.4s
> + eor state3.16b, state3.16b, state0.16b
> + rev32 state3.8h, state3.8h
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
> + add state2.4s, state2.4s, state3.4s
> + eor tmp.16b, state1.16b, state2.16b
> + shl state1.4s, tmp.4s, #12
> + sri state1.4s, tmp.4s, #20
> +
> + /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
> + add state0.4s, state0.4s, state1.4s
> + eor tmp.16b, state3.16b, state0.16b
> + shl state3.4s, tmp.4s, #8
> + sri state3.4s, tmp.4s, #24
> +
> + /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
> + add state2.4s, state2.4s, state3.4s
> + eor tmp.16b, state1.16b, state2.16b
> + shl state1.4s, tmp.4s, #7
> + sri state1.4s, tmp.4s, #25
> +
> + /* state1[0,1,2,3] = state1[3,0,1,2] */
> + ext state1.16b, state1.16b, state1.16b, #12
> + /* state2[0,1,2,3] = state2[2,3,0,1] */
> + ext state2.16b, state2.16b, state2.16b, #8
> + /* state3[0,1,2,3] = state3[1,2,3,0] */
> + ext state3.16b, state3.16b, state3.16b, #4
> +
> + subs w4, w4, #2
> + b.ne .Ldoubleround
> +
> +       /* output0 = state0 + copy0 */
> + add state0.4s, state0.4s, copy0.4s
> +CPU_BE( rev32 state0.16b, state0.16b)
> +       /* output1 = state1 + copy1 */
> + add state1.4s, state1.4s, copy1.4s
> +CPU_BE( rev32 state1.16b, state1.16b)
> +       /* output2 = state2 + copy2 */
> + add state2.4s, state2.4s, copy2.4s
> +CPU_BE( rev32 state2.16b, state2.16b)
> +       /* output3 = state3 + copy3 */
> + add state3.4s, state3.4s, copy3.4s
> +CPU_BE( rev32 state3.16b, state3.16b)
> + st1 { state0.4s - state3.4s }, [x0]
> +
If the u32s shouldn't be swabbed for BE, you should simply be able to do
st1 {state0.16b - state3.16b}, [x0]
here, and drop the CPU_BE(*).
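E.g. (again untested, just to illustrate):

        add     state0.4s, state0.4s, copy0.4s
        add     state1.4s, state1.4s, copy1.4s
        add     state2.4s, state2.4s, copy2.4s
        add     state3.4s, state3.4s, copy3.4s
        st1     { state0.16b - state3.16b }, [x0]

A .16b store writes the byte elements in index order regardless of the
CPU endianness, which should match what the CPU_BE(rev32) + .4s store
produces on BE today.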
> +       /*
> +        * ++copy3.counter; the scalar 'add' clears the upper half of the SIMD
> +        * register, which is the expected behaviour here, as the nonce half
> +        * must remain zero.
> +        */
> + add copy3_d, copy3_d, one_d
> +
> + /* output += 64, --nblocks */
> + add x0, x0, 64
> + subs x3, x3, #1
> + b.ne .Lblock
> +
> + /* counter = copy3.counter */
> +CPU_BE( rev64 copy3.4s, copy3.4s)
> + str copy3_d, [x2]
> +
... and this could be
st1 {copy3.2s}, [x2]
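With the ld1/st1 .2s forms at both ends, the whole counter path becomes
endian-agnostic, roughly (untested):

        ld1     { copy3.2s }, [x2]              /* counter || zero nonce */
        ...
        add     copy3_d, copy3_d, one_d         /* ++counter (64-bit add) */
        ...
        st1     { copy3.2s }, [x2]              /* write counter back */

and both CPU_BE() fixups for the counter go away.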
> + /* Zero out the potentially sensitive regs, in case nothing uses these again. */
> + movi state0.16b, #0
> + movi state1.16b, #0
> + movi state2.16b, #0
> + movi state3.16b, #0
> + movi copy1.16b, #0
> + movi copy2.16b, #0
> + ret
> +SYM_FUNC_END(__arch_chacha20_blocks_nostack)
> +
> +emit_aarch64_feature_1_and