[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAMj1kXHnBA5qeyHa-b6w+cw5-iomA=3drk7yGGzp-gc_-4uKig@mail.gmail.com>
Date: Wed, 8 May 2024 11:01:25 +0200
From: Ard Biesheuvel <ardb@...nel.org>
To: Eric Biggers <ebiggers@...nel.org>
Cc: linux-crypto@...r.kernel.org, x86@...nel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] crypto: x86/aes-gcm - add VAES and AVX512 / AVX10
optimized AES-GCM
On Wed, 8 May 2024 at 09:22, Eric Biggers <ebiggers@...nel.org> wrote:
>
> From: Eric Biggers <ebiggers@...gle.com>
>
> Add implementations of AES-GCM for x86_64 CPUs that support VAES (vector
> AES), VPCLMULQDQ (vector carryless multiplication), and either AVX512 or
> AVX10. There are two implementations, sharing most source code: one
> using 256-bit vectors and one using 512-bit vectors.
>
> I wrote the new AES-GCM assembly code from scratch, focusing on
> performance, code size (both source and binary), and documenting the
> source. The new assembly file aes-gcm-avx10-x86_64.S is about 1200
> lines including extensive comments, and it generates less than 8 KB of
> binary code. This includes both 256-bit and 512-bit vector code; note
> that only one is used at runtime. The main loop does 4 vectors at a
> time, with the AES and GHASH instructions interleaved. Any remainder is
> handled using a simple 1 vector at a time loop, with masking.
>
This looks very good! The code is well structured and due to the
comments, it is reasonably easy to follow for someone familiar with
the underlying math.
I also strongly prefer a parameterized implementation that assembles
to a minimal object code size over the other proposed implementations.
Those may offer a marginal performance gain by using different code
paths for different input sizes, but that tends to benefit mostly
benchmarks rather than real-world use cases.
..
>
> Signed-off-by: Eric Biggers <ebiggers@...gle.com>
Tested-by: Ard Biesheuvel <ardb@...nel.org> # Tiger Lake
Reviewed-by: Ard Biesheuvel <ardb@...nel.org>
Some nits below.
> ---
> arch/x86/crypto/Kconfig | 1 +
> arch/x86/crypto/Makefile | 3 +
> arch/x86/crypto/aes-gcm-avx10-x86_64.S | 1201 ++++++++++++++++++++++++
> arch/x86/crypto/aesni-intel_glue.c | 515 +++++++++-
> 4 files changed, 1706 insertions(+), 14 deletions(-)
> create mode 100644 arch/x86/crypto/aes-gcm-avx10-x86_64.S
>
..
> diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
> index 5b25d2a58aeb..e4dec49023af 100644
> --- a/arch/x86/crypto/aesni-intel_glue.c
> +++ b/arch/x86/crypto/aesni-intel_glue.c
> @@ -1212,17 +1212,481 @@ static struct simd_skcipher_alg *aes_xts_simdalg_##suffix
> DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500);
> #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
> DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600);
> DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
> DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
> -#endif
> +
> +#define NUM_KEY_POWERS 16 /* excludes zero padding */
> +#define FULL_NUM_KEY_POWERS (NUM_KEY_POWERS + 3) /* includes zero padding */
> +
> +struct aes_gcm_key_avx10 {
> + struct crypto_aes_ctx aes_key AESNI_ALIGN_ATTR;
> + u32 rfc4106_nonce AESNI_ALIGN_ATTR;
Is the alignment needed here?
> + u8 ghash_key_powers[FULL_NUM_KEY_POWERS][16] AESNI_ALIGN_ATTR;
> +};
> +
> +asmlinkage void
> +aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key);
> +asmlinkage void
> +aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key);
> +
> +asmlinkage void
> +aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key,
> + u8 ghash_acc[16], const u8 *aad, int aadlen);
> +
> +asmlinkage void
> +aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
> + const u32 le_ctr[4], u8 ghash_acc[16],
> + const u8 *src, u8 *dst, int datalen);
> +asmlinkage void
> +aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
> + const u32 le_ctr[4], u8 ghash_acc[16],
> + const u8 *src, u8 *dst, int datalen);
> +
> +asmlinkage void
> +aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key,
> + const u32 le_ctr[4], u8 ghash_acc[16],
> + const u8 *src, u8 *dst, int datalen);
> +asmlinkage void
> +aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key,
> + const u32 le_ctr[4], u8 ghash_acc[16],
> + const u8 *src, u8 *dst, int datalen);
> +
> +asmlinkage void
> +aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
> + const u32 le_ctr[4], const u8 ghash_acc[16],
> + u64 total_aadlen, u64 total_datalen,
> + u8 *tag, int taglen);
> +asmlinkage bool
> +aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key,
> + const u32 le_ctr[4], const u8 ghash_acc[16],
> + u64 total_aadlen, u64 total_datalen,
> + const u8 *tag, int taglen);
> +
> +static int gcm_setkey_vaes_avx10(struct crypto_aead *tfm, const u8 *raw_key,
> + unsigned int keylen, bool vl256)
> +{
> + struct aes_gcm_key_avx10 *key = aes_align_addr(crypto_aead_ctx(tfm));
> + int err;
> +
> + /* The assembly code assumes the following offsets. */
> + BUILD_BUG_ON(offsetof(typeof(*key), aes_key.key_enc) != 0);
> + BUILD_BUG_ON(offsetof(typeof(*key), aes_key.key_length) != 480);
> + BUILD_BUG_ON(offsetof(typeof(*key), ghash_key_powers) != 512);
> +
> + if (likely(crypto_simd_usable())) {
Is it really necessary to have 3 different code paths here? If so,
could you add a comment why?
> + err = aes_check_keylen(keylen);
> + if (err)
> + return err;
> + kernel_fpu_begin();
> + aesni_set_key(&key->aes_key, raw_key, keylen);
> + if (vl256)
> + aes_gcm_precompute_vaes_avx10_256(key);
> + else
> + aes_gcm_precompute_vaes_avx10_512(key);
> + kernel_fpu_end();
> + } else {
> + static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = {
> + [0] = 0xc2, [15] = 1
> + };
> + be128 h1 = {};
> + be128 h;
> + int i;
> +
> + err = aes_expandkey(&key->aes_key, raw_key, keylen);
> + if (err)
> + return err;
> + /*
> + * Emulate the aes_gcm_precompute assembly function in portable
> + * C code: Encrypt the all-zeroes block to get the hash key H^1,
> + * zeroize the padding at the end of ghash_key_powers, and store
> + * H^1 * x^-1 through H^NUM_KEY_POWERS * x^-1, byte-reversed.
> + */
> + aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1);
> + memset(key->ghash_key_powers, 0, sizeof(key->ghash_key_powers));
> + h = h1;
> + gf128mul_lle(&h, (const be128 *)x_to_the_minus1);
> + for (i = NUM_KEY_POWERS - 1; i >= 0; i--) {
> + put_unaligned_be64(h.a, &key->ghash_key_powers[i][8]);
> + put_unaligned_be64(h.b, &key->ghash_key_powers[i][0]);
> + gf128mul_lle(&h, &h1);
> + }
> + }
> + return 0;
> +}
> +
Powered by blists - more mailing lists