diff -urp yescrypt-0.7.1/tests.c yescrypt-0.7.1-avx2/tests.c
--- yescrypt-0.7.1/tests.c	2015-02-01 10:37:31.000000000 +0000
+++ yescrypt-0.7.1-avx2/tests.c	2015-04-24 11:46:38.655400756 +0000
@@ -311,7 +311,11 @@ main(int argc, char *argv[])
 	yescrypt_free_shared(&shared);
 
 	shared.aligned_size = ((uint64_t)1 << NROM_log2) * 128 * r * 1;
-	shared.aligned = malloc(shared.aligned_size);
+/* Have to align to 32 bytes manually for AVX2, since malloc() might not */
+	shared.aligned = malloc(shared.aligned_size + 31);
+	shared.aligned = (uint8_t *)shared.aligned + 31;
+	shared.aligned = (uint8_t *)shared.aligned -
+	    ((size_t)shared.aligned & 31);
 	/* These should be unused by yescrypt_init_shared() */
 	shared.base_size = 0;
diff -urp yescrypt-0.7.1/yescrypt-simd.c yescrypt-0.7.1-avx2/yescrypt-simd.c
--- yescrypt-0.7.1/yescrypt-simd.c	2015-02-01 14:10:41.000000000 +0000
+++ yescrypt-0.7.1-avx2/yescrypt-simd.c	2015-04-24 15:05:51.994268326 +0000
@@ -30,15 +30,15 @@
 
 /*
  * On 64-bit, enabling SSE4.1 helps our pwxform code indirectly, via avoiding
- * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX
- * and XOP are of further help either way.
+ * gcc bug 54349 (fixed for gcc 4.9+). On 32-bit, it's of direct help. AVX,
+ * XOP, and AVX2 are of further help either way.
 */
 #ifndef __SSE4_1__
-#warning "Consider enabling SSE4.1, AVX, or XOP in the C compiler for significantly better performance"
+#warning "Consider enabling SSE4.1, AVX, XOP, or AVX2 in the C compiler for significantly better performance"
 #endif
 
 #include <emmintrin.h>
-#ifdef __XOP__
+#if defined(__XOP__) || defined(__AVX2__)
 #include <x86intrin.h>
 #endif
 
@@ -77,7 +77,7 @@
 	}
 #endif
 
-#define SALSA20_2ROUNDS \
+#define SALSA20_2ROUNDS(X0, X1, X2, X3) \
 	/* Operate on "columns" */ \
 	ARX(X1, X0, X3, 7) \
 	ARX(X2, X1, X0, 9) \
@@ -100,8 +100,42 @@
 	X2 = _mm_shuffle_epi32(X2, 0x4E); \
 	X3 = _mm_shuffle_epi32(X3, 0x93);
 
+typedef union {
+	uint32_t w[16];
+	__m128i q[4];
+#ifdef __AVX2__
+	__m256i o[2];
+#endif
+} salsa20_blk_t;
+
+#ifdef __AVX2__
+/**
+ * Apply the Salsa20/8 core to the block provided in (X0, X1).
+ */
+#define SALSA20_8_BASE(maybe_decl, out) \
+	{ \
+		maybe_decl Y0 = X0; \
+		maybe_decl Y1 = X1; \
+		__m128i A0 = _mm256_castsi256_si128(X0); \
+		__m128i A1 = _mm256_extracti128_si256(X0, 1); \
+		__m128i A2 = _mm256_castsi256_si128(X1); \
+		__m128i A3 = _mm256_extracti128_si256(X1, 1); \
+		SALSA20_2ROUNDS(A0, A1, A2, A3) \
+		SALSA20_2ROUNDS(A0, A1, A2, A3) \
+		SALSA20_2ROUNDS(A0, A1, A2, A3) \
+		SALSA20_2ROUNDS(A0, A1, A2, A3) \
+		(out).o[0] = X0 = \
+		    _mm256_add_epi32(Y0, _mm256_inserti128_si256( \
+		    _mm256_castsi128_si256(A0), A1, 1)); \
+		(out).o[1] = X1 = \
+		    _mm256_add_epi32(Y1, _mm256_inserti128_si256( \
+		    _mm256_castsi128_si256(A2), A3, 1)); \
+	}
+#define SALSA20_8(out) \
+	SALSA20_8_BASE(__m256i, out)
+#else
 /**
- * Apply the salsa20/8 core to the block provided in (X0 ... X3).
+ * Apply the Salsa20/8 core to the block provided in (X0 ... X3).
  */
 #define SALSA20_8_BASE(maybe_decl, out) \
 	{ \
@@ -113,16 +147,32 @@
 		SALSA20_2ROUNDS \
 		SALSA20_2ROUNDS \
 		SALSA20_2ROUNDS \
-		(out)[0] = X0 = _mm_add_epi32(X0, Y0); \
-		(out)[1] = X1 = _mm_add_epi32(X1, Y1); \
-		(out)[2] = X2 = _mm_add_epi32(X2, Y2); \
-		(out)[3] = X3 = _mm_add_epi32(X3, Y3); \
+		(out).q[0] = X0 = _mm_add_epi32(X0, Y0); \
+		(out).q[1] = X1 = _mm_add_epi32(X1, Y1); \
+		(out).q[2] = X2 = _mm_add_epi32(X2, Y2); \
+		(out).q[3] = X3 = _mm_add_epi32(X3, Y3); \
 	}
 #define SALSA20_8(out) \
 	SALSA20_8_BASE(__m128i, out)
+#endif
+
+#ifdef __AVX2__
+/**
+ * Apply the Salsa20/8 core to the block provided in (X0, X1) ^ (Z0, Z1).
+ */
+#define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, out) \
+	X0 = _mm256_xor_si256(X0, Z0); \
+	X1 = _mm256_xor_si256(X1, Z1); \
+	SALSA20_8_BASE(maybe_decl, out)
+
+#define SALSA20_8_XOR_MEM(in, out) \
+	SALSA20_8_XOR_ANY(__m256i, (in).o[0], (in).o[1], out)
+
+#define SALSA20_8_XOR_REG(out) \
+	SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, out)
+#else
 /**
- * Apply the salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3).
+ * Apply the Salsa20/8 core to the block provided in (X0 ... X3) ^ (Z0 ... Z3).
  */
 #define SALSA20_8_XOR_ANY(maybe_decl, Z0, Z1, Z2, Z3, out) \
 	X0 = _mm_xor_si128(X0, Z0); \
@@ -132,15 +182,35 @@
 	SALSA20_8_BASE(maybe_decl, out)
 
 #define SALSA20_8_XOR_MEM(in, out) \
-	SALSA20_8_XOR_ANY(__m128i, (in)[0], (in)[1], (in)[2], (in)[3], out)
+	SALSA20_8_XOR_ANY(__m128i, (in).q[0], (in).q[1], (in).q[2], (in).q[3], out)
 
 #define SALSA20_8_XOR_REG(out) \
 	SALSA20_8_XOR_ANY(/* empty */, Y0, Y1, Y2, Y3, out)
+#endif
 
-typedef union {
-	uint32_t w[16];
-	__m128i q[4];
-} salsa20_blk_t;
+#ifdef __AVX2__
+#define DECL_X \
+	__m256i X0, X1;
+#define DECL_Y \
+	__m256i Y0, Y1;
+#define READ_X(in) \
+	X0 = (in).o[0]; \
+	X1 = (in).o[1];
+#define INTEGERIFY(x) \
+	_mm_cvtsi128_si32(_mm256_castsi256_si128(x))
+#else
+#define DECL_X \
+	__m128i X0, X1, X2, X3;
+#define DECL_Y \
+	__m128i Y0, Y1, Y2, Y3;
+#define READ_X(in) \
+	X0 = (in).q[0]; \
+	X1 = (in).q[1]; \
+	X2 = (in).q[2]; \
+	X3 = (in).q[3];
+#define INTEGERIFY(x) \
+	_mm_cvtsi128_si32(x)
+#endif
 
 /**
  * blockmix_salsa8(Bin, Bout, r):
@@ -151,7 +221,7 @@ static void
 blockmix_salsa8(const salsa20_blk_t *restrict Bin, salsa20_blk_t *restrict Bout,
     size_t r)
 {
-	__m128i X0, X1, X2, X3;
+	DECL_X
 	size_t i;
 
 	r--;
@@ -167,35 +237,32 @@ blockmix_salsa8(const salsa20_blk_t *res
 	PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0)
 
 	/* 1: X <-- B_{2r - 1} */
-	X0 = Bin[r * 2 + 1].q[0];
-	X1 = Bin[r * 2 + 1].q[1];
-	X2 = Bin[r * 2 + 1].q[2];
-	X3 = Bin[r * 2 + 1].q[3];
+	READ_X(Bin[r * 2 + 1])
 
 	/* 3: X <-- H(X \xor B_i) */
 	/* 4: Y_i <-- X */
 	/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-	SALSA20_8_XOR_MEM(Bin[0].q, Bout[0].q)
+	SALSA20_8_XOR_MEM(Bin[0], Bout[0])
 
 	/* 2: for i = 0 to 2r - 1 do */
 	for (i = 0; i < r;) {
 		/* 3: X <-- H(X \xor B_i) */
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		SALSA20_8_XOR_MEM(Bin[i * 2 + 1].q, Bout[r + 1 + i].q)
+		SALSA20_8_XOR_MEM(Bin[i * 2 + 1], Bout[r + 1 + i])
 
 		i++;
 
 		/* 3: X <-- H(X \xor B_i) */
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		SALSA20_8_XOR_MEM(Bin[i * 2].q, Bout[i].q)
+		SALSA20_8_XOR_MEM(Bin[i * 2], Bout[i])
 	}
 
 	/* 3: X <-- H(X \xor B_i) */
 	/* 4: Y_i <-- X */
 	/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-	SALSA20_8_XOR_MEM(Bin[r * 2 + 1].q, Bout[r * 2 + 1].q)
+	SALSA20_8_XOR_MEM(Bin[r * 2 + 1], Bout[r * 2 + 1])
 }
 
 /*
@@ -263,7 +330,109 @@ blockmix_salsa8(const salsa20_blk_t *res
 #define Smask (((1 << Swidth) - 1) * PWXsimple * 8)
 #define Smask2 (((uint64_t)Smask << 32) | Smask)
 
-#if !defined(__x86_64__) && defined(__SSE4_1__)
+#if defined(__x86_64__) && defined(__AVX2__)
+/* 64-bit with AVX2 */
+#if 0
+#define PWXFORM_X_T __m128i
+#define PWXFORM_SIMD(X, x, s0, s1) \
+	x = _mm_and_si128(_mm256_castsi256_si128( \
+	    _mm256_permute4x64_epi64(X, 0xe8)), _mm_set1_epi64x(Smask2)); \
+	s0 = _mm256_blend_epi32( \
+	    _mm256_castsi128_si256(*(const __m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x))), \
+	    _mm256_loadu_si256((const __m256i *)(S0 + (uint32_t)_mm_extract_epi32(x, 2) - 16)), 0xf0); \
+	s1 = _mm256_blend_epi32( \
+	    _mm256_castsi128_si256(*(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1))), \
+	    _mm256_loadu_si256((const __m256i *)(S1 + (uint32_t)_mm_extract_epi32(x, 3) - 16)), 0xf0); \
+	X = _mm256_mul_epu32(_mm256_srli_epi64(X, 32), X); \
+	X = _mm256_add_epi64(X, s0); \
+	X = _mm256_xor_si256(X, s1);
+#endif
+#if 0
+#define PWXFORM_X_T __m128i
+#define PWXFORM_SIMD(X, x, s0, s1) \
+	x = _mm_and_si128(_mm256_castsi256_si128( \
+	    _mm256_permute4x64_epi64(X, 0xe8)), _mm_set1_epi64x(Smask2)); \
+	s0 = _mm256_inserti128_si256( \
+	    _mm256_castsi128_si256( \
+	    *(const __m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x))), \
+	    *(const __m128i *)(S0 + (uint32_t)_mm_extract_epi32(x, 2)), 1); \
+	s1 = _mm256_inserti128_si256( \
+	    _mm256_castsi128_si256( \
+	    *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1))), \
+	    *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 3)), 1); \
+	X = _mm256_mul_epu32(_mm256_srli_epi64(X, 32), X); \
+	X = _mm256_add_epi64(X, s0); \
+	X = _mm256_xor_si256(X, s1);
+#endif
+#if 0
+#define PWXFORM_X_T uint64_t
+#define PWXFORM_SIMD(X, M, x, y, s0, s1) \
+	M = _mm256_and_si256(X, _mm256_set1_epi64x(Smask2)); \
+	x = EXTRACT64(_mm256_castsi256_si128(M)); \
+	y = EXTRACT64(_mm256_extracti128_si256(M, 1)); \
+	s0 = _mm256_blend_epi32( \
+	    _mm256_castsi128_si256(*(const __m128i *)(S0 + (uint32_t)x)), \
+	    _mm256_loadu_si256((const __m256i *)(S0 + (uint32_t)y - 16)), 0xf0); \
+	s1 = _mm256_blend_epi32( \
+	    _mm256_castsi128_si256(*(const __m128i *)(S1 + (x >> 32))), \
+	    _mm256_loadu_si256((const __m256i *)(S1 + (y >> 32) - 16)), 0xf0); \
+	X = _mm256_mul_epu32(_mm256_srli_epi64(X, 32), X); \
+	X = _mm256_add_epi64(X, s0); \
+	X = _mm256_xor_si256(X, s1);
+#endif
+#if 1
+#define PWXFORM_X_T uint64_t
+#define PWXFORM_SIMD(X, M, x, y, s0, s1) \
+	M = _mm256_and_si256(X, _mm256_set1_epi64x(Smask2)); \
+	x = EXTRACT64(_mm256_castsi256_si128(M)); \
+	y = EXTRACT64(_mm256_extracti128_si256(M, 1)); \
+	s0 = _mm256_inserti128_si256( \
+	    _mm256_castsi128_si256( \
+	    *(const __m128i *)(S0 + (uint32_t)x)), \
+	    *(const __m128i *)(S0 + (uint32_t)y), 1); \
+	s1 = _mm256_inserti128_si256( \
+	    _mm256_castsi128_si256( \
+	    *(const __m128i *)(S1 + (x >> 32))), \
+	    *(const __m128i *)(S1 + (y >> 32)), 1); \
+	X = _mm256_mul_epu32(_mm256_srli_epi64(X, 32), X); \
+	X = _mm256_add_epi64(X, s0); \
+	X = _mm256_xor_si256(X, s1);
+#endif
+#if 0
+#define PWXFORM_X_T uint64_t
+#define PWXFORM_SIMD(X, M, x, y, s0, s1) \
+	x = EXTRACT64(_mm256_castsi256_si128(X)) & Smask2; \
+	y = EXTRACT64(_mm256_extracti128_si256(X, 1)) & Smask2; \
+	s0 = _mm256_inserti128_si256( \
+	    _mm256_castsi128_si256( \
+	    *(const __m128i *)(S0 + (uint32_t)x)), \
+	    *(const __m128i *)(S0 + (uint32_t)y), 1); \
+	s1 = _mm256_inserti128_si256( \
+	    _mm256_castsi128_si256( \
+	    *(const __m128i *)(S1 + (x >> 32))), \
+	    *(const __m128i *)(S1 + (y >> 32)), 1); \
+	X = _mm256_mul_epu32(_mm256_srli_epi64(X, 32), X); \
+	X = _mm256_add_epi64(X, s0); \
+	X = _mm256_xor_si256(X, s1);
+#endif
+#elif defined(__AVX2__)
+/* 32-bit with AVX2 */
+#define PWXFORM_X_T __m128i
+#define PWXFORM_SIMD(X, x, s0, s1) \
+	x = _mm_and_si128(_mm256_castsi256_si128( \
+	    _mm256_permute4x64_epi64(X, 0xe8)), _mm_set1_epi64x(Smask2)); \
+	s0 = _mm256_inserti128_si256( \
+	    _mm256_castsi128_si256( \
+	    *(const __m128i *)(S0 + (uint32_t)_mm_cvtsi128_si32(x))), \
+	    *(const __m128i *)(S0 + (uint32_t)_mm_extract_epi32(x, 2)), 1); \
+	s1 = _mm256_inserti128_si256( \
+	    _mm256_castsi128_si256( \
+	    *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 1))), \
+	    *(const __m128i *)(S1 + (uint32_t)_mm_extract_epi32(x, 3)), 1); \
+	X = _mm256_mul_epu32(_mm256_srli_epi64(X, 32), X); \
+	X = _mm256_add_epi64(X, s0); \
+	X = _mm256_xor_si256(X, s1);
+#elif !defined(__x86_64__) && defined(__SSE4_1__)
 /* 32-bit with SSE4.1 */
 #define PWXFORM_X_T __m128i
 #define PWXFORM_SIMD(X, x, s0, s1) \
@@ -285,6 +454,21 @@ blockmix_salsa8(const salsa20_blk_t *res
 	X = _mm_xor_si128(X, s1);
 #endif
 
+#ifdef __AVX2__
+#define PWXFORM_ROUND \
+	PWXFORM_SIMD(X0, M0, x0, y0, s00, s01) \
+	PWXFORM_SIMD(X1, M1, x1, y1, s10, s11)
+
+#define PWXFORM \
+	{ \
+		PWXFORM_X_T x0, x1; \
+		PWXFORM_X_T y0, y1; \
+		__m256i M0, M1, s00, s01, s10, s11; \
+		PWXFORM_ROUND PWXFORM_ROUND \
+		PWXFORM_ROUND PWXFORM_ROUND \
+		PWXFORM_ROUND PWXFORM_ROUND \
+	}
+#else
 #define PWXFORM_ROUND \
 	PWXFORM_SIMD(X0, x0, s00, s01) \
 	PWXFORM_SIMD(X1, x1, s10, s11) \
@@ -299,18 +483,29 @@ blockmix_salsa8(const salsa20_blk_t *res
 		PWXFORM_ROUND PWXFORM_ROUND \
 		PWXFORM_ROUND PWXFORM_ROUND \
 	}
+#endif
+
+#ifdef __AVX2__
+#define XOR4(in) \
+	X0 = _mm256_xor_si256(X0, (in).o[0]); \
+	X1 = _mm256_xor_si256(X1, (in).o[1]);
+#define OUT(out) \
+	(out).o[0] = X0; \
+	(out).o[1] = X1;
+#else
 #define XOR4(in) \
-	X0 = _mm_xor_si128(X0, (in)[0]); \
-	X1 = _mm_xor_si128(X1, (in)[1]); \
-	X2 = _mm_xor_si128(X2, (in)[2]); \
-	X3 = _mm_xor_si128(X3, (in)[3]);
+	X0 = _mm_xor_si128(X0, (in).q[0]); \
+	X1 = _mm_xor_si128(X1, (in).q[1]); \
+	X2 = _mm_xor_si128(X2, (in).q[2]); \
+	X3 = _mm_xor_si128(X3, (in).q[3]);
 
 #define OUT(out) \
-	(out)[0] = X0; \
-	(out)[1] = X1; \
-	(out)[2] = X2; \
-	(out)[3] = X3;
+	(out).q[0] = X0; \
+	(out).q[1] = X1; \
+	(out).q[2] = X2; \
+	(out).q[3] = X3;
+#endif
 
 /**
  * blockmix_pwxform(Bin, Bout, r, S):
@@ -323,7 +518,7 @@ blockmix(const salsa20_blk_t *restrict B
 {
 	const uint8_t * S0 = (const uint8_t *)S;
 	const uint8_t * S1 = (const uint8_t *)S + Sbytes / 2;
-	__m128i X0, X1, X2, X3;
+	DECL_X
 	size_t i;
 
 	/* Convert 128-byte blocks to 64-byte blocks */
@@ -339,43 +534,46 @@ blockmix(const salsa20_blk_t *restrict B
 	PREFETCH_OUT(&Bout[r], _MM_HINT_T0)
 
 	/* 2: X <-- B'_{r_1 - 1} */
-	X0 = Bin[r].q[0];
-	X1 = Bin[r].q[1];
-	X2 = Bin[r].q[2];
-	X3 = Bin[r].q[3];
+	READ_X(Bin[r])
 
 	/* 3: for i = 0 to r_1 - 1 do */
 	for (i = 0; i < r; i++) {
 		/* 5: X <-- X \xor B'_i */
-		XOR4(Bin[i].q)
+		XOR4(Bin[i])
 		/* 7: X <-- pwxform(X) */
 		PWXFORM
 		/* 8: B'_i <-- X */
-		OUT(Bout[i].q)
+		OUT(Bout[i])
 	}
 
 	/* Last iteration of the loop above */
 
 	/* 5: X <-- X \xor B'_i */
-	XOR4(Bin[i].q)
+	XOR4(Bin[i])
 	/* 7: X <-- pwxform(X) */
 	PWXFORM
 	/* 11: B_i <-- H(B_i) */
-	SALSA20_8(Bout[i].q)
+	SALSA20_8(Bout[i])
 }
 
+#ifdef __AVX2__
+#define XOR4_2(in1, in2) \
+	X0 = _mm256_xor_si256((in1).o[0], (in2).o[0]); \
+	X1 = _mm256_xor_si256((in1).o[1], (in2).o[1]);
+#else
 #define XOR4_2(in1, in2) \
-	X0 = _mm_xor_si128((in1)[0], (in2)[0]); \
-	X1 = _mm_xor_si128((in1)[1], (in2)[1]); \
-	X2 = _mm_xor_si128((in1)[2], (in2)[2]); \
-	X3 = _mm_xor_si128((in1)[3], (in2)[3]);
+	X0 = _mm_xor_si128((in1).q[0], (in2).q[0]); \
+	X1 = _mm_xor_si128((in1).q[1], (in2).q[1]); \
+	X2 = _mm_xor_si128((in1).q[2], (in2).q[2]); \
+	X3 = _mm_xor_si128((in1).q[3], (in2).q[3]);
+#endif
 
 static uint32_t
 blockmix_salsa8_xor(const salsa20_blk_t *restrict Bin1,
     const salsa20_blk_t *restrict Bin2, salsa20_blk_t *restrict Bout, size_t r)
 {
-	__m128i X0, X1, X2, X3;
+	DECL_X
 	size_t i;
 
 	r--;
@@ -395,38 +593,38 @@ blockmix_salsa8_xor(const salsa20_blk_t
 	PREFETCH_OUT(&Bout[r * 2 + 1], _MM_HINT_T0)
 
 	/* 1: X <-- B_{2r - 1} */
-	XOR4_2(Bin1[r * 2 + 1].q, Bin2[r * 2 + 1].q)
+	XOR4_2(Bin1[r * 2 + 1], Bin2[r * 2 + 1])
 
 	/* 3: X <-- H(X \xor B_i) */
 	/* 4: Y_i <-- X */
 	/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-	XOR4(Bin1[0].q)
-	SALSA20_8_XOR_MEM(Bin2[0].q, Bout[0].q)
+	XOR4(Bin1[0])
+	SALSA20_8_XOR_MEM(Bin2[0], Bout[0])
 
 	/* 2: for i = 0 to 2r - 1 do */
 	for (i = 0; i < r;) {
 		/* 3: X <-- H(X \xor B_i) */
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		XOR4(Bin1[i * 2 + 1].q)
-		SALSA20_8_XOR_MEM(Bin2[i * 2 + 1].q, Bout[r + 1 + i].q)
+		XOR4(Bin1[i * 2 + 1])
+		SALSA20_8_XOR_MEM(Bin2[i * 2 + 1], Bout[r + 1 + i])
 
 		i++;
 
 		/* 3: X <-- H(X \xor B_i) */
 		/* 4: Y_i <-- X */
 		/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-		XOR4(Bin1[i * 2].q)
-		SALSA20_8_XOR_MEM(Bin2[i * 2].q, Bout[i].q)
+		XOR4(Bin1[i * 2])
+		SALSA20_8_XOR_MEM(Bin2[i * 2], Bout[i])
 	}
 
 	/* 3: X <-- H(X \xor B_i) */
 	/* 4: Y_i <-- X */
 	/* 6: B' <-- (Y_0, Y_2 ... Y_{2r-2}, Y_1, Y_3 ... Y_{2r-1}) */
-	XOR4(Bin1[r * 2 + 1].q)
-	SALSA20_8_XOR_MEM(Bin2[r * 2 + 1].q, Bout[r * 2 + 1].q)
+	XOR4(Bin1[r * 2 + 1])
+	SALSA20_8_XOR_MEM(Bin2[r * 2 + 1], Bout[r * 2 + 1])
 
-	return _mm_cvtsi128_si32(X0);
+	return INTEGERIFY(X0);
 }
 
 static uint32_t
@@ -436,7 +634,7 @@ blockmix_xor(const salsa20_blk_t *restri
 {
 	const uint8_t * S0 = (const uint8_t *)S;
 	const uint8_t * S1 = (const uint8_t *)S + Sbytes / 2;
-	__m128i X0, X1, X2, X3;
+	DECL_X
 	size_t i;
 
 	/* Convert 128-byte blocks to 64-byte blocks */
@@ -464,44 +662,55 @@ blockmix_xor(const salsa20_blk_t *restri
 	PREFETCH_OUT(&Bout[r], _MM_HINT_T0);
 
 	/* 2: X <-- B'_{r_1 - 1} */
-	XOR4_2(Bin1[r].q, Bin2[r].q)
+	XOR4_2(Bin1[r], Bin2[r])
 
 	/* 3: for i = 0 to r_1 - 1 do */
 	for (i = 0; i < r; i++) {
 		/* 5: X <-- X \xor B'_i */
-		XOR4(Bin1[i].q)
-		XOR4(Bin2[i].q)
+		XOR4(Bin1[i])
+		XOR4(Bin2[i])
 		/* 7: X <-- pwxform(X) */
 		PWXFORM
 		/* 8: B'_i <-- X */
-		OUT(Bout[i].q)
+		OUT(Bout[i])
 	}
 
 	/* Last iteration of the loop above */
 
 	/* 5: X <-- X \xor B'_i */
-	XOR4(Bin1[i].q)
-	XOR4(Bin2[i].q)
+	XOR4(Bin1[i])
+	XOR4(Bin2[i])
 	/* 7: X <-- pwxform(X) */
 	PWXFORM
 	/* 11: B_i <-- H(B_i) */
-	SALSA20_8(Bout[i].q)
+	SALSA20_8(Bout[i])
 
-	return _mm_cvtsi128_si32(X0);
+	return INTEGERIFY(X0);
}
 
 #undef XOR4
+
+#ifdef __AVX2__
 #define XOR4(in, out) \
-	(out)[0] = Y0 = _mm_xor_si128((in)[0], (out)[0]); \
-	(out)[1] = Y1 = _mm_xor_si128((in)[1], (out)[1]); \
-	(out)[2] = Y2 = _mm_xor_si128((in)[2], (out)[2]); \
-	(out)[3] = Y3 = _mm_xor_si128((in)[3], (out)[3]);
+	(out).o[0] = Y0 = _mm256_xor_si256((in).o[0], (out).o[0]); \
+	(out).o[1] = Y1 = _mm256_xor_si256((in).o[1], (out).o[1]);
+
+#define XOR4_Y \
+	X0 = _mm256_xor_si256(X0, Y0); \
+	X1 = _mm256_xor_si256(X1, Y1);
+#else
+#define XOR4(in, out) \
+	(out).q[0] = Y0 = _mm_xor_si128((in).q[0], (out).q[0]); \
+	(out).q[1] = Y1 = _mm_xor_si128((in).q[1], (out).q[1]); \
+	(out).q[2] = Y2 = _mm_xor_si128((in).q[2], (out).q[2]); \
+	(out).q[3] = Y3 = _mm_xor_si128((in).q[3], (out).q[3]);
 
 #define XOR4_Y \
 	X0 = _mm_xor_si128(X0, Y0); \
 	X1 = _mm_xor_si128(X1, Y1); \
 	X2 = _mm_xor_si128(X2, Y2); \
 	X3 = _mm_xor_si128(X3, Y3);
+#endif
 
 static uint32_t
 blockmix_xor_save(const salsa20_blk_t *restrict Bin1,
@@ -510,7 +719,8 @@ blockmix_xor_save(const salsa20_blk_t *r
 {
 	const uint8_t * S0 = (const uint8_t *)S;
 	const uint8_t * S1 = (const uint8_t *)S + Sbytes / 2;
-	__m128i X0, X1, X2, X3, Y0, Y1, Y2, Y3;
+	DECL_X
+	DECL_Y
 	size_t i;
 
 	/* Convert 128-byte blocks to 64-byte blocks */
@@ -528,30 +738,30 @@ blockmix_xor_save(const salsa20_blk_t *r
 	PREFETCH_OUT(&Bout[r], _MM_HINT_T0);
 
 	/* 2: X <-- B'_{r_1 - 1} */
-	XOR4_2(Bin1[r].q, Bin2[r].q)
+	XOR4_2(Bin1[r], Bin2[r])
 
 	/* 3: for i = 0 to r_1 - 1 do */
 	for (i = 0; i < r; i++) {
-		XOR4(Bin1[i].q, Bin2[i].q)
+		XOR4(Bin1[i], Bin2[i])
 		/* 5: X <-- X \xor B'_i */
 		XOR4_Y
 		/* 7: X <-- pwxform(X) */
 		PWXFORM
 		/* 8: B'_i <-- X */
-		OUT(Bout[i].q)
+		OUT(Bout[i])
 	}
 
 	/* Last iteration of the loop above */
 
-	XOR4(Bin1[i].q, Bin2[i].q)
+	XOR4(Bin1[i], Bin2[i])
 	/* 5: X <-- X \xor B'_i */
 	XOR4_Y
 	/* 7: X <-- pwxform(X) */
 	PWXFORM
 	/* 11: B_i <-- H(B_i) */
-	SALSA20_8(Bout[i].q)
+	SALSA20_8(Bout[i])
 
-	return _mm_cvtsi128_si32(X0);
+	return INTEGERIFY(X0);
 }
 
 #undef ARX
@@ -561,6 +771,10 @@ blockmix_xor_save(const salsa20_blk_t *r
 #undef SALSA20_8_XOR_ANY
 #undef SALSA20_8_XOR_MEM
 #undef SALSA20_8_XOR_REG
+#undef DECL_X
+#undef DECL_Y
+#undef READ_X
+#undef INTEGERIFY
 #undef PWXFORM_X_T
 #undef PWXFORM_SIMD
 #undef PWXFORM_ROUND
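
Note on the tests.c hunk (not part of the patch itself): AVX2 loads and stores of __m256i through the new .o[] member of salsa20_blk_t fault unless the ROM buffer is 32-byte aligned, and malloc() only guarantees 16-byte alignment on many systems. Below is a minimal standalone sketch of the same over-allocate-and-round idiom, for readers who want to reuse it; the function alloc_aligned32 and its parameter names are made up here for illustration:

#include <stdint.h>
#include <stdlib.h>

/*
 * Over-allocate by 31 bytes, advance by the worst case, then round the
 * pointer down to the previous 32-byte boundary, which is guaranteed to
 * lie within the allocation.  *base keeps the original pointer so the
 * buffer can still be passed to free().
 */
static void *alloc_aligned32(size_t size, void **base)
{
	uint8_t *p = malloc(size + 31);

	if (!p)
		return NULL;
	*base = p;			/* free(*base) later, not the return value */
	p += 31;			/* worst-case advance */
	p -= (uintptr_t)p & 31;		/* round down to a 32-byte boundary */
	return p;
}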