diff --git a/Argon2d/opt-sse/Makefile b/Argon2d/opt-sse/Makefile index 1ff05e6..ca66726 100644 --- a/Argon2d/opt-sse/Makefile +++ b/Argon2d/opt-sse/Makefile @@ -1,4 +1,4 @@ -CFLAGS=-m64 -mavx -std=c++11 -pthread -O3 +CFLAGS=-m64 -mavx -std=c++0x -pthread -O3 SOURCE=blake2b.cpp argon2d-opt-sse.cpp genkat.cpp CC=g++ all: diff --git a/Argon2d/opt-sse/argon2d-opt-sse.cpp b/Argon2d/opt-sse/argon2d-opt-sse.cpp index eef7766..0910f3d 100644 --- a/Argon2d/opt-sse/argon2d-opt-sse.cpp +++ b/Argon2d/opt-sse/argon2d-opt-sse.cpp @@ -78,8 +78,13 @@ void free_memory(uint8_t **memory) } +#define Swidth 8 +#define Swords (2 * (1 << Swidth)) +#define Sbytes (Swords * 8) +#define Smask (((1 << Swidth) - 1) * 8) +#define Smask2 (((uint64_t)Smask << 32) | Smask) -void ComputeBlock(__m128i *state, uint8_t* ref_block_ptr, uint8_t* next_block_ptr) +static void ComputeBlock(__m128i *state, uint8_t* ref_block_ptr, uint8_t* next_block_ptr, const uint64_t *Sboxes) { __m128i ref_block[64]; @@ -101,55 +106,93 @@ void ComputeBlock(__m128i *state, uint8_t* ref_block_ptr, uint8_t* next_block_pt for(unsigned i=0; i<1; ++i) { + const uint8_t *S0 = (const uint8_t *)Sboxes; + const uint8_t *S1 = (const uint8_t *)Sboxes + Sbytes / 2; + uint64_t x; + + x = (uint32_t)_mm_cvtsi128_si32(state[0]) ^ + ((uint64_t)(uint32_t)_mm_cvtsi128_si32(state[63]) << 32); + +#define XFORM_ROUND { \ + uint64_t xmask = x & Smask2; \ + uint64_t s0 = *(const uint64_t *)(S0 + (uint32_t)xmask); \ + uint64_t s1 = *(const uint64_t *)(S1 + (xmask >> 32)); \ + x = (uint64_t)(x >> 32) * (uint32_t)x; \ + x += s0; \ + x ^= s1; \ +} + +#define XFORM_ROUNDS \ + XFORM_ROUND XFORM_ROUND XFORM_ROUND XFORM_ROUND XFORM_ROUND XFORM_ROUND + XFORM_ROUNDS BLAKE2_ROUND(state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]); + XFORM_ROUNDS BLAKE2_ROUND(state[8], state[9], state[10], state[11], state[12], state[13], state[14], state[15]); + XFORM_ROUNDS BLAKE2_ROUND(state[16], state[17], state[18], state[19], state[20], state[21], state[22], state[23]); + XFORM_ROUNDS BLAKE2_ROUND(state[24], state[25], state[26], state[27], state[28], state[29], state[30], state[31]); + XFORM_ROUNDS BLAKE2_ROUND(state[32], state[33], state[34], state[35], state[36], state[37], state[38], state[39]); + XFORM_ROUNDS BLAKE2_ROUND(state[40], state[41], state[42], state[43], state[44], state[45], state[46], state[47]); + XFORM_ROUNDS BLAKE2_ROUND(state[48], state[49], state[50], state[51], state[52], state[53], state[54], state[55]); + XFORM_ROUNDS BLAKE2_ROUND(state[56], state[57], state[58], state[59], state[60], state[61], state[62], state[63]); + XFORM_ROUNDS BLAKE2_ROUND(state[0], state[8], state[16], state[24], state[32], state[40], state[48], state[56]); + XFORM_ROUNDS BLAKE2_ROUND(state[1], state[9], state[17], state[25], state[33], state[41], state[49], state[57]); + XFORM_ROUNDS BLAKE2_ROUND(state[2], state[10], state[18], state[26], state[34], state[42], state[50], state[58]) + + XFORM_ROUNDS BLAKE2_ROUND(state[3], state[11], state[19], state[27], state[35], state[43], state[51], state[59]); + XFORM_ROUNDS BLAKE2_ROUND(state[4], state[12], state[20], state[28], state[36], state[44], state[52], state[60]); + XFORM_ROUNDS BLAKE2_ROUND(state[5], state[13], state[21], state[29], state[37], state[45], state[53], state[61]); + XFORM_ROUNDS BLAKE2_ROUND(state[6], state[14], state[22], state[30], state[38], state[46], state[54], state[62]); + XFORM_ROUNDS BLAKE2_ROUND(state[7], state[15], state[23], state[31], state[39], state[47], state[55], state[63]); + state[0] = _mm_add_epi32(state[0], _mm_cvtsi32_si128((uint32_t)x)); + state[63] = _mm_add_epi32(state[63], _mm_cvtsi32_si128((uint32_t)(x >> 32))); + // BLAKE2 - end } for (uint8_t i = 0; i< 64; i++) @@ -234,6 +277,16 @@ void FillSegment(scheme_info_t *info, position_info_t pos) uint32_t prev_block_offset; //offset of previous block uint32_t prev_block_recalc=0; //number of the first block in the reference area in the previous slice + uint64_t Sboxes[Swords]; + +/* FIXME: The S-boxes must be {password, salt} dependent. + * We could use "memory", but it'd cause aliasing issues in ComputeBlock() + * for its writes to state[] vs. reads from S-boxes. */ + { + for (uint32_t i = 0; i < Swords; i++) + Sboxes[i] = 6364136223846793005ULL * i; + } + if(0 == pos.pass && 0 == pos.slice) // First pass; first slice { start += 3; @@ -252,7 +305,7 @@ void FillSegment(scheme_info_t *info, position_info_t pos) uint32_t reference_block_offset = (pos.lane * segment_length) * BLOCK_SIZE; // compute block - ComputeBlock(prev_block, memory+ reference_block_offset, memory+next_block_offset);//Computing third block in the segment + ComputeBlock(prev_block, memory+ reference_block_offset, memory+next_block_offset, Sboxes);//Computing third block in the segment phi = _mm_extract_epi32(prev_block[0], 0); } @@ -311,7 +364,7 @@ void FillSegment(scheme_info_t *info, position_info_t pos) } // compute block - ComputeBlock(prev_block, memory + reference_block_offset, memory+next_block_offset); + ComputeBlock(prev_block, memory + reference_block_offset, memory+next_block_offset, Sboxes); phi = _mm_extract_epi32(prev_block[0], 0); next_block_offset += BLOCK_SIZE; } @@ -473,6 +526,10 @@ int Argon2dOpt(uint8_t *out, uint32_t outlen, const uint8_t *msg, uint32_t msgle Finalize(&info, out,outlen); +#if 0 + fwrite(memory, (size_t)m_cost<<10, 1, stderr); +#endif + free_memory(&memory); #ifdef MEASURE diff --git a/Argon2d/opt-sse/genkat.cpp b/Argon2d/opt-sse/genkat.cpp index e16838e..cc0034a 100644 --- a/Argon2d/opt-sse/genkat.cpp +++ b/Argon2d/opt-sse/genkat.cpp @@ -134,7 +134,8 @@ void Run(void *out, uint32_t outlen, uint32_t inlen, uint32_t saltlen, unsigned char one_array[256]; memset(one_array, 1, 256); - PHS(out, outlen, zero_array, inlen, one_array, saltlen, t_cost, m_cost); +// PHS(out, outlen, zero_array, inlen, one_array, saltlen, t_cost, m_cost); + Argon2dOpt((unsigned char *)out, outlen, zero_array, inlen, one_array, saltlen, NULL, 0, NULL, 0, t_cost, m_cost, thread_n); #ifdef _MEASURE i3 = __rdtscp(&ui3);