| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949 |
- #include <stdint.h>
- #include <string.h>
- #include "../onetimeauth_poly1305.h"
- #include "crypto_verify_16.h"
- #include "poly1305_sse2.h"
- #include "private/common.h"
- #include "private/sse2_64_32.h"
- #include "utils.h"
- #if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
- # ifdef __GNUC__
- # pragma GCC target("sse2")
- # endif
- # include <emmintrin.h>
- typedef __m128i xmmi;
- # if defined(_MSC_VER)
- # define POLY1305_NOINLINE __declspec(noinline)
- # elif defined(__clang__) || defined(__GNUC__)
- # define POLY1305_NOINLINE __attribute__((noinline))
- # else
- # define POLY1305_NOINLINE
- # endif
- # define poly1305_block_size 32
- enum poly1305_state_flags_t {
- poly1305_started = 1,
- poly1305_final_shift8 = 4,
- poly1305_final_shift16 = 8,
- poly1305_final_r2_r = 16, /* use [r^2,r] for the final block */
- poly1305_final_r_1 = 32 /* use [r,1] for the final block */
- };
- typedef struct poly1305_state_internal_t {
- union {
- uint64_t h[3];
- uint32_t hh[10];
- } H; /* 40 bytes */
- uint32_t R[5]; /* 20 bytes */
- uint32_t R2[5]; /* 20 bytes */
- uint32_t R4[5]; /* 20 bytes */
- uint64_t pad[2]; /* 16 bytes */
- uint64_t flags; /* 8 bytes */
- unsigned long long leftover; /* 8 bytes */
- unsigned char buffer[poly1305_block_size]; /* 32 bytes */
- } poly1305_state_internal_t; /* 164 bytes total */
- /*
- * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
- * totally fine, even though this intrinsic requires a __m128i* input.
- * This confuses dynamic analysis, so force alignment, only in debug mode.
- */
- # ifdef DEBUG
- static xmmi
- _fakealign_mm_loadl_epi64(const void *m)
- {
- xmmi tmp;
- memcpy(&tmp, m, 8);
- return _mm_loadl_epi64(&tmp);
- }
- # define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
- #endif
- /* copy 0-31 bytes */
- static inline void
- poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
- unsigned long long bytes)
- {
- if (bytes & 16) {
- _mm_store_si128((xmmi *) (void *) dst,
- _mm_loadu_si128((const xmmi *) (const void *) src));
- src += 16;
- dst += 16;
- }
- if (bytes & 8) {
- memcpy(dst, src, 8);
- src += 8;
- dst += 8;
- }
- if (bytes & 4) {
- memcpy(dst, src, 4);
- src += 4;
- dst += 4;
- }
- if (bytes & 2) {
- memcpy(dst, src, 2);
- src += 2;
- dst += 2;
- }
- if (bytes & 1) {
- *dst = *src;
- }
- }
- static POLY1305_NOINLINE void
- poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
- unsigned long long bytes)
- {
- uint32_t *R;
- uint128_t d[3];
- uint64_t r0, r1, r2;
- uint64_t rt0, rt1, rt2, st2, c;
- uint64_t t0, t1;
- unsigned long long i;
- if (!bytes) {
- bytes = ~(unsigned long long) 0;
- }
- /* H = 0 */
- _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());
- /* clamp key */
- memcpy(&t0, key, 8);
- memcpy(&t1, key + 8, 8);
- r0 = t0 & 0xffc0fffffff;
- t0 >>= 44;
- t0 |= t1 << 20;
- r1 = t0 & 0xfffffc0ffff;
- t1 >>= 24;
- r2 = t1 & 0x00ffffffc0f;
- /* r^1 */
- R = st->R;
- R[0] = (uint32_t)(r0) &0x3ffffff;
- R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
- R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
- R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
- R[4] = (uint32_t)((r2 >> 16));
- /* save pad */
- memcpy(&st->pad[0], key + 16, 8);
- memcpy(&st->pad[1], key + 24, 8);
- rt0 = r0;
- rt1 = r1;
- rt2 = r2;
- /* r^2, r^4 */
- for (i = 0; i < 2; i++) {
- if (i == 0) {
- R = st->R2;
- if (bytes <= 16) {
- break;
- }
- } else if (i == 1) {
- R = st->R4;
- if (bytes < 96) {
- break;
- }
- }
- st2 = rt2 * (5 << 2);
- d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
- d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
- d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
- rt0 = (uint64_t) d[0] & 0xfffffffffff;
- c = (uint64_t)(d[0] >> 44);
- d[1] += c;
- rt1 = (uint64_t) d[1] & 0xfffffffffff;
- c = (uint64_t)(d[1] >> 44);
- d[2] += c;
- rt2 = (uint64_t) d[2] & 0x3ffffffffff;
- c = (uint64_t)(d[2] >> 42);
- rt0 += c * 5;
- c = (rt0 >> 44);
- rt0 = rt0 & 0xfffffffffff;
- rt1 += c;
- c = (rt1 >> 44);
- rt1 = rt1 & 0xfffffffffff;
- rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely, and
- is safe to multiply with */
- R[0] = (uint32_t)(rt0) &0x3ffffff;
- R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
- R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
- R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
- R[4] = (uint32_t)((rt2 >> 16));
- }
- st->flags = 0;
- st->leftover = 0U;
- }
- static POLY1305_NOINLINE void
- poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
- unsigned long long bytes)
- {
- CRYPTO_ALIGN(64)
- xmmi HIBIT =
- _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
- const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
- _MM_SHUFFLE(1, 0, 1, 0));
- const xmmi FIVE =
- _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
- xmmi H0, H1, H2, H3, H4;
- xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
- xmmi M0, M1, M2, M3, M4;
- xmmi M5, M6, M7, M8;
- xmmi C1, C2;
- xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
- xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;
- if (st->flags & poly1305_final_shift8) {
- HIBIT = _mm_srli_si128(HIBIT, 8);
- }
- if (st->flags & poly1305_final_shift16) {
- HIBIT = _mm_setzero_si128();
- }
- if (!(st->flags & poly1305_started)) {
- /* H = [Mx,My] */
- T5 = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
- _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
- T6 = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
- _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
- H0 = _mm_and_si128(MMASK, T5);
- H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
- H2 = _mm_and_si128(MMASK, T5);
- H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- H4 = _mm_srli_epi64(T6, 40);
- H4 = _mm_or_si128(H4, HIBIT);
- m += 32;
- bytes -= 32;
- st->flags |= poly1305_started;
- } else {
- T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
- T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
- T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
- H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
- H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
- H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
- H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
- H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
- }
- if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
- if (st->flags & poly1305_final_r2_r) {
- /* use [r^2, r] */
- T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
- T3 = _mm_cvtsi32_si128(st->R[4]);
- T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
- T1 = _mm_cvtsi32_si128(st->R2[4]);
- T4 = _mm_unpacklo_epi32(T0, T2);
- T5 = _mm_unpackhi_epi32(T0, T2);
- R24 = _mm_unpacklo_epi64(T1, T3);
- } else {
- /* use [r^1, 1] */
- T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
- T1 = _mm_cvtsi32_si128(st->R[4]);
- T2 = _mm_cvtsi32_si128(1);
- T4 = _mm_unpacklo_epi32(T0, T2);
- T5 = _mm_unpackhi_epi32(T0, T2);
- R24 = T1;
- }
- R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
- R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
- R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
- R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
- } else {
- /* use [r^2, r^2] */
- T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
- T1 = _mm_cvtsi32_si128(st->R2[4]);
- R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
- R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
- R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
- R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
- R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
- }
- S21 = _mm_mul_epu32(R21, FIVE);
- S22 = _mm_mul_epu32(R22, FIVE);
- S23 = _mm_mul_epu32(R23, FIVE);
- S24 = _mm_mul_epu32(R24, FIVE);
- if (bytes >= 64) {
- T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
- T1 = _mm_cvtsi32_si128(st->R4[4]);
- R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
- R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
- R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
- R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
- R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
- S41 = _mm_mul_epu32(R41, FIVE);
- S42 = _mm_mul_epu32(R42, FIVE);
- S43 = _mm_mul_epu32(R43, FIVE);
- S44 = _mm_mul_epu32(R44, FIVE);
- while (bytes >= 64) {
- xmmi v00, v01, v02, v03, v04;
- xmmi v10, v11, v12, v13, v14;
- xmmi v20, v21, v22, v23, v24;
- xmmi v30, v31, v32, v33, v34;
- xmmi v40, v41, v42, v43, v44;
- xmmi T14, T15;
- /* H *= [r^4,r^4], preload [Mx,My] */
- T15 = S42;
- T0 = H4;
- T0 = _mm_mul_epu32(T0, S41);
- v01 = H3;
- v01 = _mm_mul_epu32(v01, T15);
- T14 = S43;
- T1 = H4;
- T1 = _mm_mul_epu32(T1, T15);
- v11 = H3;
- v11 = _mm_mul_epu32(v11, T14);
- T2 = H4;
- T2 = _mm_mul_epu32(T2, T14);
- T0 = _mm_add_epi64(T0, v01);
- T15 = S44;
- v02 = H2;
- v02 = _mm_mul_epu32(v02, T14);
- T3 = H4;
- T3 = _mm_mul_epu32(T3, T15);
- T1 = _mm_add_epi64(T1, v11);
- v03 = H1;
- v03 = _mm_mul_epu32(v03, T15);
- v12 = H2;
- v12 = _mm_mul_epu32(v12, T15);
- T0 = _mm_add_epi64(T0, v02);
- T14 = R40;
- v21 = H3;
- v21 = _mm_mul_epu32(v21, T15);
- v31 = H3;
- v31 = _mm_mul_epu32(v31, T14);
- T0 = _mm_add_epi64(T0, v03);
- T4 = H4;
- T4 = _mm_mul_epu32(T4, T14);
- T1 = _mm_add_epi64(T1, v12);
- v04 = H0;
- v04 = _mm_mul_epu32(v04, T14);
- T2 = _mm_add_epi64(T2, v21);
- v13 = H1;
- v13 = _mm_mul_epu32(v13, T14);
- T3 = _mm_add_epi64(T3, v31);
- T15 = R41;
- v22 = H2;
- v22 = _mm_mul_epu32(v22, T14);
- v32 = H2;
- v32 = _mm_mul_epu32(v32, T15);
- T0 = _mm_add_epi64(T0, v04);
- v41 = H3;
- v41 = _mm_mul_epu32(v41, T15);
- T1 = _mm_add_epi64(T1, v13);
- v14 = H0;
- v14 = _mm_mul_epu32(v14, T15);
- T2 = _mm_add_epi64(T2, v22);
- T14 = R42;
- T5 = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
- _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
- v23 = H1;
- v23 = _mm_mul_epu32(v23, T15);
- T3 = _mm_add_epi64(T3, v32);
- v33 = H1;
- v33 = _mm_mul_epu32(v33, T14);
- T4 = _mm_add_epi64(T4, v41);
- v42 = H2;
- v42 = _mm_mul_epu32(v42, T14);
- T1 = _mm_add_epi64(T1, v14);
- T15 = R43;
- T6 = _mm_unpacklo_epi64(
- _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
- _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
- v24 = H0;
- v24 = _mm_mul_epu32(v24, T14);
- T2 = _mm_add_epi64(T2, v23);
- v34 = H0;
- v34 = _mm_mul_epu32(v34, T15);
- T3 = _mm_add_epi64(T3, v33);
- M0 = _mm_and_si128(MMASK, T5);
- v43 = H1;
- v43 = _mm_mul_epu32(v43, T15);
- T4 = _mm_add_epi64(T4, v42);
- M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
- v44 = H0;
- v44 = _mm_mul_epu32(v44, R44);
- T2 = _mm_add_epi64(T2, v24);
- T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
- T3 = _mm_add_epi64(T3, v34);
- M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
- T4 = _mm_add_epi64(T4, v43);
- M2 = _mm_and_si128(MMASK, T5);
- T4 = _mm_add_epi64(T4, v44);
- M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
- /* H += [Mx',My'] */
- T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
- T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
- T7 = _mm_unpacklo_epi32(T5, T6);
- T8 = _mm_unpackhi_epi32(T5, T6);
- M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
- M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
- M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
- M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
- M6 = _mm_slli_epi64(M6, 6);
- M7 = _mm_slli_epi64(M7, 12);
- M8 = _mm_slli_epi64(M8, 18);
- T0 = _mm_add_epi64(T0, M5);
- T1 = _mm_add_epi64(T1, M6);
- T2 = _mm_add_epi64(T2, M7);
- T3 = _mm_add_epi64(T3, M8);
- T4 = _mm_add_epi64(T4, HIBIT);
- /* H += [Mx,My]*[r^2,r^2] */
- T15 = S22;
- v00 = M4;
- v00 = _mm_mul_epu32(v00, S21);
- v01 = M3;
- v01 = _mm_mul_epu32(v01, T15);
- T14 = S23;
- v10 = M4;
- v10 = _mm_mul_epu32(v10, T15);
- v11 = M3;
- v11 = _mm_mul_epu32(v11, T14);
- T0 = _mm_add_epi64(T0, v00);
- v20 = M4;
- v20 = _mm_mul_epu32(v20, T14);
- T0 = _mm_add_epi64(T0, v01);
- T15 = S24;
- v02 = M2;
- v02 = _mm_mul_epu32(v02, T14);
- T1 = _mm_add_epi64(T1, v10);
- v30 = M4;
- v30 = _mm_mul_epu32(v30, T15);
- T1 = _mm_add_epi64(T1, v11);
- v03 = M1;
- v03 = _mm_mul_epu32(v03, T15);
- T2 = _mm_add_epi64(T2, v20);
- v12 = M2;
- v12 = _mm_mul_epu32(v12, T15);
- T0 = _mm_add_epi64(T0, v02);
- T14 = R20;
- v21 = M3;
- v21 = _mm_mul_epu32(v21, T15);
- T3 = _mm_add_epi64(T3, v30);
- v31 = M3;
- v31 = _mm_mul_epu32(v31, T14);
- T0 = _mm_add_epi64(T0, v03);
- v40 = M4;
- v40 = _mm_mul_epu32(v40, T14);
- T1 = _mm_add_epi64(T1, v12);
- v04 = M0;
- v04 = _mm_mul_epu32(v04, T14);
- T2 = _mm_add_epi64(T2, v21);
- v13 = M1;
- v13 = _mm_mul_epu32(v13, T14);
- T3 = _mm_add_epi64(T3, v31);
- T15 = R21;
- v22 = M2;
- v22 = _mm_mul_epu32(v22, T14);
- T4 = _mm_add_epi64(T4, v40);
- v32 = M2;
- v32 = _mm_mul_epu32(v32, T15);
- T0 = _mm_add_epi64(T0, v04);
- v41 = M3;
- v41 = _mm_mul_epu32(v41, T15);
- T1 = _mm_add_epi64(T1, v13);
- v14 = M0;
- v14 = _mm_mul_epu32(v14, T15);
- T2 = _mm_add_epi64(T2, v22);
- T14 = R22;
- v23 = M1;
- v23 = _mm_mul_epu32(v23, T15);
- T3 = _mm_add_epi64(T3, v32);
- v33 = M1;
- v33 = _mm_mul_epu32(v33, T14);
- T4 = _mm_add_epi64(T4, v41);
- v42 = M2;
- v42 = _mm_mul_epu32(v42, T14);
- T1 = _mm_add_epi64(T1, v14);
- T15 = R23;
- v24 = M0;
- v24 = _mm_mul_epu32(v24, T14);
- T2 = _mm_add_epi64(T2, v23);
- v34 = M0;
- v34 = _mm_mul_epu32(v34, T15);
- T3 = _mm_add_epi64(T3, v33);
- v43 = M1;
- v43 = _mm_mul_epu32(v43, T15);
- T4 = _mm_add_epi64(T4, v42);
- v44 = M0;
- v44 = _mm_mul_epu32(v44, R24);
- T2 = _mm_add_epi64(T2, v24);
- T3 = _mm_add_epi64(T3, v34);
- T4 = _mm_add_epi64(T4, v43);
- T4 = _mm_add_epi64(T4, v44);
- /* reduce */
- C1 = _mm_srli_epi64(T0, 26);
- C2 = _mm_srli_epi64(T3, 26);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_and_si128(T3, MMASK);
- T1 = _mm_add_epi64(T1, C1);
- T4 = _mm_add_epi64(T4, C2);
- C1 = _mm_srli_epi64(T1, 26);
- C2 = _mm_srli_epi64(T4, 26);
- T1 = _mm_and_si128(T1, MMASK);
- T4 = _mm_and_si128(T4, MMASK);
- T2 = _mm_add_epi64(T2, C1);
- T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
- C1 = _mm_srli_epi64(T2, 26);
- C2 = _mm_srli_epi64(T0, 26);
- T2 = _mm_and_si128(T2, MMASK);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_add_epi64(T3, C1);
- T1 = _mm_add_epi64(T1, C2);
- C1 = _mm_srli_epi64(T3, 26);
- T3 = _mm_and_si128(T3, MMASK);
- T4 = _mm_add_epi64(T4, C1);
- /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
- H0 = T0;
- H1 = T1;
- H2 = T2;
- H3 = T3;
- H4 = T4;
- m += 64;
- bytes -= 64;
- }
- }
- if (bytes >= 32) {
- xmmi v01, v02, v03, v04;
- xmmi v11, v12, v13, v14;
- xmmi v21, v22, v23, v24;
- xmmi v31, v32, v33, v34;
- xmmi v41, v42, v43, v44;
- xmmi T14, T15;
- /* H *= [r^2,r^2] */
- T15 = S22;
- T0 = H4;
- T0 = _mm_mul_epu32(T0, S21);
- v01 = H3;
- v01 = _mm_mul_epu32(v01, T15);
- T14 = S23;
- T1 = H4;
- T1 = _mm_mul_epu32(T1, T15);
- v11 = H3;
- v11 = _mm_mul_epu32(v11, T14);
- T2 = H4;
- T2 = _mm_mul_epu32(T2, T14);
- T0 = _mm_add_epi64(T0, v01);
- T15 = S24;
- v02 = H2;
- v02 = _mm_mul_epu32(v02, T14);
- T3 = H4;
- T3 = _mm_mul_epu32(T3, T15);
- T1 = _mm_add_epi64(T1, v11);
- v03 = H1;
- v03 = _mm_mul_epu32(v03, T15);
- v12 = H2;
- v12 = _mm_mul_epu32(v12, T15);
- T0 = _mm_add_epi64(T0, v02);
- T14 = R20;
- v21 = H3;
- v21 = _mm_mul_epu32(v21, T15);
- v31 = H3;
- v31 = _mm_mul_epu32(v31, T14);
- T0 = _mm_add_epi64(T0, v03);
- T4 = H4;
- T4 = _mm_mul_epu32(T4, T14);
- T1 = _mm_add_epi64(T1, v12);
- v04 = H0;
- v04 = _mm_mul_epu32(v04, T14);
- T2 = _mm_add_epi64(T2, v21);
- v13 = H1;
- v13 = _mm_mul_epu32(v13, T14);
- T3 = _mm_add_epi64(T3, v31);
- T15 = R21;
- v22 = H2;
- v22 = _mm_mul_epu32(v22, T14);
- v32 = H2;
- v32 = _mm_mul_epu32(v32, T15);
- T0 = _mm_add_epi64(T0, v04);
- v41 = H3;
- v41 = _mm_mul_epu32(v41, T15);
- T1 = _mm_add_epi64(T1, v13);
- v14 = H0;
- v14 = _mm_mul_epu32(v14, T15);
- T2 = _mm_add_epi64(T2, v22);
- T14 = R22;
- v23 = H1;
- v23 = _mm_mul_epu32(v23, T15);
- T3 = _mm_add_epi64(T3, v32);
- v33 = H1;
- v33 = _mm_mul_epu32(v33, T14);
- T4 = _mm_add_epi64(T4, v41);
- v42 = H2;
- v42 = _mm_mul_epu32(v42, T14);
- T1 = _mm_add_epi64(T1, v14);
- T15 = R23;
- v24 = H0;
- v24 = _mm_mul_epu32(v24, T14);
- T2 = _mm_add_epi64(T2, v23);
- v34 = H0;
- v34 = _mm_mul_epu32(v34, T15);
- T3 = _mm_add_epi64(T3, v33);
- v43 = H1;
- v43 = _mm_mul_epu32(v43, T15);
- T4 = _mm_add_epi64(T4, v42);
- v44 = H0;
- v44 = _mm_mul_epu32(v44, R24);
- T2 = _mm_add_epi64(T2, v24);
- T3 = _mm_add_epi64(T3, v34);
- T4 = _mm_add_epi64(T4, v43);
- T4 = _mm_add_epi64(T4, v44);
- /* H += [Mx,My] */
- if (m) {
- T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
- T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
- T7 = _mm_unpacklo_epi32(T5, T6);
- T8 = _mm_unpackhi_epi32(T5, T6);
- M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
- M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
- M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
- M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
- M1 = _mm_slli_epi64(M1, 6);
- M2 = _mm_slli_epi64(M2, 12);
- M3 = _mm_slli_epi64(M3, 18);
- T0 = _mm_add_epi64(T0, M0);
- T1 = _mm_add_epi64(T1, M1);
- T2 = _mm_add_epi64(T2, M2);
- T3 = _mm_add_epi64(T3, M3);
- T4 = _mm_add_epi64(T4, HIBIT);
- }
- /* reduce */
- C1 = _mm_srli_epi64(T0, 26);
- C2 = _mm_srli_epi64(T3, 26);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_and_si128(T3, MMASK);
- T1 = _mm_add_epi64(T1, C1);
- T4 = _mm_add_epi64(T4, C2);
- C1 = _mm_srli_epi64(T1, 26);
- C2 = _mm_srli_epi64(T4, 26);
- T1 = _mm_and_si128(T1, MMASK);
- T4 = _mm_and_si128(T4, MMASK);
- T2 = _mm_add_epi64(T2, C1);
- T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
- C1 = _mm_srli_epi64(T2, 26);
- C2 = _mm_srli_epi64(T0, 26);
- T2 = _mm_and_si128(T2, MMASK);
- T0 = _mm_and_si128(T0, MMASK);
- T3 = _mm_add_epi64(T3, C1);
- T1 = _mm_add_epi64(T1, C2);
- C1 = _mm_srli_epi64(T3, 26);
- T3 = _mm_and_si128(T3, MMASK);
- T4 = _mm_add_epi64(T4, C1);
- /* H = (H*[r^2,r^2] + [Mx,My]) */
- H0 = T0;
- H1 = T1;
- H2 = T2;
- H3 = T3;
- H4 = T4;
- }
- if (m) {
- T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
- T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
- T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
- T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
- T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
- T0 = _mm_unpacklo_epi64(T0, T1);
- T1 = _mm_unpacklo_epi64(T2, T3);
- _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
- _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
- _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
- } else {
- uint32_t t0, t1, t2, t3, t4, b;
- uint64_t h0, h1, h2, g0, g1, g2, c, nc;
- /* H = H[0]+H[1] */
- T0 = H0;
- T1 = H1;
- T2 = H2;
- T3 = H3;
- T4 = H4;
- T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
- T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
- T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
- T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
- T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
- t0 = _mm_cvtsi128_si32(T0);
- b = (t0 >> 26);
- t0 &= 0x3ffffff;
- t1 = _mm_cvtsi128_si32(T1) + b;
- b = (t1 >> 26);
- t1 &= 0x3ffffff;
- t2 = _mm_cvtsi128_si32(T2) + b;
- b = (t2 >> 26);
- t2 &= 0x3ffffff;
- t3 = _mm_cvtsi128_si32(T3) + b;
- b = (t3 >> 26);
- t3 &= 0x3ffffff;
- t4 = _mm_cvtsi128_si32(T4) + b;
- /* everything except t4 is in range, so this is all safe */
- h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
- h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
- ((uint64_t) t3 << 34)) &
- 0xfffffffffffull;
- h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));
- c = (h2 >> 42);
- h2 &= 0x3ffffffffff;
- h0 += c * 5;
- c = (h0 >> 44);
- h0 &= 0xfffffffffff;
- h1 += c;
- c = (h1 >> 44);
- h1 &= 0xfffffffffff;
- h2 += c;
- c = (h2 >> 42);
- h2 &= 0x3ffffffffff;
- h0 += c * 5;
- c = (h0 >> 44);
- h0 &= 0xfffffffffff;
- h1 += c;
- g0 = h0 + 5;
- c = (g0 >> 44);
- g0 &= 0xfffffffffff;
- g1 = h1 + c;
- c = (g1 >> 44);
- g1 &= 0xfffffffffff;
- g2 = h2 + c - ((uint64_t) 1 << 42);
- c = (g2 >> 63) - 1;
- nc = ~c;
- h0 = (h0 & nc) | (g0 & c);
- h1 = (h1 & nc) | (g1 & c);
- h2 = (h2 & nc) | (g2 & c);
- st->H.h[0] = h0;
- st->H.h[1] = h1;
- st->H.h[2] = h2;
- }
- }
- static void
- poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
- unsigned long long bytes)
- {
- unsigned long long i;
- /* handle leftover */
- if (st->leftover) {
- unsigned long long want = (poly1305_block_size - st->leftover);
- if (want > bytes) {
- want = bytes;
- }
- for (i = 0; i < want; i++) {
- st->buffer[st->leftover + i] = m[i];
- }
- bytes -= want;
- m += want;
- st->leftover += want;
- if (st->leftover < poly1305_block_size) {
- return;
- }
- poly1305_blocks(st, st->buffer, poly1305_block_size);
- st->leftover = 0;
- }
- /* process full blocks */
- if (bytes >= poly1305_block_size) {
- unsigned long long want = (bytes & ~(poly1305_block_size - 1));
- poly1305_blocks(st, m, want);
- m += want;
- bytes -= want;
- }
- /* store leftover */
- if (bytes) {
- for (i = 0; i < bytes; i++) {
- st->buffer[st->leftover + i] = m[i];
- }
- st->leftover += bytes;
- }
- }
- static POLY1305_NOINLINE void
- poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
- unsigned long long leftover, unsigned char mac[16])
- {
- uint64_t h0, h1, h2;
- if (leftover) {
- CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };
- poly1305_block_copy31(final, m, leftover);
- if (leftover != 16) {
- final[leftover] = 1;
- }
- st->flags |=
- (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
- poly1305_blocks(st, final, 32);
- }
- if (st->flags & poly1305_started) {
- /* finalize, H *= [r^2,r], or H *= [r,1] */
- if (!leftover || (leftover > 16)) {
- st->flags |= poly1305_final_r2_r;
- } else {
- st->flags |= poly1305_final_r_1;
- }
- poly1305_blocks(st, NULL, 32);
- }
- h0 = st->H.h[0];
- h1 = st->H.h[1];
- h2 = st->H.h[2];
- /* pad */
- h0 = ((h0) | (h1 << 44));
- h1 = ((h1 >> 20) | (h2 << 24));
- #ifdef HAVE_AMD64_ASM
- __asm__ __volatile__(
- "addq %2, %0 ;\n"
- "adcq %3, %1 ;\n"
- : "+r"(h0), "+r"(h1)
- : "r"(st->pad[0]), "r"(st->pad[1])
- : "flags", "cc");
- #else
- {
- uint128_t h;
- memcpy(&h, &st->pad[0], 16);
- h += ((uint128_t) h1 << 64) | h0;
- h0 = (uint64_t) h;
- h1 = (uint64_t)(h >> 64);
- }
- #endif
- _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
- _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());
- memcpy(&mac[0], &h0, 8);
- memcpy(&mac[8], &h1, 8);
- sodium_memzero((void *) st, sizeof *st);
- }
- static void
- poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
- {
- poly1305_finish_ext(st, st->buffer, st->leftover, mac);
- }
- static int
- crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
- const unsigned char *key)
- {
- COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
- sizeof(poly1305_state_internal_t));
- poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);
- return 0;
- }
- static int
- crypto_onetimeauth_poly1305_sse2_update(
- crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
- unsigned long long inlen)
- {
- poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);
- return 0;
- }
- static int
- crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
- unsigned char *out)
- {
- poly1305_finish((poly1305_state_internal_t *) (void *) state, out);
- return 0;
- }
- static int
- crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
- unsigned long long inlen,
- const unsigned char *key)
- {
- CRYPTO_ALIGN(64) poly1305_state_internal_t st;
- unsigned long long blocks;
- poly1305_init_ext(&st, key, inlen);
- blocks = inlen & ~31;
- if (blocks > 0) {
- poly1305_blocks(&st, m, blocks);
- m += blocks;
- inlen -= blocks;
- }
- poly1305_finish_ext(&st, m, inlen, out);
- return 0;
- }
- static int
- crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
- const unsigned char *in,
- unsigned long long inlen,
- const unsigned char *k)
- {
- unsigned char correct[16];
- crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);
- return crypto_verify_16(h, correct);
- }
- struct crypto_onetimeauth_poly1305_implementation
- crypto_onetimeauth_poly1305_sse2_implementation = {
- SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
- SODIUM_C99(.onetimeauth_verify =)
- crypto_onetimeauth_poly1305_sse2_verify,
- SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
- SODIUM_C99(.onetimeauth_update =)
- crypto_onetimeauth_poly1305_sse2_update,
- SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
- };
- #endif
|