/* poly1305_sse2.c -- SSE2 implementation of the Poly1305 one-time authenticator. */
  1. #include <stdint.h>
  2. #include <string.h>
  3. #include "../onetimeauth_poly1305.h"
  4. #include "crypto_verify_16.h"
  5. #include "poly1305_sse2.h"
  6. #include "private/common.h"
  7. #include "private/sse2_64_32.h"
  8. #include "utils.h"
  9. #if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)
  10. # ifdef __GNUC__
  11. # pragma GCC target("sse2")
  12. # endif
  13. # include <emmintrin.h>
/* Shorthand for the 128-bit SSE2 integer vector type used throughout. */
typedef __m128i xmmi;

/*
 * POLY1305_NOINLINE: compiler-specific "do not inline" function attribute.
 * Expands to nothing on compilers that are neither MSVC, clang nor GCC.
 */
# if defined(_MSC_VER)
# define POLY1305_NOINLINE __declspec(noinline)
# elif defined(__clang__) || defined(__GNUC__)
# define POLY1305_NOINLINE __attribute__((noinline))
# else
# define POLY1305_NOINLINE
# endif

/*
 * Number of message bytes consumed per vector step: two 16-byte Poly1305
 * blocks, one per 64-bit lane.
 */
# define poly1305_block_size 32
  23. enum poly1305_state_flags_t {
  24. poly1305_started = 1,
  25. poly1305_final_shift8 = 4,
  26. poly1305_final_shift16 = 8,
  27. poly1305_final_r2_r = 16, /* use [r^2,r] for the final block */
  28. poly1305_final_r_1 = 32 /* use [r,1] for the final block */
  29. };
/*
 * Internal Poly1305 state. The public wrappers cast a
 * crypto_onetimeauth_poly1305_state pointer to this type, so it must not be
 * larger than that opaque type (checked by COMPILER_ASSERT in the init
 * wrapper).
 */
typedef struct poly1305_state_internal_t {
    union {
        /* accumulator as three limbs; written by the final
         * poly1305_blocks() call and read by poly1305_finish_ext() */
        uint64_t h[3];
        /* accumulator as five 32-bit limbs for each of the two lanes,
         * as stored between poly1305_blocks() calls */
        uint32_t hh[10];
    } H; /* 40 bytes */
    uint32_t R[5]; /* 20 bytes: r as five 26-bit limbs */
    uint32_t R2[5]; /* 20 bytes: r^2, only computed for messages > 16 bytes */
    uint32_t R4[5]; /* 20 bytes: r^4, only computed for messages >= 96 bytes */
    uint64_t pad[2]; /* 16 bytes: key bytes 16..31, added to H at the end */
    uint64_t flags; /* 8 bytes: poly1305_state_flags_t bits */
    unsigned long long leftover; /* 8 bytes: number of bytes held in buffer */
    unsigned char buffer[poly1305_block_size]; /* 32 bytes */
} poly1305_state_internal_t; /* members sum to 164 bytes; sizeof may be
                                larger because of alignment padding
                                (typically before pad) */
  43. /*
  44. * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
  45. * totally fine, even though this intrinsic requires a __m128i* input.
  46. * This confuses dynamic analysis, so force alignment, only in debug mode.
  47. */
  48. # ifdef DEBUG
  49. static xmmi
  50. _fakealign_mm_loadl_epi64(const void *m)
  51. {
  52. xmmi tmp;
  53. memcpy(&tmp, m, 8);
  54. return _mm_loadl_epi64(&tmp);
  55. }
  56. # define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
  57. #endif
  58. /* copy 0-31 bytes */
  59. static inline void
  60. poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
  61. unsigned long long bytes)
  62. {
  63. if (bytes & 16) {
  64. _mm_store_si128((xmmi *) (void *) dst,
  65. _mm_loadu_si128((const xmmi *) (const void *) src));
  66. src += 16;
  67. dst += 16;
  68. }
  69. if (bytes & 8) {
  70. memcpy(dst, src, 8);
  71. src += 8;
  72. dst += 8;
  73. }
  74. if (bytes & 4) {
  75. memcpy(dst, src, 4);
  76. src += 4;
  77. dst += 4;
  78. }
  79. if (bytes & 2) {
  80. memcpy(dst, src, 2);
  81. src += 2;
  82. dst += 2;
  83. }
  84. if (bytes & 1) {
  85. *dst = *src;
  86. }
  87. }
  88. static POLY1305_NOINLINE void
  89. poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
  90. unsigned long long bytes)
  91. {
  92. uint32_t *R;
  93. uint128_t d[3];
  94. uint64_t r0, r1, r2;
  95. uint64_t rt0, rt1, rt2, st2, c;
  96. uint64_t t0, t1;
  97. unsigned long long i;
  98. if (!bytes) {
  99. bytes = ~(unsigned long long) 0;
  100. }
  101. /* H = 0 */
  102. _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
  103. _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
  104. _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());
  105. /* clamp key */
  106. memcpy(&t0, key, 8);
  107. memcpy(&t1, key + 8, 8);
  108. r0 = t0 & 0xffc0fffffff;
  109. t0 >>= 44;
  110. t0 |= t1 << 20;
  111. r1 = t0 & 0xfffffc0ffff;
  112. t1 >>= 24;
  113. r2 = t1 & 0x00ffffffc0f;
  114. /* r^1 */
  115. R = st->R;
  116. R[0] = (uint32_t)(r0) &0x3ffffff;
  117. R[1] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
  118. R[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff;
  119. R[3] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
  120. R[4] = (uint32_t)((r2 >> 16));
  121. /* save pad */
  122. memcpy(&st->pad[0], key + 16, 8);
  123. memcpy(&st->pad[1], key + 24, 8);
  124. rt0 = r0;
  125. rt1 = r1;
  126. rt2 = r2;
  127. /* r^2, r^4 */
  128. for (i = 0; i < 2; i++) {
  129. if (i == 0) {
  130. R = st->R2;
  131. if (bytes <= 16) {
  132. break;
  133. }
  134. } else if (i == 1) {
  135. R = st->R4;
  136. if (bytes < 96) {
  137. break;
  138. }
  139. }
  140. st2 = rt2 * (5 << 2);
  141. d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t)(rt1 * 2) * st2);
  142. d[1] = ((uint128_t) rt2 * st2) + ((uint128_t)(rt0 * 2) * rt1);
  143. d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t)(rt2 * 2) * rt0);
  144. rt0 = (uint64_t) d[0] & 0xfffffffffff;
  145. c = (uint64_t)(d[0] >> 44);
  146. d[1] += c;
  147. rt1 = (uint64_t) d[1] & 0xfffffffffff;
  148. c = (uint64_t)(d[1] >> 44);
  149. d[2] += c;
  150. rt2 = (uint64_t) d[2] & 0x3ffffffffff;
  151. c = (uint64_t)(d[2] >> 42);
  152. rt0 += c * 5;
  153. c = (rt0 >> 44);
  154. rt0 = rt0 & 0xfffffffffff;
  155. rt1 += c;
  156. c = (rt1 >> 44);
  157. rt1 = rt1 & 0xfffffffffff;
  158. rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely, and
  159. is safe to multiply with */
  160. R[0] = (uint32_t)(rt0) &0x3ffffff;
  161. R[1] = (uint32_t)((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
  162. R[2] = (uint32_t)((rt1 >> 8)) & 0x3ffffff;
  163. R[3] = (uint32_t)((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
  164. R[4] = (uint32_t)((rt2 >> 16));
  165. }
  166. st->flags = 0;
  167. st->leftover = 0U;
  168. }
/*
 * Absorb message blocks into the accumulator, two 16-byte blocks at a time
 * (one per 64-bit SIMD lane), using five 26-bit limbs per lane.
 *
 * st    - state; st->flags selects the high-bit handling and the multiplier
 *         used for this call
 * m     - message data, or NULL for the final call made by
 *         poly1305_finish_ext(): in that case no message is added, the two
 *         lanes are summed, fully reduced and written to st->H.h[]
 * bytes - number of bytes to process; the callers in this file always pass
 *         a non-zero multiple of 32
 *
 * When m is non-NULL the two-lane accumulator is written back to
 * st->H.hh[] on exit.
 */
static POLY1305_NOINLINE void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    /* 1 << 24 in the low 32 bits of both 64-bit lanes: bit 24 of the fifth
     * 26-bit limb, i.e. the 2^128 bit appended to every full block */
    CRYPTO_ALIGN(64)
    xmmi HIBIT =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
    /* 26-bit limb mask in both lanes */
    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
                                         _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi FIVE =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
    xmmi H0, H1, H2, H3, H4;
    xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
    xmmi M0, M1, M2, M3, M4;
    xmmi M5, M6, M7, M8;
    xmmi C1, C2;
    xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
    xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;

    /* final padded block: keep the high bit in the low lane only ... */
    if (st->flags & poly1305_final_shift8) {
        HIBIT = _mm_srli_si128(HIBIT, 8);
    }
    /* ... or in neither lane */
    if (st->flags & poly1305_final_shift16) {
        HIBIT = _mm_setzero_si128();
    }
    if (!(st->flags & poly1305_started)) {
        /* H = [Mx,My] */
        /* first call: the accumulator is simply the first 32 bytes, split
         * into 26-bit limbs, with one 16-byte block per lane */
        T5 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
        T6 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
        H0 = _mm_and_si128(MMASK, T5);
        H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
        H2 = _mm_and_si128(MMASK, T5);
        H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        H4 = _mm_srli_epi64(T6, 40);
        H4 = _mm_or_si128(H4, HIBIT);
        m += 32;
        bytes -= 32;
        st->flags |= poly1305_started;
    } else {
        /* reload the two-lane accumulator saved by a previous call and
         * spread each 32-bit limb into its own 64-bit lane */
        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
        T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
        T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
    }
    /* select the per-lane multiplier used by the 32-byte step below */
    if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
        if (st->flags & poly1305_final_r2_r) {
            /* use [r^2, r] */
            T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T3 = _mm_cvtsi32_si128(st->R[4]);
            T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
            T1 = _mm_cvtsi32_si128(st->R2[4]);
            T4 = _mm_unpacklo_epi32(T0, T2);
            T5 = _mm_unpackhi_epi32(T0, T2);
            R24 = _mm_unpacklo_epi64(T1, T3);
        } else {
            /* use [r^1, 1] */
            T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T1 = _mm_cvtsi32_si128(st->R[4]);
            T2 = _mm_cvtsi32_si128(1);
            T4 = _mm_unpacklo_epi32(T0, T2);
            T5 = _mm_unpackhi_epi32(T0, T2);
            R24 = T1;
        }
        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
    } else {
        /* use [r^2, r^2] */
        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
        T1 = _mm_cvtsi32_si128(st->R2[4]);
        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
    }
    /* S2x = 5 * R2x, the multiplier limbs pre-scaled by 5 for the terms
     * that wrap around during the multiplication */
    S21 = _mm_mul_epu32(R21, FIVE);
    S22 = _mm_mul_epu32(R22, FIVE);
    S23 = _mm_mul_epu32(R23, FIVE);
    S24 = _mm_mul_epu32(R24, FIVE);
    if (bytes >= 64) {
        /* 64 bytes per iteration: needs r^4 in both lanes */
        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
        T1 = _mm_cvtsi32_si128(st->R4[4]);
        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
        S41 = _mm_mul_epu32(R41, FIVE);
        S42 = _mm_mul_epu32(R42, FIVE);
        S43 = _mm_mul_epu32(R43, FIVE);
        S44 = _mm_mul_epu32(R44, FIVE);
        while (bytes >= 64) {
            xmmi v00, v01, v02, v03, v04;
            xmmi v10, v11, v12, v13, v14;
            xmmi v20, v21, v22, v23, v24;
            xmmi v30, v31, v32, v33, v34;
            xmmi v40, v41, v42, v43, v44;
            xmmi T14, T15;
            /* H *= [r^4,r^4], preload [Mx,My] */
            /* the loads and limb splitting of the first 32 message bytes
             * are interleaved with the multiplications */
            T15 = S42;
            T0 = H4;
            T0 = _mm_mul_epu32(T0, S41);
            v01 = H3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S43;
            T1 = H4;
            T1 = _mm_mul_epu32(T1, T15);
            v11 = H3;
            v11 = _mm_mul_epu32(v11, T14);
            T2 = H4;
            T2 = _mm_mul_epu32(T2, T14);
            T0 = _mm_add_epi64(T0, v01);
            T15 = S44;
            v02 = H2;
            v02 = _mm_mul_epu32(v02, T14);
            T3 = H4;
            T3 = _mm_mul_epu32(T3, T15);
            T1 = _mm_add_epi64(T1, v11);
            v03 = H1;
            v03 = _mm_mul_epu32(v03, T15);
            v12 = H2;
            v12 = _mm_mul_epu32(v12, T15);
            T0 = _mm_add_epi64(T0, v02);
            T14 = R40;
            v21 = H3;
            v21 = _mm_mul_epu32(v21, T15);
            v31 = H3;
            v31 = _mm_mul_epu32(v31, T14);
            T0 = _mm_add_epi64(T0, v03);
            T4 = H4;
            T4 = _mm_mul_epu32(T4, T14);
            T1 = _mm_add_epi64(T1, v12);
            v04 = H0;
            v04 = _mm_mul_epu32(v04, T14);
            T2 = _mm_add_epi64(T2, v21);
            v13 = H1;
            v13 = _mm_mul_epu32(v13, T14);
            T3 = _mm_add_epi64(T3, v31);
            T15 = R41;
            v22 = H2;
            v22 = _mm_mul_epu32(v22, T14);
            v32 = H2;
            v32 = _mm_mul_epu32(v32, T15);
            T0 = _mm_add_epi64(T0, v04);
            v41 = H3;
            v41 = _mm_mul_epu32(v41, T15);
            T1 = _mm_add_epi64(T1, v13);
            v14 = H0;
            v14 = _mm_mul_epu32(v14, T15);
            T2 = _mm_add_epi64(T2, v22);
            T14 = R42;
            T5 = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
            v23 = H1;
            v23 = _mm_mul_epu32(v23, T15);
            T3 = _mm_add_epi64(T3, v32);
            v33 = H1;
            v33 = _mm_mul_epu32(v33, T14);
            T4 = _mm_add_epi64(T4, v41);
            v42 = H2;
            v42 = _mm_mul_epu32(v42, T14);
            T1 = _mm_add_epi64(T1, v14);
            T15 = R43;
            T6 = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
            v24 = H0;
            v24 = _mm_mul_epu32(v24, T14);
            T2 = _mm_add_epi64(T2, v23);
            v34 = H0;
            v34 = _mm_mul_epu32(v34, T15);
            T3 = _mm_add_epi64(T3, v33);
            M0 = _mm_and_si128(MMASK, T5);
            v43 = H1;
            v43 = _mm_mul_epu32(v43, T15);
            T4 = _mm_add_epi64(T4, v42);
            M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
            v44 = H0;
            v44 = _mm_mul_epu32(v44, R44);
            T2 = _mm_add_epi64(T2, v24);
            T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
            T3 = _mm_add_epi64(T3, v34);
            M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
            T4 = _mm_add_epi64(T4, v43);
            M2 = _mm_and_si128(MMASK, T5);
            T4 = _mm_add_epi64(T4, v44);
            M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);
            /* H += [Mx',My'] */
            /* the second 32 bytes are added as raw 32-bit words shifted to
             * their bit position within the limbs; the reduce step below
             * resolves the resulting carries */
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M6 = _mm_slli_epi64(M6, 6);
            M7 = _mm_slli_epi64(M7, 12);
            M8 = _mm_slli_epi64(M8, 18);
            T0 = _mm_add_epi64(T0, M5);
            T1 = _mm_add_epi64(T1, M6);
            T2 = _mm_add_epi64(T2, M7);
            T3 = _mm_add_epi64(T3, M8);
            T4 = _mm_add_epi64(T4, HIBIT);
            /* H += [Mx,My]*[r^2,r^2] */
            T15 = S22;
            v00 = M4;
            v00 = _mm_mul_epu32(v00, S21);
            v01 = M3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S23;
            v10 = M4;
            v10 = _mm_mul_epu32(v10, T15);
            v11 = M3;
            v11 = _mm_mul_epu32(v11, T14);
            T0 = _mm_add_epi64(T0, v00);
            v20 = M4;
            v20 = _mm_mul_epu32(v20, T14);
            T0 = _mm_add_epi64(T0, v01);
            T15 = S24;
            v02 = M2;
            v02 = _mm_mul_epu32(v02, T14);
            T1 = _mm_add_epi64(T1, v10);
            v30 = M4;
            v30 = _mm_mul_epu32(v30, T15);
            T1 = _mm_add_epi64(T1, v11);
            v03 = M1;
            v03 = _mm_mul_epu32(v03, T15);
            T2 = _mm_add_epi64(T2, v20);
            v12 = M2;
            v12 = _mm_mul_epu32(v12, T15);
            T0 = _mm_add_epi64(T0, v02);
            T14 = R20;
            v21 = M3;
            v21 = _mm_mul_epu32(v21, T15);
            T3 = _mm_add_epi64(T3, v30);
            v31 = M3;
            v31 = _mm_mul_epu32(v31, T14);
            T0 = _mm_add_epi64(T0, v03);
            v40 = M4;
            v40 = _mm_mul_epu32(v40, T14);
            T1 = _mm_add_epi64(T1, v12);
            v04 = M0;
            v04 = _mm_mul_epu32(v04, T14);
            T2 = _mm_add_epi64(T2, v21);
            v13 = M1;
            v13 = _mm_mul_epu32(v13, T14);
            T3 = _mm_add_epi64(T3, v31);
            T15 = R21;
            v22 = M2;
            v22 = _mm_mul_epu32(v22, T14);
            T4 = _mm_add_epi64(T4, v40);
            v32 = M2;
            v32 = _mm_mul_epu32(v32, T15);
            T0 = _mm_add_epi64(T0, v04);
            v41 = M3;
            v41 = _mm_mul_epu32(v41, T15);
            T1 = _mm_add_epi64(T1, v13);
            v14 = M0;
            v14 = _mm_mul_epu32(v14, T15);
            T2 = _mm_add_epi64(T2, v22);
            T14 = R22;
            v23 = M1;
            v23 = _mm_mul_epu32(v23, T15);
            T3 = _mm_add_epi64(T3, v32);
            v33 = M1;
            v33 = _mm_mul_epu32(v33, T14);
            T4 = _mm_add_epi64(T4, v41);
            v42 = M2;
            v42 = _mm_mul_epu32(v42, T14);
            T1 = _mm_add_epi64(T1, v14);
            T15 = R23;
            v24 = M0;
            v24 = _mm_mul_epu32(v24, T14);
            T2 = _mm_add_epi64(T2, v23);
            v34 = M0;
            v34 = _mm_mul_epu32(v34, T15);
            T3 = _mm_add_epi64(T3, v33);
            v43 = M1;
            v43 = _mm_mul_epu32(v43, T15);
            T4 = _mm_add_epi64(T4, v42);
            v44 = M0;
            v44 = _mm_mul_epu32(v44, R24);
            T2 = _mm_add_epi64(T2, v24);
            T3 = _mm_add_epi64(T3, v34);
            T4 = _mm_add_epi64(T4, v43);
            T4 = _mm_add_epi64(T4, v44);
            /* reduce */
            /* partial carry propagation over the 26-bit limbs; the carry
             * out of the top limb is multiplied by 5 and added to limb 0 */
            C1 = _mm_srli_epi64(T0, 26);
            C2 = _mm_srli_epi64(T3, 26);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_and_si128(T3, MMASK);
            T1 = _mm_add_epi64(T1, C1);
            T4 = _mm_add_epi64(T4, C2);
            C1 = _mm_srli_epi64(T1, 26);
            C2 = _mm_srli_epi64(T4, 26);
            T1 = _mm_and_si128(T1, MMASK);
            T4 = _mm_and_si128(T4, MMASK);
            T2 = _mm_add_epi64(T2, C1);
            T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
            C1 = _mm_srli_epi64(T2, 26);
            C2 = _mm_srli_epi64(T0, 26);
            T2 = _mm_and_si128(T2, MMASK);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_add_epi64(T3, C1);
            T1 = _mm_add_epi64(T1, C2);
            C1 = _mm_srli_epi64(T3, 26);
            T3 = _mm_and_si128(T3, MMASK);
            T4 = _mm_add_epi64(T4, C1);
            /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
            H0 = T0;
            H1 = T1;
            H2 = T2;
            H3 = T3;
            H4 = T4;
            m += 64;
            bytes -= 64;
        }
    }
    if (bytes >= 32) {
        xmmi v01, v02, v03, v04;
        xmmi v11, v12, v13, v14;
        xmmi v21, v22, v23, v24;
        xmmi v31, v32, v33, v34;
        xmmi v41, v42, v43, v44;
        xmmi T14, T15;
        /* H *= [r^2,r^2] */
        /* (or by [r^2,r] / [r,1] when one of the final flags is set) */
        T15 = S22;
        T0 = H4;
        T0 = _mm_mul_epu32(T0, S21);
        v01 = H3;
        v01 = _mm_mul_epu32(v01, T15);
        T14 = S23;
        T1 = H4;
        T1 = _mm_mul_epu32(T1, T15);
        v11 = H3;
        v11 = _mm_mul_epu32(v11, T14);
        T2 = H4;
        T2 = _mm_mul_epu32(T2, T14);
        T0 = _mm_add_epi64(T0, v01);
        T15 = S24;
        v02 = H2;
        v02 = _mm_mul_epu32(v02, T14);
        T3 = H4;
        T3 = _mm_mul_epu32(T3, T15);
        T1 = _mm_add_epi64(T1, v11);
        v03 = H1;
        v03 = _mm_mul_epu32(v03, T15);
        v12 = H2;
        v12 = _mm_mul_epu32(v12, T15);
        T0 = _mm_add_epi64(T0, v02);
        T14 = R20;
        v21 = H3;
        v21 = _mm_mul_epu32(v21, T15);
        v31 = H3;
        v31 = _mm_mul_epu32(v31, T14);
        T0 = _mm_add_epi64(T0, v03);
        T4 = H4;
        T4 = _mm_mul_epu32(T4, T14);
        T1 = _mm_add_epi64(T1, v12);
        v04 = H0;
        v04 = _mm_mul_epu32(v04, T14);
        T2 = _mm_add_epi64(T2, v21);
        v13 = H1;
        v13 = _mm_mul_epu32(v13, T14);
        T3 = _mm_add_epi64(T3, v31);
        T15 = R21;
        v22 = H2;
        v22 = _mm_mul_epu32(v22, T14);
        v32 = H2;
        v32 = _mm_mul_epu32(v32, T15);
        T0 = _mm_add_epi64(T0, v04);
        v41 = H3;
        v41 = _mm_mul_epu32(v41, T15);
        T1 = _mm_add_epi64(T1, v13);
        v14 = H0;
        v14 = _mm_mul_epu32(v14, T15);
        T2 = _mm_add_epi64(T2, v22);
        T14 = R22;
        v23 = H1;
        v23 = _mm_mul_epu32(v23, T15);
        T3 = _mm_add_epi64(T3, v32);
        v33 = H1;
        v33 = _mm_mul_epu32(v33, T14);
        T4 = _mm_add_epi64(T4, v41);
        v42 = H2;
        v42 = _mm_mul_epu32(v42, T14);
        T1 = _mm_add_epi64(T1, v14);
        T15 = R23;
        v24 = H0;
        v24 = _mm_mul_epu32(v24, T14);
        T2 = _mm_add_epi64(T2, v23);
        v34 = H0;
        v34 = _mm_mul_epu32(v34, T15);
        T3 = _mm_add_epi64(T3, v33);
        v43 = H1;
        v43 = _mm_mul_epu32(v43, T15);
        T4 = _mm_add_epi64(T4, v42);
        v44 = H0;
        v44 = _mm_mul_epu32(v44, R24);
        T2 = _mm_add_epi64(T2, v24);
        T3 = _mm_add_epi64(T3, v34);
        T4 = _mm_add_epi64(T4, v43);
        T4 = _mm_add_epi64(T4, v44);
        /* H += [Mx,My] */
        /* skipped on the final call, where m is NULL and there is no
         * message left to add */
        if (m) {
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M1 = _mm_slli_epi64(M1, 6);
            M2 = _mm_slli_epi64(M2, 12);
            M3 = _mm_slli_epi64(M3, 18);
            T0 = _mm_add_epi64(T0, M0);
            T1 = _mm_add_epi64(T1, M1);
            T2 = _mm_add_epi64(T2, M2);
            T3 = _mm_add_epi64(T3, M3);
            T4 = _mm_add_epi64(T4, HIBIT);
        }
        /* reduce */
        C1 = _mm_srli_epi64(T0, 26);
        C2 = _mm_srli_epi64(T3, 26);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_and_si128(T3, MMASK);
        T1 = _mm_add_epi64(T1, C1);
        T4 = _mm_add_epi64(T4, C2);
        C1 = _mm_srli_epi64(T1, 26);
        C2 = _mm_srli_epi64(T4, 26);
        T1 = _mm_and_si128(T1, MMASK);
        T4 = _mm_and_si128(T4, MMASK);
        T2 = _mm_add_epi64(T2, C1);
        T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
        C1 = _mm_srli_epi64(T2, 26);
        C2 = _mm_srli_epi64(T0, 26);
        T2 = _mm_and_si128(T2, MMASK);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_add_epi64(T3, C1);
        T1 = _mm_add_epi64(T1, C2);
        C1 = _mm_srli_epi64(T3, 26);
        T3 = _mm_and_si128(T3, MMASK);
        T4 = _mm_add_epi64(T4, C1);
        /* H = (H*[r^2,r^2] + [Mx,My]) */
        H0 = T0;
        H1 = T1;
        H2 = T2;
        H3 = T3;
        H4 = T4;
    }
    if (m) {
        /* more data may follow: pack the low 32 bits of both lanes of each
         * limb side by side and save them in st->H.hh[] */
        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
        T0 = _mm_unpacklo_epi64(T0, T1);
        T1 = _mm_unpacklo_epi64(T2, T3);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
        _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
    } else {
        uint32_t t0, t1, t2, t3, t4, b;
        uint64_t h0, h1, h2, g0, g1, g2, c, nc;
        /* H = H[0]+H[1] */
        T0 = H0;
        T1 = H1;
        T2 = H2;
        T3 = H3;
        T4 = H4;
        T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
        T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
        T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
        T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
        T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));
        /* carry through the 26-bit limbs in scalar code */
        t0 = _mm_cvtsi128_si32(T0);
        b = (t0 >> 26);
        t0 &= 0x3ffffff;
        t1 = _mm_cvtsi128_si32(T1) + b;
        b = (t1 >> 26);
        t1 &= 0x3ffffff;
        t2 = _mm_cvtsi128_si32(T2) + b;
        b = (t2 >> 26);
        t2 &= 0x3ffffff;
        t3 = _mm_cvtsi128_si32(T3) + b;
        b = (t3 >> 26);
        t3 &= 0x3ffffff;
        t4 = _mm_cvtsi128_si32(T4) + b;
        /* everything except t4 is in range, so this is all safe */
        /* repack the five 26-bit limbs as 44/44/42-bit limbs */
        h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
        h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
              ((uint64_t) t3 << 34)) &
             0xfffffffffffull;
        h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));
        /* two rounds of carry propagation; anything above bit 130 is
         * multiplied by 5 and folded back into h0 */
        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;
        c = (h1 >> 44);
        h1 &= 0xfffffffffff;
        h2 += c;
        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;
        /* g = h + 5 - 2^130 */
        g0 = h0 + 5;
        c = (g0 >> 44);
        g0 &= 0xfffffffffff;
        g1 = h1 + c;
        c = (g1 >> 44);
        g1 &= 0xfffffffffff;
        g2 = h2 + c - ((uint64_t) 1 << 42);
        /* branch-free select: c is all ones when g did not go negative
         * (keep g), all zeros otherwise (keep h) */
        c = (g2 >> 63) - 1;
        nc = ~c;
        h0 = (h0 & nc) | (g0 & c);
        h1 = (h1 & nc) | (g1 & c);
        h2 = (h2 & nc) | (g2 & c);
        st->H.h[0] = h0;
        st->H.h[1] = h1;
        st->H.h[2] = h2;
    }
}
  708. static void
  709. poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
  710. unsigned long long bytes)
  711. {
  712. unsigned long long i;
  713. /* handle leftover */
  714. if (st->leftover) {
  715. unsigned long long want = (poly1305_block_size - st->leftover);
  716. if (want > bytes) {
  717. want = bytes;
  718. }
  719. for (i = 0; i < want; i++) {
  720. st->buffer[st->leftover + i] = m[i];
  721. }
  722. bytes -= want;
  723. m += want;
  724. st->leftover += want;
  725. if (st->leftover < poly1305_block_size) {
  726. return;
  727. }
  728. poly1305_blocks(st, st->buffer, poly1305_block_size);
  729. st->leftover = 0;
  730. }
  731. /* process full blocks */
  732. if (bytes >= poly1305_block_size) {
  733. unsigned long long want = (bytes & ~(poly1305_block_size - 1));
  734. poly1305_blocks(st, m, want);
  735. m += want;
  736. bytes -= want;
  737. }
  738. /* store leftover */
  739. if (bytes) {
  740. for (i = 0; i < bytes; i++) {
  741. st->buffer[st->leftover + i] = m[i];
  742. }
  743. st->leftover += bytes;
  744. }
  745. }
  746. static POLY1305_NOINLINE void
  747. poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
  748. unsigned long long leftover, unsigned char mac[16])
  749. {
  750. uint64_t h0, h1, h2;
  751. if (leftover) {
  752. CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };
  753. poly1305_block_copy31(final, m, leftover);
  754. if (leftover != 16) {
  755. final[leftover] = 1;
  756. }
  757. st->flags |=
  758. (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
  759. poly1305_blocks(st, final, 32);
  760. }
  761. if (st->flags & poly1305_started) {
  762. /* finalize, H *= [r^2,r], or H *= [r,1] */
  763. if (!leftover || (leftover > 16)) {
  764. st->flags |= poly1305_final_r2_r;
  765. } else {
  766. st->flags |= poly1305_final_r_1;
  767. }
  768. poly1305_blocks(st, NULL, 32);
  769. }
  770. h0 = st->H.h[0];
  771. h1 = st->H.h[1];
  772. h2 = st->H.h[2];
  773. /* pad */
  774. h0 = ((h0) | (h1 << 44));
  775. h1 = ((h1 >> 20) | (h2 << 24));
  776. #ifdef HAVE_AMD64_ASM
  777. __asm__ __volatile__(
  778. "addq %2, %0 ;\n"
  779. "adcq %3, %1 ;\n"
  780. : "+r"(h0), "+r"(h1)
  781. : "r"(st->pad[0]), "r"(st->pad[1])
  782. : "flags", "cc");
  783. #else
  784. {
  785. uint128_t h;
  786. memcpy(&h, &st->pad[0], 16);
  787. h += ((uint128_t) h1 << 64) | h0;
  788. h0 = (uint64_t) h;
  789. h1 = (uint64_t)(h >> 64);
  790. }
  791. #endif
  792. _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
  793. _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
  794. _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
  795. _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
  796. _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
  797. _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
  798. _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
  799. _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());
  800. memcpy(&mac[0], &h0, 8);
  801. memcpy(&mac[8], &h1, 8);
  802. sodium_memzero((void *) st, sizeof *st);
  803. }
  804. static void
  805. poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
  806. {
  807. poly1305_finish_ext(st, st->buffer, st->leftover, mac);
  808. }
  809. static int
  810. crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
  811. const unsigned char *key)
  812. {
  813. COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
  814. sizeof(poly1305_state_internal_t));
  815. poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);
  816. return 0;
  817. }
  818. static int
  819. crypto_onetimeauth_poly1305_sse2_update(
  820. crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
  821. unsigned long long inlen)
  822. {
  823. poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);
  824. return 0;
  825. }
  826. static int
  827. crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
  828. unsigned char *out)
  829. {
  830. poly1305_finish((poly1305_state_internal_t *) (void *) state, out);
  831. return 0;
  832. }
  833. static int
  834. crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
  835. unsigned long long inlen,
  836. const unsigned char *key)
  837. {
  838. CRYPTO_ALIGN(64) poly1305_state_internal_t st;
  839. unsigned long long blocks;
  840. poly1305_init_ext(&st, key, inlen);
  841. blocks = inlen & ~31;
  842. if (blocks > 0) {
  843. poly1305_blocks(&st, m, blocks);
  844. m += blocks;
  845. inlen -= blocks;
  846. }
  847. poly1305_finish_ext(&st, m, inlen, out);
  848. return 0;
  849. }
  850. static int
  851. crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
  852. const unsigned char *in,
  853. unsigned long long inlen,
  854. const unsigned char *k)
  855. {
  856. unsigned char correct[16];
  857. crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);
  858. return crypto_verify_16(h, correct);
  859. }
/*
 * Function table exposing the SSE2 implementation. It is the only symbol
 * in this file with external linkage; every function it points to is
 * static. SODIUM_C99() wraps the designated-initializer field names.
 */
struct crypto_onetimeauth_poly1305_implementation
    crypto_onetimeauth_poly1305_sse2_implementation = {
        SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
        SODIUM_C99(.onetimeauth_verify =)
            crypto_onetimeauth_poly1305_sse2_verify,
        SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
        SODIUM_C99(.onetimeauth_update =)
            crypto_onetimeauth_poly1305_sse2_update,
        SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
    };
  869. };
  870. #endif