/* u4.h -- bulk path handling four 64-byte blocks (256 bytes) per iteration
 * with 128-bit SSE integer intrinsics. */

  1. if (bytes >= 256) {
  2. __m128i y0, y1, y2, y3, y4, y5, y6, y7, y8, y9, y10, y11, y12, y13, y14,
  3. y15;
  4. __m128i z0, z1, z2, z3, z4, z5, z6, z7, z8, z9, z10, z11, z12, z13, z14,
  5. z15;
  6. __m128i orig0, orig1, orig2, orig3, orig4, orig5, orig6, orig7, orig8,
  7. orig9, orig10, orig11, orig12, orig13, orig14, orig15;
  8. uint32_t in8;
  9. uint32_t in9;
  10. int i;
  11. /* element broadcast immediate for _mm_shuffle_epi32 are in order:
  12. 0x00, 0x55, 0xaa, 0xff */
  13. z0 = _mm_loadu_si128((const __m128i *) (x + 0));
  14. z5 = _mm_shuffle_epi32(z0, 0x55);
  15. z10 = _mm_shuffle_epi32(z0, 0xaa);
  16. z15 = _mm_shuffle_epi32(z0, 0xff);
  17. z0 = _mm_shuffle_epi32(z0, 0x00);
  18. z1 = _mm_loadu_si128((const __m128i *) (x + 4));
  19. z6 = _mm_shuffle_epi32(z1, 0xaa);
  20. z11 = _mm_shuffle_epi32(z1, 0xff);
  21. z12 = _mm_shuffle_epi32(z1, 0x00);
  22. z1 = _mm_shuffle_epi32(z1, 0x55);
  23. z2 = _mm_loadu_si128((const __m128i *) (x + 8));
  24. z7 = _mm_shuffle_epi32(z2, 0xff);
  25. z13 = _mm_shuffle_epi32(z2, 0x55);
  26. z2 = _mm_shuffle_epi32(z2, 0xaa);
  27. /* no z8 -> first half of the nonce, will fill later */
  28. z3 = _mm_loadu_si128((const __m128i *) (x + 12));
  29. z4 = _mm_shuffle_epi32(z3, 0x00);
  30. z14 = _mm_shuffle_epi32(z3, 0xaa);
  31. z3 = _mm_shuffle_epi32(z3, 0xff);
  32. /* no z9 -> second half of the nonce, will fill later */
  33. orig0 = z0;
  34. orig1 = z1;
  35. orig2 = z2;
  36. orig3 = z3;
  37. orig4 = z4;
  38. orig5 = z5;
  39. orig6 = z6;
  40. orig7 = z7;
  41. orig10 = z10;
  42. orig11 = z11;
  43. orig12 = z12;
  44. orig13 = z13;
  45. orig14 = z14;
  46. orig15 = z15;
  47. while (bytes >= 256) {
  48. /* vector implementation for z8 and z9 */
  49. /* not sure if it helps for only 4 blocks */
  50. const __m128i addv8 = _mm_set_epi64x(1, 0);
  51. const __m128i addv9 = _mm_set_epi64x(3, 2);
  52. __m128i t8, t9;
  53. uint64_t in89;
  54. in8 = x[8];
  55. in9 = x[13];
  56. in89 = ((uint64_t) in8) | (((uint64_t) in9) << 32);
  57. t8 = _mm_set1_epi64x(in89);
  58. t9 = _mm_set1_epi64x(in89);
  59. z8 = _mm_add_epi64(addv8, t8);
  60. z9 = _mm_add_epi64(addv9, t9);
  61. t8 = _mm_unpacklo_epi32(z8, z9);
  62. t9 = _mm_unpackhi_epi32(z8, z9);
  63. z8 = _mm_unpacklo_epi32(t8, t9);
  64. z9 = _mm_unpackhi_epi32(t8, t9);
  65. orig8 = z8;
  66. orig9 = z9;
  67. in89 += 4;
  68. x[8] = in89 & 0xFFFFFFFF;
  69. x[13] = (in89 >> 32) & 0xFFFFFFFF;
  70. z5 = orig5;
  71. z10 = orig10;
  72. z15 = orig15;
  73. z14 = orig14;
  74. z3 = orig3;
  75. z6 = orig6;
  76. z11 = orig11;
  77. z1 = orig1;
  78. z7 = orig7;
  79. z13 = orig13;
  80. z2 = orig2;
  81. z9 = orig9;
  82. z0 = orig0;
  83. z12 = orig12;
  84. z4 = orig4;
  85. z8 = orig8;
  86. for (i = 0; i < ROUNDS; i += 2) {
  87. /* the inner loop is a direct translation (regexp search/replace)
  88. * from the amd64-xmm6 ASM */
  89. __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13,
  90. r14, r15;
  91. y4 = z12;
  92. y4 = _mm_add_epi32(y4, z0);
  93. r4 = y4;
  94. y4 = _mm_slli_epi32(y4, 7);
  95. z4 = _mm_xor_si128(z4, y4);
  96. r4 = _mm_srli_epi32(r4, 25);
  97. z4 = _mm_xor_si128(z4, r4);
  98. y9 = z1;
  99. y9 = _mm_add_epi32(y9, z5);
  100. r9 = y9;
  101. y9 = _mm_slli_epi32(y9, 7);
  102. z9 = _mm_xor_si128(z9, y9);
  103. r9 = _mm_srli_epi32(r9, 25);
  104. z9 = _mm_xor_si128(z9, r9);
  105. y8 = z0;
  106. y8 = _mm_add_epi32(y8, z4);
  107. r8 = y8;
  108. y8 = _mm_slli_epi32(y8, 9);
  109. z8 = _mm_xor_si128(z8, y8);
  110. r8 = _mm_srli_epi32(r8, 23);
  111. z8 = _mm_xor_si128(z8, r8);
  112. y13 = z5;
  113. y13 = _mm_add_epi32(y13, z9);
  114. r13 = y13;
  115. y13 = _mm_slli_epi32(y13, 9);
  116. z13 = _mm_xor_si128(z13, y13);
  117. r13 = _mm_srli_epi32(r13, 23);
  118. z13 = _mm_xor_si128(z13, r13);
  119. y12 = z4;
  120. y12 = _mm_add_epi32(y12, z8);
  121. r12 = y12;
  122. y12 = _mm_slli_epi32(y12, 13);
  123. z12 = _mm_xor_si128(z12, y12);
  124. r12 = _mm_srli_epi32(r12, 19);
  125. z12 = _mm_xor_si128(z12, r12);
  126. y1 = z9;
  127. y1 = _mm_add_epi32(y1, z13);
  128. r1 = y1;
  129. y1 = _mm_slli_epi32(y1, 13);
  130. z1 = _mm_xor_si128(z1, y1);
  131. r1 = _mm_srli_epi32(r1, 19);
  132. z1 = _mm_xor_si128(z1, r1);
  133. y0 = z8;
  134. y0 = _mm_add_epi32(y0, z12);
  135. r0 = y0;
  136. y0 = _mm_slli_epi32(y0, 18);
  137. z0 = _mm_xor_si128(z0, y0);
  138. r0 = _mm_srli_epi32(r0, 14);
  139. z0 = _mm_xor_si128(z0, r0);
  140. y5 = z13;
  141. y5 = _mm_add_epi32(y5, z1);
  142. r5 = y5;
  143. y5 = _mm_slli_epi32(y5, 18);
  144. z5 = _mm_xor_si128(z5, y5);
  145. r5 = _mm_srli_epi32(r5, 14);
  146. z5 = _mm_xor_si128(z5, r5);
  147. y14 = z6;
  148. y14 = _mm_add_epi32(y14, z10);
  149. r14 = y14;
  150. y14 = _mm_slli_epi32(y14, 7);
  151. z14 = _mm_xor_si128(z14, y14);
  152. r14 = _mm_srli_epi32(r14, 25);
  153. z14 = _mm_xor_si128(z14, r14);
  154. y3 = z11;
  155. y3 = _mm_add_epi32(y3, z15);
  156. r3 = y3;
  157. y3 = _mm_slli_epi32(y3, 7);
  158. z3 = _mm_xor_si128(z3, y3);
  159. r3 = _mm_srli_epi32(r3, 25);
  160. z3 = _mm_xor_si128(z3, r3);
  161. y2 = z10;
  162. y2 = _mm_add_epi32(y2, z14);
  163. r2 = y2;
  164. y2 = _mm_slli_epi32(y2, 9);
  165. z2 = _mm_xor_si128(z2, y2);
  166. r2 = _mm_srli_epi32(r2, 23);
  167. z2 = _mm_xor_si128(z2, r2);
  168. y7 = z15;
  169. y7 = _mm_add_epi32(y7, z3);
  170. r7 = y7;
  171. y7 = _mm_slli_epi32(y7, 9);
  172. z7 = _mm_xor_si128(z7, y7);
  173. r7 = _mm_srli_epi32(r7, 23);
  174. z7 = _mm_xor_si128(z7, r7);
  175. y6 = z14;
  176. y6 = _mm_add_epi32(y6, z2);
  177. r6 = y6;
  178. y6 = _mm_slli_epi32(y6, 13);
  179. z6 = _mm_xor_si128(z6, y6);
  180. r6 = _mm_srli_epi32(r6, 19);
  181. z6 = _mm_xor_si128(z6, r6);
  182. y11 = z3;
  183. y11 = _mm_add_epi32(y11, z7);
  184. r11 = y11;
  185. y11 = _mm_slli_epi32(y11, 13);
  186. z11 = _mm_xor_si128(z11, y11);
  187. r11 = _mm_srli_epi32(r11, 19);
  188. z11 = _mm_xor_si128(z11, r11);
  189. y10 = z2;
  190. y10 = _mm_add_epi32(y10, z6);
  191. r10 = y10;
  192. y10 = _mm_slli_epi32(y10, 18);
  193. z10 = _mm_xor_si128(z10, y10);
  194. r10 = _mm_srli_epi32(r10, 14);
  195. z10 = _mm_xor_si128(z10, r10);
  196. y1 = z3;
  197. y1 = _mm_add_epi32(y1, z0);
  198. r1 = y1;
  199. y1 = _mm_slli_epi32(y1, 7);
  200. z1 = _mm_xor_si128(z1, y1);
  201. r1 = _mm_srli_epi32(r1, 25);
  202. z1 = _mm_xor_si128(z1, r1);
  203. y15 = z7;
  204. y15 = _mm_add_epi32(y15, z11);
  205. r15 = y15;
  206. y15 = _mm_slli_epi32(y15, 18);
  207. z15 = _mm_xor_si128(z15, y15);
  208. r15 = _mm_srli_epi32(r15, 14);
  209. z15 = _mm_xor_si128(z15, r15);
  210. y6 = z4;
  211. y6 = _mm_add_epi32(y6, z5);
  212. r6 = y6;
  213. y6 = _mm_slli_epi32(y6, 7);
  214. z6 = _mm_xor_si128(z6, y6);
  215. r6 = _mm_srli_epi32(r6, 25);
  216. z6 = _mm_xor_si128(z6, r6);
  217. y2 = z0;
  218. y2 = _mm_add_epi32(y2, z1);
  219. r2 = y2;
  220. y2 = _mm_slli_epi32(y2, 9);
  221. z2 = _mm_xor_si128(z2, y2);
  222. r2 = _mm_srli_epi32(r2, 23);
  223. z2 = _mm_xor_si128(z2, r2);
  224. y7 = z5;
  225. y7 = _mm_add_epi32(y7, z6);
  226. r7 = y7;
  227. y7 = _mm_slli_epi32(y7, 9);
  228. z7 = _mm_xor_si128(z7, y7);
  229. r7 = _mm_srli_epi32(r7, 23);
  230. z7 = _mm_xor_si128(z7, r7);
  231. y3 = z1;
  232. y3 = _mm_add_epi32(y3, z2);
  233. r3 = y3;
  234. y3 = _mm_slli_epi32(y3, 13);
  235. z3 = _mm_xor_si128(z3, y3);
  236. r3 = _mm_srli_epi32(r3, 19);
  237. z3 = _mm_xor_si128(z3, r3);
  238. y4 = z6;
  239. y4 = _mm_add_epi32(y4, z7);
  240. r4 = y4;
  241. y4 = _mm_slli_epi32(y4, 13);
  242. z4 = _mm_xor_si128(z4, y4);
  243. r4 = _mm_srli_epi32(r4, 19);
  244. z4 = _mm_xor_si128(z4, r4);
  245. y0 = z2;
  246. y0 = _mm_add_epi32(y0, z3);
  247. r0 = y0;
  248. y0 = _mm_slli_epi32(y0, 18);
  249. z0 = _mm_xor_si128(z0, y0);
  250. r0 = _mm_srli_epi32(r0, 14);
  251. z0 = _mm_xor_si128(z0, r0);
  252. y5 = z7;
  253. y5 = _mm_add_epi32(y5, z4);
  254. r5 = y5;
  255. y5 = _mm_slli_epi32(y5, 18);
  256. z5 = _mm_xor_si128(z5, y5);
  257. r5 = _mm_srli_epi32(r5, 14);
  258. z5 = _mm_xor_si128(z5, r5);
  259. y11 = z9;
  260. y11 = _mm_add_epi32(y11, z10);
  261. r11 = y11;
  262. y11 = _mm_slli_epi32(y11, 7);
  263. z11 = _mm_xor_si128(z11, y11);
  264. r11 = _mm_srli_epi32(r11, 25);
  265. z11 = _mm_xor_si128(z11, r11);
  266. y12 = z14;
  267. y12 = _mm_add_epi32(y12, z15);
  268. r12 = y12;
  269. y12 = _mm_slli_epi32(y12, 7);
  270. z12 = _mm_xor_si128(z12, y12);
  271. r12 = _mm_srli_epi32(r12, 25);
  272. z12 = _mm_xor_si128(z12, r12);
  273. y8 = z10;
  274. y8 = _mm_add_epi32(y8, z11);
  275. r8 = y8;
  276. y8 = _mm_slli_epi32(y8, 9);
  277. z8 = _mm_xor_si128(z8, y8);
  278. r8 = _mm_srli_epi32(r8, 23);
  279. z8 = _mm_xor_si128(z8, r8);
  280. y13 = z15;
  281. y13 = _mm_add_epi32(y13, z12);
  282. r13 = y13;
  283. y13 = _mm_slli_epi32(y13, 9);
  284. z13 = _mm_xor_si128(z13, y13);
  285. r13 = _mm_srli_epi32(r13, 23);
  286. z13 = _mm_xor_si128(z13, r13);
  287. y9 = z11;
  288. y9 = _mm_add_epi32(y9, z8);
  289. r9 = y9;
  290. y9 = _mm_slli_epi32(y9, 13);
  291. z9 = _mm_xor_si128(z9, y9);
  292. r9 = _mm_srli_epi32(r9, 19);
  293. z9 = _mm_xor_si128(z9, r9);
  294. y14 = z12;
  295. y14 = _mm_add_epi32(y14, z13);
  296. r14 = y14;
  297. y14 = _mm_slli_epi32(y14, 13);
  298. z14 = _mm_xor_si128(z14, y14);
  299. r14 = _mm_srli_epi32(r14, 19);
  300. z14 = _mm_xor_si128(z14, r14);
  301. y10 = z8;
  302. y10 = _mm_add_epi32(y10, z9);
  303. r10 = y10;
  304. y10 = _mm_slli_epi32(y10, 18);
  305. z10 = _mm_xor_si128(z10, y10);
  306. r10 = _mm_srli_epi32(r10, 14);
  307. z10 = _mm_xor_si128(z10, r10);
  308. y15 = z13;
  309. y15 = _mm_add_epi32(y15, z14);
  310. r15 = y15;
  311. y15 = _mm_slli_epi32(y15, 18);
  312. z15 = _mm_xor_si128(z15, y15);
  313. r15 = _mm_srli_epi32(r15, 14);
  314. z15 = _mm_xor_si128(z15, r15);
  315. }
  316. /* store data ; this macro replicates the original amd64-xmm6 code */
  317. #define ONEQUAD_SHUFFLE(A, B, C, D) \
  318. z##A = _mm_add_epi32(z##A, orig##A); \
  319. z##B = _mm_add_epi32(z##B, orig##B); \
  320. z##C = _mm_add_epi32(z##C, orig##C); \
  321. z##D = _mm_add_epi32(z##D, orig##D); \
  322. in##A = _mm_cvtsi128_si32(z##A); \
  323. in##B = _mm_cvtsi128_si32(z##B); \
  324. in##C = _mm_cvtsi128_si32(z##C); \
  325. in##D = _mm_cvtsi128_si32(z##D); \
  326. z##A = _mm_shuffle_epi32(z##A, 0x39); \
  327. z##B = _mm_shuffle_epi32(z##B, 0x39); \
  328. z##C = _mm_shuffle_epi32(z##C, 0x39); \
  329. z##D = _mm_shuffle_epi32(z##D, 0x39); \
  330. \
  331. in##A ^= *(uint32_t *) (m + 0); \
  332. in##B ^= *(uint32_t *) (m + 4); \
  333. in##C ^= *(uint32_t *) (m + 8); \
  334. in##D ^= *(uint32_t *) (m + 12); \
  335. \
  336. *(uint32_t *) (c + 0) = in##A; \
  337. *(uint32_t *) (c + 4) = in##B; \
  338. *(uint32_t *) (c + 8) = in##C; \
  339. *(uint32_t *) (c + 12) = in##D; \
  340. \
  341. in##A = _mm_cvtsi128_si32(z##A); \
  342. in##B = _mm_cvtsi128_si32(z##B); \
  343. in##C = _mm_cvtsi128_si32(z##C); \
  344. in##D = _mm_cvtsi128_si32(z##D); \
  345. z##A = _mm_shuffle_epi32(z##A, 0x39); \
  346. z##B = _mm_shuffle_epi32(z##B, 0x39); \
  347. z##C = _mm_shuffle_epi32(z##C, 0x39); \
  348. z##D = _mm_shuffle_epi32(z##D, 0x39); \
  349. \
  350. in##A ^= *(uint32_t *) (m + 64); \
  351. in##B ^= *(uint32_t *) (m + 68); \
  352. in##C ^= *(uint32_t *) (m + 72); \
  353. in##D ^= *(uint32_t *) (m + 76); \
  354. *(uint32_t *) (c + 64) = in##A; \
  355. *(uint32_t *) (c + 68) = in##B; \
  356. *(uint32_t *) (c + 72) = in##C; \
  357. *(uint32_t *) (c + 76) = in##D; \
  358. \
  359. in##A = _mm_cvtsi128_si32(z##A); \
  360. in##B = _mm_cvtsi128_si32(z##B); \
  361. in##C = _mm_cvtsi128_si32(z##C); \
  362. in##D = _mm_cvtsi128_si32(z##D); \
  363. z##A = _mm_shuffle_epi32(z##A, 0x39); \
  364. z##B = _mm_shuffle_epi32(z##B, 0x39); \
  365. z##C = _mm_shuffle_epi32(z##C, 0x39); \
  366. z##D = _mm_shuffle_epi32(z##D, 0x39); \
  367. \
  368. in##A ^= *(uint32_t *) (m + 128); \
  369. in##B ^= *(uint32_t *) (m + 132); \
  370. in##C ^= *(uint32_t *) (m + 136); \
  371. in##D ^= *(uint32_t *) (m + 140); \
  372. *(uint32_t *) (c + 128) = in##A; \
  373. *(uint32_t *) (c + 132) = in##B; \
  374. *(uint32_t *) (c + 136) = in##C; \
  375. *(uint32_t *) (c + 140) = in##D; \
  376. \
  377. in##A = _mm_cvtsi128_si32(z##A); \
  378. in##B = _mm_cvtsi128_si32(z##B); \
  379. in##C = _mm_cvtsi128_si32(z##C); \
  380. in##D = _mm_cvtsi128_si32(z##D); \
  381. \
  382. in##A ^= *(uint32_t *) (m + 192); \
  383. in##B ^= *(uint32_t *) (m + 196); \
  384. in##C ^= *(uint32_t *) (m + 200); \
  385. in##D ^= *(uint32_t *) (m + 204); \
  386. *(uint32_t *) (c + 192) = in##A; \
  387. *(uint32_t *) (c + 196) = in##B; \
  388. *(uint32_t *) (c + 200) = in##C; \
  389. *(uint32_t *) (c + 204) = in##D
  390. /* store data ; this macro replaces shuffle+mov by a direct extract; not much
  391. * difference */
  392. #define ONEQUAD_EXTRACT(A, B, C, D) \
  393. z##A = _mm_add_epi32(z##A, orig##A); \
  394. z##B = _mm_add_epi32(z##B, orig##B); \
  395. z##C = _mm_add_epi32(z##C, orig##C); \
  396. z##D = _mm_add_epi32(z##D, orig##D); \
  397. in##A = _mm_cvtsi128_si32(z##A); \
  398. in##B = _mm_cvtsi128_si32(z##B); \
  399. in##C = _mm_cvtsi128_si32(z##C); \
  400. in##D = _mm_cvtsi128_si32(z##D); \
  401. in##A ^= *(uint32_t *) (m + 0); \
  402. in##B ^= *(uint32_t *) (m + 4); \
  403. in##C ^= *(uint32_t *) (m + 8); \
  404. in##D ^= *(uint32_t *) (m + 12); \
  405. *(uint32_t *) (c + 0) = in##A; \
  406. *(uint32_t *) (c + 4) = in##B; \
  407. *(uint32_t *) (c + 8) = in##C; \
  408. *(uint32_t *) (c + 12) = in##D; \
  409. \
  410. in##A = _mm_extract_epi32(z##A, 1); \
  411. in##B = _mm_extract_epi32(z##B, 1); \
  412. in##C = _mm_extract_epi32(z##C, 1); \
  413. in##D = _mm_extract_epi32(z##D, 1); \
  414. \
  415. in##A ^= *(uint32_t *) (m + 64); \
  416. in##B ^= *(uint32_t *) (m + 68); \
  417. in##C ^= *(uint32_t *) (m + 72); \
  418. in##D ^= *(uint32_t *) (m + 76); \
  419. *(uint32_t *) (c + 64) = in##A; \
  420. *(uint32_t *) (c + 68) = in##B; \
  421. *(uint32_t *) (c + 72) = in##C; \
  422. *(uint32_t *) (c + 76) = in##D; \
  423. \
  424. in##A = _mm_extract_epi32(z##A, 2); \
  425. in##B = _mm_extract_epi32(z##B, 2); \
  426. in##C = _mm_extract_epi32(z##C, 2); \
  427. in##D = _mm_extract_epi32(z##D, 2); \
  428. \
  429. in##A ^= *(uint32_t *) (m + 128); \
  430. in##B ^= *(uint32_t *) (m + 132); \
  431. in##C ^= *(uint32_t *) (m + 136); \
  432. in##D ^= *(uint32_t *) (m + 140); \
  433. *(uint32_t *) (c + 128) = in##A; \
  434. *(uint32_t *) (c + 132) = in##B; \
  435. *(uint32_t *) (c + 136) = in##C; \
  436. *(uint32_t *) (c + 140) = in##D; \
  437. \
  438. in##A = _mm_extract_epi32(z##A, 3); \
  439. in##B = _mm_extract_epi32(z##B, 3); \
  440. in##C = _mm_extract_epi32(z##C, 3); \
  441. in##D = _mm_extract_epi32(z##D, 3); \
  442. \
  443. in##A ^= *(uint32_t *) (m + 192); \
  444. in##B ^= *(uint32_t *) (m + 196); \
  445. in##C ^= *(uint32_t *) (m + 200); \
  446. in##D ^= *(uint32_t *) (m + 204); \
  447. *(uint32_t *) (c + 192) = in##A; \
  448. *(uint32_t *) (c + 196) = in##B; \
  449. *(uint32_t *) (c + 200) = in##C; \
  450. *(uint32_t *) (c + 204) = in##D
  451. /* store data ; this macro first transpose data in-registers, and then store
  452. * them in memory. much faster with icc. */
  453. #define ONEQUAD_TRANSPOSE(A, B, C, D) \
  454. z##A = _mm_add_epi32(z##A, orig##A); \
  455. z##B = _mm_add_epi32(z##B, orig##B); \
  456. z##C = _mm_add_epi32(z##C, orig##C); \
  457. z##D = _mm_add_epi32(z##D, orig##D); \
  458. y##A = _mm_unpacklo_epi32(z##A, z##B); \
  459. y##B = _mm_unpacklo_epi32(z##C, z##D); \
  460. y##C = _mm_unpackhi_epi32(z##A, z##B); \
  461. y##D = _mm_unpackhi_epi32(z##C, z##D); \
  462. z##A = _mm_unpacklo_epi64(y##A, y##B); \
  463. z##B = _mm_unpackhi_epi64(y##A, y##B); \
  464. z##C = _mm_unpacklo_epi64(y##C, y##D); \
  465. z##D = _mm_unpackhi_epi64(y##C, y##D); \
  466. y##A = _mm_xor_si128(z##A, _mm_loadu_si128((const __m128i *) (m + 0))); \
  467. _mm_storeu_si128((__m128i *) (c + 0), y##A); \
  468. y##B = _mm_xor_si128(z##B, _mm_loadu_si128((const __m128i *) (m + 64))); \
  469. _mm_storeu_si128((__m128i *) (c + 64), y##B); \
  470. y##C = _mm_xor_si128(z##C, _mm_loadu_si128((const __m128i *) (m + 128))); \
  471. _mm_storeu_si128((__m128i *) (c + 128), y##C); \
  472. y##D = _mm_xor_si128(z##D, _mm_loadu_si128((const __m128i *) (m + 192))); \
  473. _mm_storeu_si128((__m128i *) (c + 192), y##D)
  474. #define ONEQUAD(A, B, C, D) ONEQUAD_TRANSPOSE(A, B, C, D)
  475. ONEQUAD(0, 1, 2, 3);
  476. m += 16;
  477. c += 16;
  478. ONEQUAD(4, 5, 6, 7);
  479. m += 16;
  480. c += 16;
  481. ONEQUAD(8, 9, 10, 11);
  482. m += 16;
  483. c += 16;
  484. ONEQUAD(12, 13, 14, 15);
  485. m -= 48;
  486. c -= 48;
  487. #undef ONEQUAD
  488. #undef ONEQUAD_TRANSPOSE
  489. #undef ONEQUAD_EXTRACT
  490. #undef ONEQUAD_SHUFFLE
  491. bytes -= 256;
  492. c += 256;
  493. m += 256;
  494. }
  495. }