/* blake2b-load-avx2.h: BLAKE2B_LOAD_MSG_* macros built on AVX2 (_mm256_*) intrinsics. */

#ifndef blake2b_load_avx2_H
#define blake2b_load_avx2_H
  3. #define BLAKE2B_LOAD_MSG_0_1(b0) \
  4. do { \
  5. t0 = _mm256_unpacklo_epi64(m0, m1); \
  6. t1 = _mm256_unpacklo_epi64(m2, m3); \
  7. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  8. } while (0)
  9. #define BLAKE2B_LOAD_MSG_0_2(b0) \
  10. do { \
  11. t0 = _mm256_unpackhi_epi64(m0, m1); \
  12. t1 = _mm256_unpackhi_epi64(m2, m3); \
  13. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  14. } while (0)
  15. #define BLAKE2B_LOAD_MSG_0_3(b0) \
  16. do { \
  17. t0 = _mm256_unpacklo_epi64(m7, m4); \
  18. t1 = _mm256_unpacklo_epi64(m5, m6); \
  19. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  20. } while (0)
  21. #define BLAKE2B_LOAD_MSG_0_4(b0) \
  22. do { \
  23. t0 = _mm256_unpackhi_epi64(m7, m4); \
  24. t1 = _mm256_unpackhi_epi64(m5, m6); \
  25. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  26. } while (0)
  27. #define BLAKE2B_LOAD_MSG_1_1(b0) \
  28. do { \
  29. t0 = _mm256_unpacklo_epi64(m7, m2); \
  30. t1 = _mm256_unpackhi_epi64(m4, m6); \
  31. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  32. } while (0)
  33. #define BLAKE2B_LOAD_MSG_1_2(b0) \
  34. do { \
  35. t0 = _mm256_unpacklo_epi64(m5, m4); \
  36. t1 = _mm256_alignr_epi8(m3, m7, 8); \
  37. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  38. } while (0)
  39. #define BLAKE2B_LOAD_MSG_1_3(b0) \
  40. do { \
  41. t0 = _mm256_unpackhi_epi64(m2, m0); \
  42. t1 = _mm256_blend_epi32(m5, m0, 0x33); \
  43. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  44. } while (0)
  45. #define BLAKE2B_LOAD_MSG_1_4(b0) \
  46. do { \
  47. t0 = _mm256_alignr_epi8(m6, m1, 8); \
  48. t1 = _mm256_blend_epi32(m3, m1, 0x33); \
  49. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  50. } while (0)
  51. #define BLAKE2B_LOAD_MSG_2_1(b0) \
  52. do { \
  53. t0 = _mm256_alignr_epi8(m6, m5, 8); \
  54. t1 = _mm256_unpackhi_epi64(m2, m7); \
  55. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  56. } while (0)
  57. #define BLAKE2B_LOAD_MSG_2_2(b0) \
  58. do { \
  59. t0 = _mm256_unpacklo_epi64(m4, m0); \
  60. t1 = _mm256_blend_epi32(m6, m1, 0x33); \
  61. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  62. } while (0)
  63. #define BLAKE2B_LOAD_MSG_2_3(b0) \
  64. do { \
  65. t0 = _mm256_alignr_epi8(m5, m4, 8); \
  66. t1 = _mm256_unpackhi_epi64(m1, m3); \
  67. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  68. } while (0)
  69. #define BLAKE2B_LOAD_MSG_2_4(b0) \
  70. do { \
  71. t0 = _mm256_unpacklo_epi64(m2, m7); \
  72. t1 = _mm256_blend_epi32(m0, m3, 0x33); \
  73. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  74. } while (0)
  75. #define BLAKE2B_LOAD_MSG_3_1(b0) \
  76. do { \
  77. t0 = _mm256_unpackhi_epi64(m3, m1); \
  78. t1 = _mm256_unpackhi_epi64(m6, m5); \
  79. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  80. } while (0)
  81. #define BLAKE2B_LOAD_MSG_3_2(b0) \
  82. do { \
  83. t0 = _mm256_unpackhi_epi64(m4, m0); \
  84. t1 = _mm256_unpacklo_epi64(m6, m7); \
  85. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  86. } while (0)
  87. #define BLAKE2B_LOAD_MSG_3_3(b0) \
  88. do { \
  89. t0 = _mm256_alignr_epi8(m1, m7, 8); \
  90. t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE(1, 0, 3, 2)); \
  91. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  92. } while (0)
  93. #define BLAKE2B_LOAD_MSG_3_4(b0) \
  94. do { \
  95. t0 = _mm256_unpacklo_epi64(m4, m3); \
  96. t1 = _mm256_unpacklo_epi64(m5, m0); \
  97. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  98. } while (0)
  99. #define BLAKE2B_LOAD_MSG_4_1(b0) \
  100. do { \
  101. t0 = _mm256_unpackhi_epi64(m4, m2); \
  102. t1 = _mm256_unpacklo_epi64(m1, m5); \
  103. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  104. } while (0)
  105. #define BLAKE2B_LOAD_MSG_4_2(b0) \
  106. do { \
  107. t0 = _mm256_blend_epi32(m3, m0, 0x33); \
  108. t1 = _mm256_blend_epi32(m7, m2, 0x33); \
  109. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  110. } while (0)
  111. #define BLAKE2B_LOAD_MSG_4_3(b0) \
  112. do { \
  113. t0 = _mm256_alignr_epi8(m7, m1, 8); \
  114. t1 = _mm256_alignr_epi8(m3, m5, 8); \
  115. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  116. } while (0)
  117. #define BLAKE2B_LOAD_MSG_4_4(b0) \
  118. do { \
  119. t0 = _mm256_unpackhi_epi64(m6, m0); \
  120. t1 = _mm256_unpacklo_epi64(m6, m4); \
  121. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  122. } while (0)
  123. #define BLAKE2B_LOAD_MSG_5_1(b0) \
  124. do { \
  125. t0 = _mm256_unpacklo_epi64(m1, m3); \
  126. t1 = _mm256_unpacklo_epi64(m0, m4); \
  127. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  128. } while (0)
  129. #define BLAKE2B_LOAD_MSG_5_2(b0) \
  130. do { \
  131. t0 = _mm256_unpacklo_epi64(m6, m5); \
  132. t1 = _mm256_unpackhi_epi64(m5, m1); \
  133. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  134. } while (0)
  135. #define BLAKE2B_LOAD_MSG_5_3(b0) \
  136. do { \
  137. t0 = _mm256_alignr_epi8(m2, m0, 8); \
  138. t1 = _mm256_unpackhi_epi64(m3, m7); \
  139. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  140. } while (0)
  141. #define BLAKE2B_LOAD_MSG_5_4(b0) \
  142. do { \
  143. t0 = _mm256_unpackhi_epi64(m4, m6); \
  144. t1 = _mm256_alignr_epi8(m7, m2, 8); \
  145. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  146. } while (0)
  147. #define BLAKE2B_LOAD_MSG_6_1(b0) \
  148. do { \
  149. t0 = _mm256_blend_epi32(m0, m6, 0x33); \
  150. t1 = _mm256_unpacklo_epi64(m7, m2); \
  151. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  152. } while (0)
  153. #define BLAKE2B_LOAD_MSG_6_2(b0) \
  154. do { \
  155. t0 = _mm256_unpackhi_epi64(m2, m7); \
  156. t1 = _mm256_alignr_epi8(m5, m6, 8); \
  157. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  158. } while (0)
  159. #define BLAKE2B_LOAD_MSG_6_3(b0) \
  160. do { \
  161. t0 = _mm256_unpacklo_epi64(m4, m0); \
  162. t1 = _mm256_blend_epi32(m4, m3, 0x33); \
  163. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  164. } while (0)
  165. #define BLAKE2B_LOAD_MSG_6_4(b0) \
  166. do { \
  167. t0 = _mm256_unpackhi_epi64(m5, m3); \
  168. t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE(1, 0, 3, 2)); \
  169. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  170. } while (0)
  171. #define BLAKE2B_LOAD_MSG_7_1(b0) \
  172. do { \
  173. t0 = _mm256_unpackhi_epi64(m6, m3); \
  174. t1 = _mm256_blend_epi32(m1, m6, 0x33); \
  175. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  176. } while (0)
  177. #define BLAKE2B_LOAD_MSG_7_2(b0) \
  178. do { \
  179. t0 = _mm256_alignr_epi8(m7, m5, 8); \
  180. t1 = _mm256_unpackhi_epi64(m0, m4); \
  181. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  182. } while (0)
  183. #define BLAKE2B_LOAD_MSG_7_3(b0) \
  184. do { \
  185. t0 = _mm256_blend_epi32(m2, m1, 0x33); \
  186. t1 = _mm256_alignr_epi8(m4, m7, 8); \
  187. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  188. } while (0)
  189. #define BLAKE2B_LOAD_MSG_7_4(b0) \
  190. do { \
  191. t0 = _mm256_unpacklo_epi64(m5, m0); \
  192. t1 = _mm256_unpacklo_epi64(m2, m3); \
  193. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  194. } while (0)
  195. #define BLAKE2B_LOAD_MSG_8_1(b0) \
  196. do { \
  197. t0 = _mm256_unpacklo_epi64(m3, m7); \
  198. t1 = _mm256_alignr_epi8(m0, m5, 8); \
  199. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  200. } while (0)
  201. #define BLAKE2B_LOAD_MSG_8_2(b0) \
  202. do { \
  203. t0 = _mm256_unpackhi_epi64(m7, m4); \
  204. t1 = _mm256_alignr_epi8(m4, m1, 8); \
  205. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  206. } while (0)
  207. #define BLAKE2B_LOAD_MSG_8_3(b0) \
  208. do { \
  209. t0 = _mm256_unpacklo_epi64(m5, m6); \
  210. t1 = _mm256_unpackhi_epi64(m6, m0); \
  211. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  212. } while (0)
  213. #define BLAKE2B_LOAD_MSG_8_4(b0) \
  214. do { \
  215. t0 = _mm256_alignr_epi8(m1, m2, 8); \
  216. t1 = _mm256_alignr_epi8(m2, m3, 8); \
  217. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  218. } while (0)
  219. #define BLAKE2B_LOAD_MSG_9_1(b0) \
  220. do { \
  221. t0 = _mm256_unpacklo_epi64(m5, m4); \
  222. t1 = _mm256_unpackhi_epi64(m3, m0); \
  223. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  224. } while (0)
  225. #define BLAKE2B_LOAD_MSG_9_2(b0) \
  226. do { \
  227. t0 = _mm256_unpacklo_epi64(m1, m2); \
  228. t1 = _mm256_blend_epi32(m2, m3, 0x33); \
  229. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  230. } while (0)
  231. #define BLAKE2B_LOAD_MSG_9_3(b0) \
  232. do { \
  233. t0 = _mm256_unpackhi_epi64(m6, m7); \
  234. t1 = _mm256_unpackhi_epi64(m4, m1); \
  235. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  236. } while (0)
  237. #define BLAKE2B_LOAD_MSG_9_4(b0) \
  238. do { \
  239. t0 = _mm256_blend_epi32(m5, m0, 0x33); \
  240. t1 = _mm256_unpacklo_epi64(m7, m6); \
  241. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  242. } while (0)
  243. #define BLAKE2B_LOAD_MSG_10_1(b0) \
  244. do { \
  245. t0 = _mm256_unpacklo_epi64(m0, m1); \
  246. t1 = _mm256_unpacklo_epi64(m2, m3); \
  247. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  248. } while (0)
  249. #define BLAKE2B_LOAD_MSG_10_2(b0) \
  250. do { \
  251. t0 = _mm256_unpackhi_epi64(m0, m1); \
  252. t1 = _mm256_unpackhi_epi64(m2, m3); \
  253. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  254. } while (0)
  255. #define BLAKE2B_LOAD_MSG_10_3(b0) \
  256. do { \
  257. t0 = _mm256_unpacklo_epi64(m7, m4); \
  258. t1 = _mm256_unpacklo_epi64(m5, m6); \
  259. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  260. } while (0)
  261. #define BLAKE2B_LOAD_MSG_10_4(b0) \
  262. do { \
  263. t0 = _mm256_unpackhi_epi64(m7, m4); \
  264. t1 = _mm256_unpackhi_epi64(m5, m6); \
  265. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  266. } while (0)
  267. #define BLAKE2B_LOAD_MSG_11_1(b0) \
  268. do { \
  269. t0 = _mm256_unpacklo_epi64(m7, m2); \
  270. t1 = _mm256_unpackhi_epi64(m4, m6); \
  271. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  272. } while (0)
  273. #define BLAKE2B_LOAD_MSG_11_2(b0) \
  274. do { \
  275. t0 = _mm256_unpacklo_epi64(m5, m4); \
  276. t1 = _mm256_alignr_epi8(m3, m7, 8); \
  277. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  278. } while (0)
  279. #define BLAKE2B_LOAD_MSG_11_3(b0) \
  280. do { \
  281. t0 = _mm256_unpackhi_epi64(m2, m0); \
  282. t1 = _mm256_blend_epi32(m5, m0, 0x33); \
  283. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  284. } while (0)
  285. #define BLAKE2B_LOAD_MSG_11_4(b0) \
  286. do { \
  287. t0 = _mm256_alignr_epi8(m6, m1, 8); \
  288. t1 = _mm256_blend_epi32(m3, m1, 0x33); \
  289. b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
  290. } while (0)
#endif /* blake2b_load_avx2_H */