/* blake2b-compress-avx2.h -- AVX2 macros for the BLAKE2b compression function. */

  1. #ifndef blake2b_compress_avx2_H
  2. #define blake2b_compress_avx2_H
  3. #define LOADU128(p) _mm_loadu_si128((const __m128i *) (p))
  4. #define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), r)
  5. #define LOADU(p) _mm256_loadu_si256((const __m256i *) (p))
  6. #define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), r)
  7. #if defined(__INTEL_COMPILER) || defined(_MSC_VER) || defined(__GNUC__)
  8. # define LOAD(p) _mm256_load_si256((const __m256i *) (p))
  9. # define STORE(p, r) _mm256_store_si256((__m256i *) (p), r)
  10. #else
  11. # define LOAD(p) LOADU(p)
  12. # define STORE(p, r) STOREU(p, r)
  13. #endif
  14. static inline uint64_t
  15. LOADU64(const void *p)
  16. {
  17. uint64_t v;
  18. memcpy(&v, p, sizeof v);
  19. return v;
  20. }
  21. #define ROTATE16 \
  22. _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \
  23. 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
  24. #define ROTATE24 \
  25. _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \
  26. 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
  27. #define ADD(a, b) _mm256_add_epi64(a, b)
  28. #define SUB(a, b) _mm256_sub_epi64(a, b)
  29. #define XOR(a, b) _mm256_xor_si256(a, b)
  30. #define AND(a, b) _mm256_and_si256(a, b)
  31. #define OR(a, b) _mm256_or_si256(a, b)
  32. #define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
  33. #define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
  34. #define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
  35. #define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
  36. #define BLAKE2B_G1_V1(a, b, c, d, m) \
  37. do { \
  38. a = ADD(a, m); \
  39. a = ADD(a, b); \
  40. d = XOR(d, a); \
  41. d = ROT32(d); \
  42. c = ADD(c, d); \
  43. b = XOR(b, c); \
  44. b = ROT24(b); \
  45. } while (0)
  46. #define BLAKE2B_G2_V1(a, b, c, d, m) \
  47. do { \
  48. a = ADD(a, m); \
  49. a = ADD(a, b); \
  50. d = XOR(d, a); \
  51. d = ROT16(d); \
  52. c = ADD(c, d); \
  53. b = XOR(b, c); \
  54. b = ROT63(b); \
  55. } while (0)
  56. #define BLAKE2B_DIAG_V1(a, b, c, d) \
  57. do { \
  58. a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(2, 1, 0, 3)); \
  59. d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(1, 0, 3, 2)); \
  60. c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(0, 3, 2, 1)); \
  61. } while(0)
  62. #define BLAKE2B_UNDIAG_V1(a, b, c, d) \
  63. do { \
  64. a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(0, 3, 2, 1)); \
  65. d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(1, 0, 3, 2)); \
  66. c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(2, 1, 0, 3)); \
  67. } while(0)
  68. #include "blake2b-load-avx2.h"
  69. #define BLAKE2B_ROUND_V1(a, b, c, d, r, m) \
  70. do { \
  71. __m256i b0; \
  72. BLAKE2B_LOAD_MSG_##r##_1(b0); \
  73. BLAKE2B_G1_V1(a, b, c, d, b0); \
  74. BLAKE2B_LOAD_MSG_##r##_2(b0); \
  75. BLAKE2B_G2_V1(a, b, c, d, b0); \
  76. BLAKE2B_DIAG_V1(a, b, c, d); \
  77. BLAKE2B_LOAD_MSG_##r##_3(b0); \
  78. BLAKE2B_G1_V1(a, b, c, d, b0); \
  79. BLAKE2B_LOAD_MSG_##r##_4(b0); \
  80. BLAKE2B_G2_V1(a, b, c, d, b0); \
  81. BLAKE2B_UNDIAG_V1(a, b, c, d); \
  82. } while (0)
  83. #define BLAKE2B_ROUNDS_V1(a, b, c, d, m) \
  84. do { \
  85. BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
  86. BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
  87. BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
  88. BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
  89. BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
  90. BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
  91. BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
  92. BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
  93. BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
  94. BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
  95. BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
  96. BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
  97. } while (0)
  98. #define DECLARE_MESSAGE_WORDS(m) \
  99. const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \
  100. const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
  101. const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
  102. const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
  103. const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
  104. const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
  105. const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
  106. const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
  107. __m256i t0, t1;
  108. #define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) \
  109. do { \
  110. DECLARE_MESSAGE_WORDS(m) \
  111. const __m256i iv0 = a; \
  112. const __m256i iv1 = b; \
  113. __m256i c = LOAD(&blake2b_IV[0]); \
  114. __m256i d = \
  115. XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
  116. BLAKE2B_ROUNDS_V1(a, b, c, d, m); \
  117. a = XOR(a, c); \
  118. b = XOR(b, d); \
  119. a = XOR(a, iv0); \
  120. b = XOR(b, iv1); \
  121. } while (0)
  122. #endif