blake2b-compress-ssse3.c 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990
  1. #include <stdint.h>
  2. #include <string.h>
  3. #include "blake2.h"
  4. #include "private/common.h"
  5. #include "private/sse2_64_32.h"
  6. #if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H)
  7. # ifdef __GNUC__
  8. # pragma GCC target("sse2")
  9. # pragma GCC target("ssse3")
  10. # endif
  11. # include <emmintrin.h>
  12. # include <tmmintrin.h>
  13. # include "blake2b-compress-ssse3.h"
  14. CRYPTO_ALIGN(64)
  15. static const uint64_t blake2b_IV[8] = {
  16. 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL,
  17. 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
  18. 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
  19. };
  20. int
  21. blake2b_compress_ssse3(blake2b_state *S,
  22. const uint8_t block[BLAKE2B_BLOCKBYTES])
  23. {
  24. __m128i row1l, row1h;
  25. __m128i row2l, row2h;
  26. __m128i row3l, row3h;
  27. __m128i row4l, row4h;
  28. __m128i b0, b1;
  29. __m128i t0, t1;
  30. const __m128i r16 =
  31. _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
  32. const __m128i r24 =
  33. _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
  34. const uint64_t m0 = ((const uint64_t *) block)[0];
  35. const uint64_t m1 = ((const uint64_t *) block)[1];
  36. const uint64_t m2 = ((const uint64_t *) block)[2];
  37. const uint64_t m3 = ((const uint64_t *) block)[3];
  38. const uint64_t m4 = ((const uint64_t *) block)[4];
  39. const uint64_t m5 = ((const uint64_t *) block)[5];
  40. const uint64_t m6 = ((const uint64_t *) block)[6];
  41. const uint64_t m7 = ((const uint64_t *) block)[7];
  42. const uint64_t m8 = ((const uint64_t *) block)[8];
  43. const uint64_t m9 = ((const uint64_t *) block)[9];
  44. const uint64_t m10 = ((const uint64_t *) block)[10];
  45. const uint64_t m11 = ((const uint64_t *) block)[11];
  46. const uint64_t m12 = ((const uint64_t *) block)[12];
  47. const uint64_t m13 = ((const uint64_t *) block)[13];
  48. const uint64_t m14 = ((const uint64_t *) block)[14];
  49. const uint64_t m15 = ((const uint64_t *) block)[15];
  50. row1l = LOADU(&S->h[0]);
  51. row1h = LOADU(&S->h[2]);
  52. row2l = LOADU(&S->h[4]);
  53. row2h = LOADU(&S->h[6]);
  54. row3l = LOADU(&blake2b_IV[0]);
  55. row3h = LOADU(&blake2b_IV[2]);
  56. row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
  57. row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
  58. ROUND(0);
  59. ROUND(1);
  60. ROUND(2);
  61. ROUND(3);
  62. ROUND(4);
  63. ROUND(5);
  64. ROUND(6);
  65. ROUND(7);
  66. ROUND(8);
  67. ROUND(9);
  68. ROUND(10);
  69. ROUND(11);
  70. row1l = _mm_xor_si128(row3l, row1l);
  71. row1h = _mm_xor_si128(row3h, row1h);
  72. STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
  73. STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
  74. row2l = _mm_xor_si128(row4l, row2l);
  75. row2h = _mm_xor_si128(row4h, row2h);
  76. STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
  77. STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
  78. return 0;
  79. }
  80. #endif