/* blake2b-compress-sse41.c (2.5 KB) -- header line from the file viewer; any leading "N." line numbers in this listing are viewer residue, not C source. */
  1. #define BLAKE2_USE_SSSE3
  2. #define BLAKE2_USE_SSE41
  3. #include <stdint.h>
  4. #include <string.h>
  5. #include "blake2.h"
  6. #include "private/common.h"
  7. #include "private/sse2_64_32.h"
  8. #if defined(HAVE_EMMINTRIN_H) && defined(HAVE_TMMINTRIN_H) && \
  9. defined(HAVE_SMMINTRIN_H)
  10. # ifdef __GNUC__
  11. # pragma GCC target("sse2")
  12. # pragma GCC target("ssse3")
  13. # pragma GCC target("sse4.1")
  14. # endif
  15. # include <emmintrin.h>
  16. # include <smmintrin.h>
  17. # include <tmmintrin.h>
  18. # include "blake2b-compress-sse41.h"
  19. CRYPTO_ALIGN(64)
  20. static const uint64_t blake2b_IV[8] = {
  21. 0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL, 0x3c6ef372fe94f82bULL,
  22. 0xa54ff53a5f1d36f1ULL, 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
  23. 0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
  24. };
  25. int
  26. blake2b_compress_sse41(blake2b_state *S,
  27. const uint8_t block[BLAKE2B_BLOCKBYTES])
  28. {
  29. __m128i row1l, row1h;
  30. __m128i row2l, row2h;
  31. __m128i row3l, row3h;
  32. __m128i row4l, row4h;
  33. __m128i b0, b1;
  34. __m128i t0, t1;
  35. const __m128i r16 =
  36. _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
  37. const __m128i r24 =
  38. _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
  39. const __m128i m0 = LOADU(block + 00);
  40. const __m128i m1 = LOADU(block + 16);
  41. const __m128i m2 = LOADU(block + 32);
  42. const __m128i m3 = LOADU(block + 48);
  43. const __m128i m4 = LOADU(block + 64);
  44. const __m128i m5 = LOADU(block + 80);
  45. const __m128i m6 = LOADU(block + 96);
  46. const __m128i m7 = LOADU(block + 112);
  47. row1l = LOADU(&S->h[0]);
  48. row1h = LOADU(&S->h[2]);
  49. row2l = LOADU(&S->h[4]);
  50. row2h = LOADU(&S->h[6]);
  51. row3l = LOADU(&blake2b_IV[0]);
  52. row3h = LOADU(&blake2b_IV[2]);
  53. row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
  54. row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
  55. ROUND(0);
  56. ROUND(1);
  57. ROUND(2);
  58. ROUND(3);
  59. ROUND(4);
  60. ROUND(5);
  61. ROUND(6);
  62. ROUND(7);
  63. ROUND(8);
  64. ROUND(9);
  65. ROUND(10);
  66. ROUND(11);
  67. row1l = _mm_xor_si128(row3l, row1l);
  68. row1h = _mm_xor_si128(row3h, row1h);
  69. STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
  70. STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
  71. row2l = _mm_xor_si128(row4l, row2l);
  72. row2h = _mm_xor_si128(row4h, row2h);
  73. STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
  74. STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
  75. return 0;
  76. }
  77. #endif