u1.h 3.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. while (bytes >= 64) {
  2. __m128i x_0, x_1, x_2, x_3;
  3. __m128i t_1;
  4. const __m128i rot16 =
  5. _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
  6. const __m128i rot8 =
  7. _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3);
  8. uint32_t in12;
  9. uint32_t in13;
  10. int i;
  11. x_0 = _mm_loadu_si128((const __m128i*) (x + 0));
  12. x_1 = _mm_loadu_si128((const __m128i*) (x + 4));
  13. x_2 = _mm_loadu_si128((const __m128i*) (x + 8));
  14. x_3 = _mm_loadu_si128((const __m128i*) (x + 12));
  15. for (i = 0; i < ROUNDS; i += 2) {
  16. x_0 = _mm_add_epi32(x_0, x_1);
  17. x_3 = _mm_xor_si128(x_3, x_0);
  18. x_3 = _mm_shuffle_epi8(x_3, rot16);
  19. x_2 = _mm_add_epi32(x_2, x_3);
  20. x_1 = _mm_xor_si128(x_1, x_2);
  21. t_1 = x_1;
  22. x_1 = _mm_slli_epi32(x_1, 12);
  23. t_1 = _mm_srli_epi32(t_1, 20);
  24. x_1 = _mm_xor_si128(x_1, t_1);
  25. x_0 = _mm_add_epi32(x_0, x_1);
  26. x_3 = _mm_xor_si128(x_3, x_0);
  27. x_0 = _mm_shuffle_epi32(x_0, 0x93);
  28. x_3 = _mm_shuffle_epi8(x_3, rot8);
  29. x_2 = _mm_add_epi32(x_2, x_3);
  30. x_3 = _mm_shuffle_epi32(x_3, 0x4e);
  31. x_1 = _mm_xor_si128(x_1, x_2);
  32. x_2 = _mm_shuffle_epi32(x_2, 0x39);
  33. t_1 = x_1;
  34. x_1 = _mm_slli_epi32(x_1, 7);
  35. t_1 = _mm_srli_epi32(t_1, 25);
  36. x_1 = _mm_xor_si128(x_1, t_1);
  37. x_0 = _mm_add_epi32(x_0, x_1);
  38. x_3 = _mm_xor_si128(x_3, x_0);
  39. x_3 = _mm_shuffle_epi8(x_3, rot16);
  40. x_2 = _mm_add_epi32(x_2, x_3);
  41. x_1 = _mm_xor_si128(x_1, x_2);
  42. t_1 = x_1;
  43. x_1 = _mm_slli_epi32(x_1, 12);
  44. t_1 = _mm_srli_epi32(t_1, 20);
  45. x_1 = _mm_xor_si128(x_1, t_1);
  46. x_0 = _mm_add_epi32(x_0, x_1);
  47. x_3 = _mm_xor_si128(x_3, x_0);
  48. x_0 = _mm_shuffle_epi32(x_0, 0x39);
  49. x_3 = _mm_shuffle_epi8(x_3, rot8);
  50. x_2 = _mm_add_epi32(x_2, x_3);
  51. x_3 = _mm_shuffle_epi32(x_3, 0x4e);
  52. x_1 = _mm_xor_si128(x_1, x_2);
  53. x_2 = _mm_shuffle_epi32(x_2, 0x93);
  54. t_1 = x_1;
  55. x_1 = _mm_slli_epi32(x_1, 7);
  56. t_1 = _mm_srli_epi32(t_1, 25);
  57. x_1 = _mm_xor_si128(x_1, t_1);
  58. }
  59. x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((const __m128i*) (x + 0)));
  60. x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((const __m128i*) (x + 4)));
  61. x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((const __m128i*) (x + 8)));
  62. x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((const __m128i*) (x + 12)));
  63. x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((const __m128i*) (m + 0)));
  64. x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((const __m128i*) (m + 16)));
  65. x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((const __m128i*) (m + 32)));
  66. x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((const __m128i*) (m + 48)));
  67. _mm_storeu_si128((__m128i*) (c + 0), x_0);
  68. _mm_storeu_si128((__m128i*) (c + 16), x_1);
  69. _mm_storeu_si128((__m128i*) (c + 32), x_2);
  70. _mm_storeu_si128((__m128i*) (c + 48), x_3);
  71. in12 = x[12];
  72. in13 = x[13];
  73. in12++;
  74. if (in12 == 0) {
  75. in13++;
  76. }
  77. x[12] = in12;
  78. x[13] = in13;
  79. bytes -= 64;
  80. c += 64;
  81. m += 64;
  82. }