fe51_nsquare.S 2.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. #ifdef IN_SANDY2X
  2. /*
  3. This file is adapted from amd64-51/fe25519_square.s,
  4. with a loop added to perform n successive squarings.
  5. */
  6. #include "fe51_namespace.h"
  7. #include "consts_namespace.h"
  .p2align 5
#ifdef ASM_HIDE_SYMBOL
  ASM_HIDE_SYMBOL fe51_nsquare
  ASM_HIDE_SYMBOL _fe51_nsquare
#endif
  .globl fe51_nsquare
  .globl _fe51_nsquare
#ifdef __ELF__
  .type fe51_nsquare, @function
  .type _fe51_nsquare, @function
#endif

/*
 * fe51_nsquare / _fe51_nsquare  (x86-64, GNU as, AT&T syntax)
 *
 * Squares a five-limb value n times in a row.
 *
 * Arguments, as read by the code (System V AMD64 register order):
 *   rdi = out  destination, five 64-bit limbs at byte offsets 0, 8, 16, 24, 32
 *   rsi = in   source, same layout; all five limbs are loaded before out is
 *              written, so in and out may be the same buffer
 *   rdx = n    number of squarings, used as a full 64-bit count
 *
 * n == 0 copies in to out unchanged.  (Previously the counter was decremented
 * before it was ever tested, so n == 0 wrapped to 2^64-1 iterations.)
 *
 * NOTE(review): the whole of rdx is used as the count.  If the C prototype
 * declares n as a 32-bit int, the upper half of rdx is not defined by the ABI;
 * confirm against the header that declares this function.
 *
 * Representation, as implied by the arithmetic below: limbs are radix-2^51
 * digits.  128-bit partial sums are split at bit 51 (shld $13 plus an AND with
 * REDMASK51, which is defined elsewhere and presumably equals 2^51-1), and
 * anything at or above 2^255 is folded back into limb 0 multiplied by 19.
 *
 * On return limbs 1..4 have been masked with REDMASK51; limb 0 is a masked
 * value plus 19 times the final carry and is not masked again.
 *
 * Register roles inside the loop (f0..f4 = limbs entering the iteration):
 *   rcx = f0, r8 = f1, 16/24/32(%rdi) = f2/f3/f4, rsi = iterations left
 *   r10:r9 = h0, r12:r11 = h1, r14:r13 = h2, rbx:r15 = h3, rbp:rcx = h4
 *
 * Clobbers rax, rcx, rdx, rsi, r8-r11 and flags.
 * rbx, rbp and r12-r15 are saved in the local frame and restored.
 * Leaf routine: makes no calls.
 */
fe51_nsquare:
_fe51_nsquare:
  /* Frame: r11 = 64 + (rsp mod 32), so the new rsp is 32-byte aligned with at
     least 64 bytes free.  r11 itself is stored so the epilogue can undo it. */
  mov     %rsp,%r11
  and     $31,%r11
  add     $64,%r11
  sub     %r11,%rsp
  movq    %r11,0(%rsp)
  movq    %r12,8(%rsp)
  movq    %r13,16(%rsp)
  movq    %r14,24(%rsp)
  movq    %r15,32(%rsp)
  movq    %rbx,40(%rsp)
  movq    %rbp,48(%rsp)

  /* Load all five input limbs.  Limbs 0 and 1 stay in rcx/r8 for the whole
     loop; limbs 2..4 live in the output buffer. */
  movq    0(%rsi),%rcx
  movq    8(%rsi),%r8
  movq    16(%rsi),%r9
  movq    24(%rsi),%rax
  movq    32(%rsi),%rsi
  movq    %r9,16(%rdi)
  movq    %rax,24(%rdi)
  movq    %rsi,32(%rdi)
  mov     %rdx,%rsi                   /* rsi = iteration counter */

  /* Zero squarings requested: only the two register-held limbs are left
     to store. */
  test    %rsi,%rsi
  je      ._nsquare_done

  .p2align 4
._loop:
  sub     $1,%rsi                     /* counter is tested at the loop bottom */

  /* Products involving f0.  rcx becomes 2*f0 after the square. */
  mov     %rcx,%rax
  mul     %rcx                        /* f0*f0 */
  add     %rcx,%rcx                   /* rcx = 2*f0 */
  mov     %rax,%r9
  mov     %rdx,%r10                   /* h0 = f0^2 */
  mov     %rcx,%rax
  mul     %r8
  mov     %rax,%r11
  mov     %rdx,%r12                   /* h1 = 2*f0*f1 */
  mov     %rcx,%rax
  mulq    16(%rdi)
  mov     %rax,%r13
  mov     %rdx,%r14                   /* h2 = 2*f0*f2 */
  mov     %rcx,%rax
  mulq    24(%rdi)
  mov     %rax,%r15
  mov     %rdx,%rbx                   /* h3 = 2*f0*f3 */
  mov     %rcx,%rax
  mulq    32(%rdi)
  mov     %rax,%rcx
  mov     %rdx,%rbp                   /* h4 = 2*f0*f4 (rcx no longer holds f0) */

  /* Products involving f1.  r8 becomes 2*f1, then 38*f1. */
  mov     %r8,%rax
  mul     %r8                         /* f1*f1 */
  add     %r8,%r8                     /* r8 = 2*f1; done before the add/adc pair */
  add     %rax,%r13
  adc     %rdx,%r14                   /* h2 += f1^2 */
  mov     %r8,%rax
  mulq    16(%rdi)
  add     %rax,%r15
  adc     %rdx,%rbx                   /* h3 += 2*f1*f2 */
  mov     %r8,%rax
  imulq   $19, %r8,%r8                /* r8 = 38*f1 */
  mulq    24(%rdi)
  add     %rax,%rcx
  adc     %rdx,%rbp                   /* h4 += 2*f1*f3 */
  mov     %r8,%rax
  mulq    32(%rdi)
  add     %rax,%r9
  adc     %rdx,%r10                   /* h0 += 38*f1*f4 */

  /* Remaining products.  Each hN is split as soon as it is complete:
     shld $13 leaves (hN >> 51) in the high register. */
  movq    16(%rdi),%rax
  mulq    16(%rdi)
  add     %rax,%rcx
  adc     %rdx,%rbp                   /* h4 += f2^2 */
  shld    $13,%rcx,%rbp               /* rbp = h4 >> 51 */
  movq    16(%rdi),%rax
  imulq   $38, %rax,%rax
  mulq    24(%rdi)
  add     %rax,%r9
  adc     %rdx,%r10                   /* h0 += 38*f2*f3 */
  shld    $13,%r9,%r10                /* r10 = h0 >> 51 */
  movq    16(%rdi),%rax
  imulq   $38, %rax,%rax
  mulq    32(%rdi)
  add     %rax,%r11
  adc     %rdx,%r12                   /* h1 += 38*f2*f4 */
  movq    24(%rdi),%rax
  imulq   $19, %rax,%rax
  mulq    24(%rdi)
  add     %rax,%r11
  adc     %rdx,%r12                   /* h1 += 19*f3^2 */
  shld    $13,%r11,%r12               /* r12 = h1 >> 51 */
  movq    24(%rdi),%rax
  imulq   $38, %rax,%rax
  mulq    32(%rdi)
  add     %rax,%r13
  adc     %rdx,%r14                   /* h2 += 38*f3*f4 */
  shld    $13,%r13,%r14               /* r14 = h2 >> 51 */
  movq    32(%rdi),%rax
  imulq   $19, %rax,%rax
  mulq    32(%rdi)
  add     %rax,%r15
  adc     %rdx,%rbx                   /* h3 += 19*f4^2 */
  shld    $13,%r15,%rbx               /* rbx = h3 >> 51 */

  /* Mask every low word and add the high part of the limb below it.
     The high part of h4 wraps around to limb 0 with a factor of 19. */
  movq    REDMASK51(%rip),%rdx        /* rdx is free now: no mul follows */
  and     %rdx,%rcx
  add     %rbx,%rcx                   /* limb 4 = lo(h4) + hi(h3) */
  and     %rdx,%r9
  and     %rdx,%r11
  add     %r10,%r11                   /* limb 1 = lo(h1) + hi(h0) */
  and     %rdx,%r13
  add     %r12,%r13                   /* limb 2 = lo(h2) + hi(h1) */
  and     %rdx,%r15
  add     %r14,%r15                   /* limb 3 = lo(h3) + hi(h2) */
  imulq   $19, %rbp,%rbp
  lea     (%r9,%rbp),%r9              /* limb 0 = lo(h0) + 19*hi(h4) */

  /* Sequential carry pass 0 -> 1 -> 2 -> 3 -> 4 -> 0.  r9 carries the running
     value; each limb is copied out and masked before r9 is shifted on. */
  mov     %r9,%rax
  shr     $51,%r9
  add     %r11,%r9
  and     %rdx,%rax                   /* rax = limb 0 before the final fold */
  mov     %r9,%r8
  shr     $51,%r9
  add     %r13,%r9
  and     %rdx,%r8                    /* r8 = new f1 */
  mov     %r9,%r10
  shr     $51,%r9
  add     %r15,%r9
  and     %rdx,%r10
  movq    %r10,16(%rdi)               /* new f2 */
  mov     %r9,%r10
  shr     $51,%r9
  add     %rcx,%r9
  and     %rdx,%r10
  movq    %r10,24(%rdi)               /* new f3 */
  mov     %r9,%r10
  shr     $51,%r9
  imulq   $19, %r9,%r9                /* carry out of limb 4, times 19 */
  lea     (%rax,%r9),%rcx             /* rcx = new f0 */
  and     %rdx,%r10
  movq    %r10,32(%rdi)               /* new f4 */

  test    %rsi,%rsi                   /* same ZF result as cmp $0,%rsi */
  jne     ._loop

._nsquare_done:
  /* Limbs 2..4 are already in the output buffer; store the two that were
     kept in registers. */
  movq    %rcx,0(%rdi)
  movq    %r8,8(%rdi)

  /* Restore callee-saved registers and release the frame. */
  movq    0(%rsp),%r11
  movq    8(%rsp),%r12
  movq    16(%rsp),%r13
  movq    24(%rsp),%r14
  movq    32(%rsp),%r15
  movq    40(%rsp),%rbx
  movq    48(%rsp),%rbp
  add     %r11,%rsp
  ret
  166. #endif