lshift.S 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107
  1. # Alpha 21064 __mpn_lshift -- shift a vector of limbs left by cnt bits.
  2. # Copyright (C) 1994-2026 Free Software Foundation, Inc.
  3. # This file is part of the GNU MP Library.
  4. # The GNU MP Library is free software; you can redistribute it and/or modify
  5. # it under the terms of the GNU Lesser General Public License as published by
  6. # the Free Software Foundation; either version 2.1 of the License, or (at your
  7. # option) any later version.
  8. # The GNU MP Library is distributed in the hope that it will be useful, but
  9. # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  10. # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
  11. # License for more details.
  12. # You should have received a copy of the GNU Lesser General Public License
  13. # along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>.
  14. # INPUT PARAMETERS
  15. # res_ptr r16
  16. # s1_ptr r17
  17. # size r18
  18. # cnt r19
  19. # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
  20. # it would take 4 cycles/limb. It should be possible to get down to 3
  21. # cycles/limb since both ldq and stq can be paired with the other used
  22. # instructions. But there are many restrictions in the 21064 pipeline that
  23. # make it hard, if not impossible, to get down to 3 cycles/limb:
  24. # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
  25. # 2. Only aligned instruction pairs can be paired.
  26. # 3. The store buffer or silo might not be able to deal with the bandwidth.
  27. .set noreorder # keep the hand-scheduled instruction order exactly as written
  28. .set noat # do not let the assembler use the $at temporary register
  29. .text
  30. .align 3 # 8-byte alignment, so instructions start on an aligned pair boundary
  31. .globl __mpn_lshift
  32. .ent __mpn_lshift
      # __mpn_lshift: shift the size-limb vector at s1_ptr left by cnt bits and
      # store the size result limbs at res_ptr. Limbs are processed from the
      # highest address down to the lowest.
      #
      # In:   $16 = res_ptr, $17 = s1_ptr, $18 = size (limbs), $19 = cnt (bits)
      # Out:  $0  = highest limb of s1 shifted right by (64 - cnt), i.e. the
      #             bits shifted out at the top
      # Uses: $1-$8 and $20 as scratch; no stack frame; returns through $26.
      #
      # NOTE(review): the first limb is loaded unconditionally and the right
      # shifts use a count of -cnt, so this appears to assume size >= 1 and
      # 1 <= cnt <= 63 -- confirm against callers.
  33. __mpn_lshift:
  34. .frame $30,0,$26,0 # frame size 0, return address in $26
  35. s8addq $18,$17,$17 # r17 = s1_ptr + 8*size: just past the last limb of s1
  36. ldq $4,-8($17) # r4 = highest-addressed limb of s1 (first limb processed)
  37. subq $17,8,$17 # r17 now points at the limb just loaded
  38. subq $31,$19,$7 # r7 = 0 - cnt ($31 reads as zero); shifts use only the low 6 bits of the count, so srl by r7 shifts by 64 - cnt
  39. s8addq $18,$16,$16 # r16 = res_ptr + 8*size: just past the last limb of RES
  40. subq $18,1,$18 # r18 = size - 1 = number of adjacent limb pairs left to combine
  41. and $18,4-1,$20 # r20 = (size - 1) mod 4 = number of limbs in first loop
  42. srl $4,$7,$0 # function result: bits of the top limb shifted out to the left
  43. beq $20,.L0 # no leftover limbs: go straight to the unrolled loop
  44. subq $18,$20,$18 # r18 = remaining count for the unrolled loop (a multiple of 4)
  45. .align 3
      # First loop: one result limb per iteration, run r20 times, so that the
      # 4-way unrolled loop below is left with a multiple of 4 limbs.
      # On entry to each iteration r4 holds the higher limb of the pair.
  46. .Loop0:
  47. ldq $3,-8($17) # r3 = next lower limb of s1
  48. subq $16,8,$16 # step result pointer down one limb
  49. subq $17,8,$17 # step source pointer down one limb
  50. subq $20,1,$20 # one fewer limb left in this loop
  51. sll $4,$19,$5 # r5 = higher limb << cnt
  52. srl $3,$7,$6 # r6 = lower limb >> (64 - cnt)
  53. bis $3,$3,$4 # r4 = r3: lower limb becomes the higher limb for the next pair
  54. bis $5,$6,$8 # r8 = combined result limb
  55. stq $8,0($16)
  56. bne $20,.Loop0
  57. .L0: beq $18,.Lend # nothing left for the unrolled loop
  58. .align 3
      # Main loop: four result limbs per iteration. r3 and r4 alternate as the
      # "lower" and "higher" limb of each pair so no register move is needed;
      # r5/r1 hold the left-shifted parts, r6/r2 the right-shifted parts.
      # Results go to offsets 24, 16, 8, 0 from the already-decremented r16,
      # i.e. from the highest address to the lowest.
  59. .Loop: ldq $3,-8($17)
  60. subq $16,32,$16 # step result pointer down four limbs
  61. subq $18,4,$18 # four fewer limbs remaining
  62. sll $4,$19,$5 # r5 = higher limb (r4) << cnt
  63. srl $3,$7,$6 # r6 = lower limb (r3) >> (64 - cnt)
  64. ldq $4,-16($17)
  65. sll $3,$19,$1 # r1 = r3 << cnt (r3 is now the higher limb)
  66. bis $5,$6,$8
  67. stq $8,24($16) # 1st result limb of this iteration
  68. srl $4,$7,$2 # r2 = r4 >> (64 - cnt)
  69. ldq $3,-24($17)
  70. sll $4,$19,$5
  71. bis $1,$2,$8
  72. stq $8,16($16) # 2nd result limb
  73. srl $3,$7,$6
  74. ldq $4,-32($17)
  75. sll $3,$19,$1
  76. bis $5,$6,$8
  77. stq $8,8($16) # 3rd result limb
  78. srl $4,$7,$2
  79. subq $17,32,$17 # step source pointer down four limbs
  80. bis $1,$2,$8
  81. stq $8,0($16) # 4th result limb; r4 is left holding the lowest limb loaded
  82. bgt $18,.Loop # repeat while limbs remain
      # Tail: r4 holds the lowest-addressed limb of s1; it has no lower
      # neighbour, so its result is just the left shift.
  83. .Lend: sll $4,$19,$8
  84. stq $8,-8($16) # store the lowest result limb, one limb below the last store
  85. ret $31,($26),1 # return to the address in $26
  86. .end __mpn_lshift