rshift.S 2.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105
  1. # Alpha 21064 __mpn_rshift --
  2. # Copyright (C) 1994-2026 Free Software Foundation, Inc.
  3. # This file is part of the GNU MP Library.
  4. # The GNU MP Library is free software; you can redistribute it and/or modify
  5. # it under the terms of the GNU Lesser General Public License as published by
  6. # the Free Software Foundation; either version 2.1 of the License, or (at your
  7. # option) any later version.
  8. # The GNU MP Library is distributed in the hope that it will be useful, but
  9. # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  10. # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
  11. # License for more details.
  12. # You should have received a copy of the GNU Lesser General Public License
  13. # along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>.
  14. # INPUT PARAMETERS
  15. # res_ptr r16
  16. # s1_ptr r17
  17. # size r18
  18. # cnt r19
  19. # This code runs at 4.8 cycles/limb on the 21064. With infinite unrolling,
  20. # it would take 4 cycles/limb. It should be possible to get down to 3
  21. # cycles/limb since both ldq and stq can be paired with the other used
  22. # instructions. But there are many restrictions in the 21064 pipeline that
  23. # make it hard, if not impossible, to get down to 3 cycles/limb:
  24. # 1. ldq has a 3 cycle delay, srl and sll have a 2 cycle delay.
  25. # 2. Only aligned instruction pairs can be paired.
  26. # 3. The store buffer or silo might not be able to deal with the bandwidth.
  # __mpn_rshift: shift the size-limb vector at s1_ptr ($17) right by cnt ($19)
  # bits, write the size result limbs to res_ptr ($16), and return in $0 the
  # bits shifted out of the first limb (placed at the high end of the register).
  # Limbs are 64-bit quadwords (ldq/stq with a stride of 8 bytes), processed
  # from the lowest address upwards.
  #
  # Register use inside the routine:
  #   $3, $4          source limbs (alternately "previous" and "next")
  #   $1, $2, $5, $6  partial results: low part (srl) and high part (sll)
  #   $7              -cnt, used as the left-shift count (acts as 64-cnt because
  #                   the shift instructions only use the low 6 bits of the count)
  #   $8              combined result limb about to be stored
  #   $18             limbs left for the 4-way unrolled loop
  #   $20             limbs left for the one-limb-at-a-time lead-in loop
  #
  # NOTE(review): the code reads the first limb unconditionally and computes
  # size-1, so it presumably requires size >= 1 -- confirm against callers.
  # NOTE(review): with cnt == 0 the left-shift count $7 is also 0, so the
  # sll/bis combination would OR whole limbs together; presumably callers
  # guarantee 1 <= cnt <= 63 -- confirm.
  27. .set noreorder # ask the assembler to keep the instruction order exactly as written
  28. .set noat # ask the assembler not to use its temporary register on our behalf
  29. .text
  30. .align 3 # 2^3 = 8-byte alignment, i.e. an instruction-pair boundary
  31. .globl __mpn_rshift
  32. .ent __mpn_rshift
  33. __mpn_rshift:
  34. .frame $30,0,$26,0 # zero-size stack frame; return address is in $26
  35. ldq $4,0($17) # $4 = first (lowest-address) source limb
  36. addq $17,8,$17 # s1_ptr now points at the second limb
  37. subq $31,$19,$7 # $7 = 0 - cnt ($31 reads as zero): left-shift count, acts as 64-cnt
  38. subq $18,1,$18 # $18 = size-1 = limbs still to be loaded
  39. and $18,4-1,$20 # $20 = (size-1) mod 4 = limbs handled by the lead-in loop .Loop0
  40. sll $4,$7,$0 # function result: bits shifted out of the first limb, moved to the top of $0
  41. beq $20,.L0 # nothing for the lead-in loop: go straight to the unrolled part
  42. subq $18,$20,$18 # $18 = limbs left for the unrolled loop (now a multiple of 4)
  43. .align 3
  44. .Loop0: # lead-in loop: one result limb per iteration, $4 = previous source limb
  45. ldq $3,0($17) # $3 = next source limb
  46. addq $16,8,$16 # advance res_ptr early; the store below uses offset -8
  47. addq $17,8,$17 # advance s1_ptr
  48. subq $20,1,$20 # one fewer lead-in limb
  49. srl $4,$19,$5 # low part: previous limb >> cnt
  50. sll $3,$7,$6 # high part: next limb << (64-cnt)
  51. bis $3,$3,$4 # register move: next limb becomes the previous limb
  52. bis $5,$6,$8 # combine both parts into the result limb
  53. stq $8,-8($16) # store at the res_ptr slot from before the increment
  54. bne $20,.Loop0
  55. .L0: beq $18,.Lend # no groups of four left: only the top limb remains
  56. .align 3
  57. .Loop: ldq $3,0($17) # unrolled loop: four result limbs per iteration; on entry $4 = previous limb
  58. addq $16,32,$16 # advance res_ptr by 4 limbs up front; stores use offsets -32..-8
  59. subq $18,4,$18 # four fewer limbs to go
  60. srl $4,$19,$5 # result 0, low part
  61. sll $3,$7,$6 # result 0, high part
  62. ldq $4,8($17) # source limb for result 1's high part
  63. srl $3,$19,$1 # result 1, low part
  64. bis $5,$6,$8 # result 0 complete
  65. stq $8,-32($16)
  66. sll $4,$7,$2 # result 1, high part
  67. ldq $3,16($17) # source limb for result 2's high part
  68. srl $4,$19,$5 # result 2, low part
  69. bis $1,$2,$8 # result 1 complete
  70. stq $8,-24($16)
  71. sll $3,$7,$6 # result 2, high part
  72. ldq $4,24($17) # source limb for result 3's high part; stays in $4 as "previous" for the next round
  73. srl $3,$19,$1 # result 3, low part
  74. bis $5,$6,$8 # result 2 complete
  75. stq $8,-16($16)
  76. sll $4,$7,$2 # result 3, high part
  77. addq $17,32,$17 # advance s1_ptr by 4 limbs
  78. bis $1,$2,$8 # result 3 complete
  79. stq $8,-8($16)
  80. bgt $18,.Loop # $18 counts down in steps of 4 from a multiple of 4
  81. .Lend: srl $4,$19,$8 # top result limb: last source limb >> cnt (srl fills the high bits with zeros)
  82. stq $8,0($16) # store it at the current res_ptr
  83. ret $31,($26),1 # return to the caller through $26; result is in $0
  84. .end __mpn_rshift