mul_1.S 2.6 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. # Alpha 21064 __mpn_mul_1 -- Multiply a limb vector with a limb and store
  2. # the result in a second limb vector.
  3. # Copyright (C) 1992-2026 Free Software Foundation, Inc.
  4. # This file is part of the GNU MP Library.
  5. # The GNU MP Library is free software; you can redistribute it and/or modify
  6. # it under the terms of the GNU Lesser General Public License as published by
  7. # the Free Software Foundation; either version 2.1 of the License, or (at your
  8. # option) any later version.
  9. # The GNU MP Library is distributed in the hope that it will be useful, but
  10. # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  11. # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
  12. # License for more details.
  13. # You should have received a copy of the GNU Lesser General Public License
  14. # along with the GNU MP Library. If not, see <https://www.gnu.org/licenses/>.
  15. # INPUT PARAMETERS
  16. # res_ptr r16
  17. # s1_ptr r17
  18. # size r18
  19. # s2_limb r19
  20. # This code runs at 42 cycles/limb on the EV4 and 18 cycles/limb on the EV5.
  21. # To improve performance for long multiplications, we would use
  22. # 'fetch' for S1 and 'fetch_m' for RES. It's not obvious how to use
  23. # these instructions without slowing down the general code: 1. We can
  24. # only have two prefetches in operation at any time in the Alpha
  25. # architecture. 2. There will seldom be any special alignment
  26. # between RES_PTR and S1_PTR. Maybe we can simply divide the current
  27. # loop into an inner and outer loop, having the inner loop handle
  28. # exactly one prefetch block?
  29. .set noreorder # ask the assembler to keep the instruction order exactly as written
  30. .set noat # do not let the assembler use its temporary register behind our back
  31. .text # code section
  32. .align 3 # align the entry point (2^3 = 8 bytes)
  33. .globl __mpn_mul_1 # export the symbol
  34. .ent __mpn_mul_1 2
  #
  # __mpn_mul_1 -- multiply the limb vector at s1_ptr ($17), `size' ($18)
  # limbs long, by the single limb s2_limb ($19).  For each source limb the
  # low 64 bits of (limb * s2_limb + carry-in) are stored to the matching
  # slot at res_ptr ($16); the carry-out feeds the next limb.  At return $0
  # holds the final carry limb (high half of the last product plus the last
  # carry bit).
  #
  # Registers modified: $0, $2, $3, $4 and, on the multi-limb paths, the
  # argument registers $16, $17, $18.  No stack is used; .frame below
  # declares a zero-size frame.
  #
  # size == 0 is not handled: the first limb is loaded and multiplied before
  # any test of size, so the code as written needs size >= 1.
  #
  # The code is software-pipelined: the load of the next limb and the high
  # half of the current product are started while the previous limb's carry
  # is still being folded in.  The instruction order is deliberate.
  #
  35. __mpn_mul_1:
  36. .frame $30,0,$26 # frame: base $30, size 0, return address in $26
  37. ldq $2,0($17) # $2 = s1_ptr[0]
  38. subq $18,1,$18 # size-- (now = limbs left after the first)
  39. mulq $2,$19,$3 # $3 = low 64 bits of s1_ptr[0] * s2_limb
  40. bic $31,$31,$4 # $4 = 0 ($31 reads as zero): no pending high half yet
  41. umulh $2,$19,$0 # $0 = high 64 bits of s1_ptr[0] * s2_limb
  42. beq $18,Lend1 # size was 1: just store the low half and return
  43. ldq $2,8($17) # $2 = s1_ptr[1], the next limb to multiply
  44. subq $18,1,$18 # size-- (now = limbs left after the one in $2)
  45. stq $3,0($16) # res_ptr[0] = low half of the first product
  46. beq $18,Lend2 # size was 2: $2 is the last limb, skip the loop
  47. .align 3 # align the loop head
  #
  # Loop invariants at the top of each iteration:
  #   $2  = limb to multiply now; $17 points one limb behind it
  #   $4  = high half of the previous product (0 on first entry)
  #   $0  = carry bit from the previous add (on first entry: the high
  #         half of the first product)
  #   $16 = address of the most recently stored result limb
  #   $18 = number of limbs still to load after the one in $2 (> 0 here)
  #
  48. Loop: mulq $2,$19,$3 # $3 = low half of current product
  49. addq $4,$0,$0 # $0 = carry-in limb = previous high half + previous carry
  50. subq $18,1,$18 # one fewer limb left to load
  51. umulh $2,$19,$4 # $4 = high half of current product (used next round)
  52. ldq $2,16($17) # $2 = next limb; offset 16 because $17 is not yet advanced
  53. addq $17,8,$17 # s1_ptr++
  54. addq $3,$0,$3 # $3 = low half + carry-in limb
  55. stq $3,8($16) # store result limb one slot past the previous store
  56. cmpult $3,$0,$0 # $0 = 1 if the add above wrapped (sum < addend), else 0
  57. addq $16,8,$16 # res_ptr++
  58. bne $18,Loop # repeat until $2 holds the last limb
  #
  # Final limb: same arithmetic as one loop iteration, but with no further
  # load or pointer update, and the carry limb is completed in $0.
  #
  59. Lend2: mulq $2,$19,$3 # $3 = low half of last product
  60. addq $4,$0,$0 # $0 = carry-in limb = previous high half + previous carry
  61. umulh $2,$19,$4 # $4 = high half of last product
  62. addq $3,$0,$3 # $3 = low half + carry-in limb
  63. cmpult $3,$0,$0 # $0 = carry bit out of that add
  64. stq $3,8($16) # store the last result limb
  65. addq $4,$0,$0 # $0 = final carry limb = last high half + carry bit
  66. ret $31,($26),1 # return to the address in $26; result is in $0
  #
  # size == 1: $3/$0 already hold the low/high halves of the only product.
  #
  67. Lend1: stq $3,0($16) # res_ptr[0] = low half; $0 still holds the high half
  68. ret $31,($26),1 # return to the address in $26; result is in $0
  69. .end __mpn_mul_1