memcpy.S 6.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. /* Copyright (C) 2000-2026 Free Software Foundation, Inc.
  2. This file is part of the GNU C Library.
  3. EV6 optimized by Rick Gorton <rick.gorton@alpha-processor.com>.
  4. The GNU C Library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. The GNU C Library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with the GNU C Library. If not, see
  14. <https://www.gnu.org/licenses/>. */
  15. /*
  16. * Much of the information about 21264 scheduling/coding comes from:
  17. * Compiler Writer's Guide for the Alpha 21264
  18. * abbreviated as 'CWG' in other comments here
  19. * ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
  20. * Scheduling notation:
  21. * E - either cluster
  22. * U - upper subcluster; U0 - subcluster U0; U1 - subcluster U1
  23. * L - lower subcluster; L0 - subcluster L0; L1 - subcluster L1
  24. *
  25. * Temp usage notes:
  26. * $0 - destination address (the return value)
  27. * $1-$7 - scratch
  28. */
  29. #include <sysdep.h>
  30. .arch ev6
  31. .set noreorder
  32. .set noat
  /*
   * memcpy - copy $18 bytes from the address in $17 to the address in $16.
   *
   * In:   $16 = destination, $17 = source, $18 = byte count.  The count is
   *       tested with signed branches, so a count <= 0 copies nothing.
   * Out:  $0 = the destination address as passed in.
   * Uses: $1-$7 as temporaries.  $16, $17 and $18 are modified.
   *
   * Paths, selected from the low three bits of the two addresses:
   *  - src and dest equal mod 8: copy bytes up to an 8-byte boundary
   *    ($head_align), then quadwords - single quads up to a 64-byte
   *    boundary and a 64-bytes-per-trip unrolled loop when at least 128
   *    bytes remain - then remaining quads and trailing bytes.
   *  - otherwise ($misaligned): copy bytes until dest is 8-byte aligned,
   *    then build each aligned store from two ldq_u loads merged with
   *    extql/extqh ($mis_quad), then trailing bytes.
   */
  33. ENTRY(memcpy)
  34. .prologue 0
  35. mov $16, $0 # E : copy dest to return
  36. ble $18, $nomoredata # U : done with the copy?
  37. xor $16, $17, $1 # E : are source and dest alignments the same?
  38. and $1, 7, $1 # E : are they the same mod 8?
  39. bne $1, $misaligned # U : Nope - gotta do this the slow way
  40. /* source and dest are same mod 8 address */
  41. and $16, 7, $1 # E : Are both 0mod8?
  42. beq $1, $both_0mod8 # U : Yes
  43. nop # E :
  44. /*
  45. * source and dest are same misalignment. move a byte at a time
  46. * until a 0mod8 alignment for both is reached.
  47. * At least one byte more to move
  48. */
  49. $head_align:
  50. ldbu $1, 0($17) # L : grab a byte
  51. subq $18, 1, $18 # E : count--
  52. addq $17, 1, $17 # E : src++
  53. stb $1, 0($16) # L : store the byte
  54. addq $16, 1, $16 # E : dest++
  55. and $16, 7, $1 # E : Are we at 0mod8 yet?
  56. ble $18, $nomoredata # U : done with the copy?
  57. bne $1, $head_align # U : not 0mod8 yet - copy another byte
  58. $both_0mod8:
  59. cmple $18, 127, $1 # E : Can we unroll the loop?
  60. bne $1, $no_unroll # U : count <= 127 - skip the unrolled loop
  61. and $16, 63, $1 # E : get mod64 alignment
  62. beq $1, $do_unroll # U : no single quads to fiddle
  63. $single_head_quad:
  64. ldq $1, 0($17) # L : get 8 bytes
  65. subq $18, 8, $18 # E : count -= 8
  66. addq $17, 8, $17 # E : src += 8
  67. nop # E :
  68. stq $1, 0($16) # L : store
  69. addq $16, 8, $16 # E : dest += 8
  70. and $16, 63, $1 # E : get mod64 alignment
  71. bne $1, $single_head_quad # U : still not fully aligned
  72. $do_unroll:
  73. addq $16, 64, $7 # E : Initial (+1 trip) wh64 address
  74. cmple $18, 127, $1 # E : Can we go through the unrolled loop?
  75. bne $1, $tail_quads # U : Nope
  76. nop # E :
  /*
   * Unrolled loop, 64 bytes per trip, copied as two 32-byte halves.
   * $7 is the address handed to wh64.  It starts at dest + 64 and
   * advances by 64 each trip.  When $18 - 192 is negative (the "two more
   * trips" test below) the cmovlt replaces it with the fallback in $1,
   * which is this trip's starting dest + 64.
   */
  77. $unroll_body:
  78. wh64 ($7) # L1 : memory subsystem hint: 64 bytes at
  79. # ($7) are about to be over-written
  80. ldq $6, 0($17) # L0 : bytes 0..7
  81. nop # E :
  82. nop # E :
  83. ldq $4, 8($17) # L : bytes 8..15
  84. ldq $5, 16($17) # L : bytes 16..23
  85. addq $7, 64, $7 # E : Update next wh64 address
  86. nop # E :
  87. ldq $3, 24($17) # L : bytes 24..31
  88. addq $16, 64, $1 # E : fallback value for wh64
  89. nop # E :
  90. nop # E :
  91. addq $17, 32, $17 # E : src += 32 bytes
  92. stq $6, 0($16) # L : bytes 0..7
  93. nop # E :
  94. nop # E :
  95. stq $4, 8($16) # L : bytes 8..15
  96. stq $5, 16($16) # L : bytes 16..23
  97. subq $18, 192, $2 # E : At least two more trips to go?
  98. nop # E :
  99. stq $3, 24($16) # L : bytes 24..31
  100. addq $16, 32, $16 # E : dest += 32 bytes
  101. nop # E :
  102. nop # E :
  103. ldq $6, 0($17) # L : bytes 0..7 of the second 32-byte half
  104. ldq $4, 8($17) # L : bytes 8..15
  105. cmovlt $2, $1, $7 # E : Latency 2, extra map slot - Use
  106. # fallback wh64 address if < 2 more trips
  107. nop # E :
  108. ldq $5, 16($17) # L : bytes 16..23
  109. ldq $3, 24($17) # L : bytes 24..31
  110. addq $16, 32, $16 # E : dest += 32
  111. subq $18, 64, $18 # E : count -= 64
  112. addq $17, 32, $17 # E : src += 32
  113. stq $6, -32($16) # L : bytes 0..7 (dest was already advanced by 32)
  114. stq $4, -24($16) # L : bytes 8..15
  115. cmple $18, 63, $1 # E : At least one more trip?
  116. stq $5, -16($16) # L : bytes 16..23
  117. stq $3, -8($16) # L : bytes 24..31
  118. nop # E :
  119. beq $1, $unroll_body # count > 63 - another 64-byte trip
  120. $tail_quads:
  121. $no_unroll:
  122. .align 4
  123. subq $18, 8, $18 # E : At least a quad left?
  124. blt $18, $less_than_8 # U : Nope
  125. nop # E :
  126. nop # E :
  127. $move_a_quad:
  128. ldq $1, 0($17) # L : fetch 8
  129. subq $18, 8, $18 # E : count -= 8
  130. addq $17, 8, $17 # E : src += 8
  131. nop # E :
  132. stq $1, 0($16) # L : store 8
  133. addq $16, 8, $16 # E : dest += 8
  134. bge $18, $move_a_quad # U : at least one more full quad
  135. nop # E :
  136. $less_than_8:
  137. .align 4
  138. addq $18, 8, $18 # E : add back for trailing bytes
  139. ble $18, $nomoredata # U : All-done
  140. nop # E :
  141. nop # E :
  142. /* Trailing bytes */
  143. $tail_bytes:
  144. subq $18, 1, $18 # E : count--
  145. ldbu $1, 0($17) # L : fetch a byte
  146. addq $17, 1, $17 # E : src++
  147. nop # E :
  148. stb $1, 0($16) # L : store a byte
  149. addq $16, 1, $16 # E : dest++
  150. bgt $18, $tail_bytes # U : more to be done?
  151. nop # E :
  152. /* branching to exit takes 3 extra cycles, so replicate exit here */
  153. ret $31, ($26), 1 # L0 : return, $0 still holds the original dest
  154. nop # E :
  155. nop # E :
  156. nop # E :
  /*
   * Source and dest differ mod 8.  $4 takes over as the running
   * destination pointer for the rest of this path, and $16 is reused as
   * a data register inside $mis_quad.
   */
  157. $misaligned:
  158. mov $0, $4 # E : dest temp
  159. and $0, 7, $1 # E : dest alignment mod8
  160. beq $1, $dest_0mod8 # U : life doesn't totally suck
  161. nop
  162. $aligndest:
  163. ble $18, $nomoredata # U : count ran out before dest was aligned
  164. ldbu $1, 0($17) # L : fetch a byte
  165. subq $18, 1, $18 # E : count--
  166. addq $17, 1, $17 # E : src++
  167. stb $1, 0($4) # L : store it
  168. addq $4, 1, $4 # E : dest++
  169. and $4, 7, $1 # E : dest 0mod8 yet?
  170. bne $1, $aligndest # U : go until we are aligned.
  171. /* Source has unknown alignment, but dest is known to be 0mod8 */
  172. $dest_0mod8:
  173. subq $18, 8, $18 # E : At least a quad left?
  174. blt $18, $misalign_tail # U : Nope
  175. ldq_u $3, 0($17) # L : seed (rotating load) of 8 bytes
  176. nop # E :
  /*
   * $3 holds the previously loaded source quadword and $16 the next one.
   * extql and extqh, each keyed by the source address in $17, produce the
   * two pieces, which are OR-ed into the aligned quadword stored at ($4).
   */
  177. $mis_quad:
  178. ldq_u $16, 8($17) # L : Fetch next 8
  179. extql $3, $17, $3 # U : masking
  180. extqh $16, $17, $1 # U : masking
  181. bis $3, $1, $1 # E : merged bytes to store
  182. subq $18, 8, $18 # E : count -= 8
  183. addq $17, 8, $17 # E : src += 8
  184. stq $1, 0($4) # L : store 8 (aligned)
  185. mov $16, $3 # E : "rotate" source data
  186. addq $4, 8, $4 # E : dest += 8
  187. bge $18, $mis_quad # U : More quads to move
  188. nop
  189. nop
  190. $misalign_tail:
  191. addq $18, 8, $18 # E : account for tail stuff
  192. ble $18, $nomoredata # U : nothing left - done
  193. nop
  194. nop
  195. $misalign_byte:
  196. ldbu $1, 0($17) # L : fetch 1
  197. subq $18, 1, $18 # E : count--
  198. addq $17, 1, $17 # E : src++
  199. nop # E :
  200. stb $1, 0($4) # L : store
  201. addq $4, 1, $4 # E : dest++
  202. bgt $18, $misalign_byte # U : more to go?
  203. nop
  204. $nomoredata:
  205. ret $31, ($26), 1 # L0 : return, $0 still holds the original dest
  206. nop # E :
  207. nop # E :
  208. nop # E :
  209. END(memcpy)
  210. libc_hidden_builtin_def (memcpy)