memcpy.S 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246
  1. /* Generic optimized memcpy using SIMD.
  2. Copyright (C) 2012-2026 Free Software Foundation, Inc.
  3. This file is part of the GNU C Library.
  4. The GNU C Library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. The GNU C Library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with the GNU C Library. If not, see
  14. <https://www.gnu.org/licenses/>. */
  15. #include <sysdep.h>
  16. /* Assumptions:
  17. *
  18. * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  19. *
  20. */
  21. #define dstin x0 /* Destination pointer argument; only read, never written, in this file. */
  22. #define src x1 /* Source pointer argument; rounded down and advanced on the forward long-copy path. */
  23. #define count x2 /* Byte count argument; adjusted and counted down on the long-copy paths. */
  24. #define dst x3 /* Working destination pointer of the forward long copy. */
  25. #define srcend x4 /* src + count, computed at both entry points. */
  26. #define dstend x5 /* dstin + count, computed at both entry points. */
  27. #define A_l x6 /* Holds the first 8 bytes in the 8..15 byte copy. */
  28. #define A_lw w6 /* 32-bit view of x6; used by the 4..7 and 1..3 byte copies. */
  29. #define A_h x7 /* Holds the last 8 bytes in the 8..15 byte copy. */
  30. #define B_l x8 /* Not referenced in this listing; only its 32-bit view B_lw is used. */
  31. #define B_lw w8 /* 32-bit view of x8; used by the 4..7 and 1..3 byte copies. */
  32. #define B_h x9 /* Not referenced in this listing. */
  33. #define C_lw w10 /* Holds the last byte in the 1..3 byte copy. */
  34. #define tmp1 x14 /* Scratch: byte offset, misalignment amount, or dstin - src. */
  35. #define A_q q0 /* A_q..H_q: 16-byte SIMD registers used as copy buffers. */
  36. #define B_q q1
  37. #define C_q q2
  38. #define D_q q3
  39. #define E_q q4
  40. #define F_q q5
  41. #define G_q q6
  42. #define H_q q7
  43. #ifndef MEMMOVE /* Default entry-point name; presumably an including file may predefine it -- TODO confirm against the build. */
  44. # define MEMMOVE memmove
  45. #endif
  46. #ifndef MEMCPY /* Default entry-point name; presumably an including file may predefine it -- TODO confirm against the build. */
  47. # define MEMCPY memcpy
  48. #endif
  49. /* This implementation supports both memcpy and memmove and shares most code.
  50. It uses unaligned accesses and branchless sequences to keep the code small,
  51. simple and improve performance.
  52. Copies are split into 3 main cases: small copies of up to 32 bytes, medium
  53. copies of up to 128 bytes, and large copies. The overhead of the overlap
  54. check in memmove is negligible since it is only required for large copies.
  55. Large copies use a software pipelined loop processing 64 bytes per
  56. iteration. The source pointer is 16-byte aligned to minimize
  57. unaligned accesses. The loop tail is handled by always copying 64 bytes
  58. from the end. */
  59. ENTRY (MEMCPY) /* In: dstin (x0), src (x1), count (x2). x0 is never written, so it still holds the destination pointer at every ret. */
  60. add srcend, src, count /* srcend = one past the last source byte. */
  61. add dstend, dstin, count /* dstend = one past the last destination byte. */
  62. cmp count, 128
  63. b.hi L(copy_long) /* count > 128 (unsigned compare). */
  64. cmp count, 32
  65. b.hi L(copy32_128) /* 33..128 bytes. */
  66. /* Small copies: 0..32 bytes. */
  67. cmp count, 16
  68. b.lo L(copy16) /* count < 16. */
  69. ldr A_q, [src] /* 16..32 bytes: load the first 16 bytes... */
  70. ldr B_q, [srcend, -16] /* ...and the last 16 bytes; the two ranges overlap when count < 32. */
  71. str A_q, [dstin] /* Both loads are issued before either store. */
  72. str B_q, [dstend, -16]
  73. ret
  74. /* Copy 8-15 bytes. */
  75. L(copy16):
  76. tbz count, 3, L(copy8) /* Bit 3 clear with count < 16 means count < 8. */
  77. ldr A_l, [src] /* First 8 bytes. */
  78. ldr A_h, [srcend, -8] /* Last 8 bytes (overlaps the first 8). */
  79. str A_l, [dstin]
  80. str A_h, [dstend, -8]
  81. ret
  82. /* Copy 4-7 bytes. */
  83. L(copy8):
  84. tbz count, 2, L(copy4) /* Bit 2 clear with count < 8 means count < 4. */
  85. ldr A_lw, [src] /* First 4 bytes. */
  86. ldr B_lw, [srcend, -4] /* Last 4 bytes (overlaps the first 4). */
  87. str A_lw, [dstin]
  88. str B_lw, [dstend, -4]
  89. ret
  90. /* Copy 0..3 bytes using a branchless sequence. */
  91. L(copy4):
  92. cbz count, L(copy0) /* count == 0: nothing to copy. */
  93. lsr tmp1, count, 1 /* tmp1 = count / 2: 0 for count 1, 1 for count 2 or 3. */
  94. ldrb A_lw, [src] /* First byte. */
  95. ldrb C_lw, [srcend, -1] /* Last byte. */
  96. ldrb B_lw, [src, tmp1] /* Byte at offset count / 2. */
  97. strb A_lw, [dstin] /* For count 1 or 2 some of these three stores hit the same byte. */
  98. strb B_lw, [dstin, tmp1]
  99. strb C_lw, [dstend, -1]
  100. L(copy0):
  101. ret
  102. .p2align 4
  103. /* Medium copies: 33..128 bytes. */
  104. L(copy32_128):
  105. ldp A_q, B_q, [src] /* First 32 bytes. */
  106. ldp C_q, D_q, [srcend, -32] /* Last 32 bytes. */
  107. cmp count, 64
  108. b.hi L(copy128) /* count > 64. */
  109. stp A_q, B_q, [dstin] /* 33..64 bytes: the first 32 plus the last 32 cover the whole buffer. */
  110. stp C_q, D_q, [dstend, -32]
  111. ret
  112. .p2align 4
  113. /* Copy 65..128 bytes. */
  114. L(copy128):
  115. ldp E_q, F_q, [src, 32] /* Source bytes 32..63. */
  116. cmp count, 96
  117. b.ls L(copy96) /* count <= 96: the first 64 plus the last 32 bytes suffice. */
  118. ldp G_q, H_q, [srcend, -64] /* 97..128 bytes: also fetch the 32 bytes at srcend - 64. */
  119. stp G_q, H_q, [dstend, -64] /* First store on this path; every load has already been issued. */
  120. L(copy96):
  121. stp A_q, B_q, [dstin]
  122. stp E_q, F_q, [dstin, 32]
  123. stp C_q, D_q, [dstend, -32]
  124. ret
  125. /* Align loop64 below to 16 bytes. */
  126. nop
  127. /* Copy more than 128 bytes. */
  128. L(copy_long): /* Also branched to from MEMMOVE when a forward copy is safe. */
  129. /* Copy 16 bytes and then align src to 16-byte alignment. */
  130. ldr D_q, [src] /* First 16 bytes, loaded before src is rounded down. */
  131. and tmp1, src, 15 /* tmp1 = misalignment of src (0..15). */
  132. bic src, src, 15 /* Round src down to a 16-byte boundary. */
  133. sub dst, dstin, tmp1 /* Move dst back by the same amount so dst and src stay in step. */
  134. add count, count, tmp1 /* Count is now 16 too large. */
  135. ldp A_q, B_q, [src, 16] /* Preload 64 bytes starting at aligned src + 16 into A..D. */
  136. str D_q, [dstin] /* Store the unaligned 16-byte head. */
  137. ldp C_q, D_q, [src, 48]
  138. subs count, count, 128 + 16 /* Test and readjust count. */
  139. b.ls L(copy64_from_end) /* Taken when the subtraction borrowed or gave zero: skip the loop. */
  140. L(loop64):
  141. stp A_q, B_q, [dst, 16] /* Store the 64 bytes loaded previously... */
  142. ldp A_q, B_q, [src, 80] /* ...interleaved with loads of the next 64 bytes. */
  143. stp C_q, D_q, [dst, 48]
  144. ldp C_q, D_q, [src, 112]
  145. add src, src, 64
  146. add dst, dst, 64
  147. subs count, count, 64
  148. b.hi L(loop64) /* Repeat while the result is nonzero and there was no borrow. */
  149. /* Write the last iteration and copy 64 bytes from the end. */
  150. L(copy64_from_end):
  151. ldp E_q, F_q, [srcend, -64] /* The final 64 bytes are addressed from the end of the buffer. */
  152. stp A_q, B_q, [dst, 16] /* Flush the 64 bytes still held in A..D. */
  153. ldp A_q, B_q, [srcend, -32] /* A_q/B_q are free for reuse after the store above. */
  154. stp C_q, D_q, [dst, 48]
  155. stp E_q, F_q, [dstend, -64] /* These tail stores may overlap bytes already written above. */
  156. stp A_q, B_q, [dstend, -32]
  157. ret
  158. END (MEMCPY)
  159. libc_hidden_builtin_def (MEMCPY) /* Macro not defined in this listing. */
  160. ENTRY (MEMMOVE) /* In: dstin (x0), src (x1), count (x2); overlapping buffers are handled. x0 is never written. */
  161. add srcend, src, count /* srcend = one past the last source byte. */
  162. add dstend, dstin, count /* dstend = one past the last destination byte. */
  163. cmp count, 128
  164. b.hi L(move_long) /* count > 128: the only case that needs the overlap check. */
  165. cmp count, 32
  166. b.hi L(copy32_128) /* 33..128 bytes: shared with MEMCPY; that path issues all loads before any store. */
  167. /* Small moves: 0..32 bytes. */
  168. cmp count, 16
  169. b.lo L(copy16) /* count < 16: shared with MEMCPY. */
  170. ldr A_q, [src] /* 16..32 bytes: load the first 16 bytes... */
  171. ldr B_q, [srcend, -16] /* ...and the last 16 bytes; the two ranges overlap when count < 32. */
  172. str A_q, [dstin] /* Both loads are issued before either store. */
  173. str B_q, [dstend, -16]
  174. ret
  175. L(move_long):
  176. /* Only use backward copy if there is an overlap. */
  177. sub tmp1, dstin, src /* tmp1 = dstin - src; wraps to a large unsigned value when dstin is below src. */
  178. cbz tmp1, L(move0) /* dstin == src: nothing to do. */
  179. cmp tmp1, count
  180. b.hs L(copy_long) /* Unsigned dstin - src >= count: dstin is not inside (src, src + count), so copy forwards. */
  181. /* Large backwards copy for overlapping copies.
  182. Copy 16 bytes and then align srcend to 16-byte alignment. */
  183. L(copy_long_backwards):
  184. ldr D_q, [srcend, -16] /* Last 16 bytes, loaded before srcend is rounded down. */
  185. and tmp1, srcend, 15 /* tmp1 = misalignment of srcend (0..15). */
  186. bic srcend, srcend, 15 /* Round srcend down to a 16-byte boundary. */
  187. sub count, count, tmp1 /* The tmp1 bytes above the boundary are covered by the D_q store below. */
  188. ldp A_q, B_q, [srcend, -32] /* Preload the 64 bytes below the aligned srcend into A..D. */
  189. str D_q, [dstend, -16] /* Store the unaligned 16-byte tail. */
  190. ldp C_q, D_q, [srcend, -64]
  191. sub dstend, dstend, tmp1 /* Keep dstend in step with the aligned srcend. */
  192. subs count, count, 128
  193. b.ls L(copy64_from_start) /* Adjusted count <= 128: skip the loop. */
  194. L(loop64_backwards):
  195. str B_q, [dstend, -16] /* Store the 64 bytes loaded previously, highest address first... */
  196. str A_q, [dstend, -32]
  197. ldp A_q, B_q, [srcend, -96] /* ...interleaved with loads of the next lower 64 bytes. */
  198. str D_q, [dstend, -48]
  199. str C_q, [dstend, -64]! /* Pre-index with writeback: dstend -= 64. */
  200. ldp C_q, D_q, [srcend, -128]
  201. sub srcend, srcend, 64
  202. subs count, count, 64
  203. b.hi L(loop64_backwards) /* Repeat while the result is nonzero and there was no borrow. */
  204. /* Write the last iteration and copy 64 bytes from the start. */
  205. L(copy64_from_start):
  206. ldp E_q, F_q, [src, 32] /* src and dstin are unmodified on the backward path, so they still address the buffer starts. */
  207. stp A_q, B_q, [dstend, -32] /* Flush the 64 bytes still held in A..D. */
  208. ldp A_q, B_q, [src] /* A_q/B_q are free for reuse after the store above. */
  209. stp C_q, D_q, [dstend, -64]
  210. stp E_q, F_q, [dstin, 32] /* These head stores may overlap bytes already written above. */
  211. stp A_q, B_q, [dstin]
  212. L(move0):
  213. ret
  214. END (MEMMOVE)
  215. libc_hidden_builtin_def (MEMMOVE) /* Macro not defined in this listing. */