memcpy_noalignment.S 3.9 KB

/* memcpy for RISC-V, ignoring buffer alignment
   Copyright (C) 2024-2026 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */
  15. #include <sysdep.h>
  16. #include <sys/asm.h>
  17. /* memcpy optimization for CPUs with fast unaligned support
  18. (RISCV_HWPROBE_MISALIGNED_FAST).
  19. Copies are split into 3 main cases: small copies up to SZREG, copies up to
  20. BLOCK_SIZE (128 for 64 bits, 64 for 32 bits), and copies larger than BLOCK_SIZE.
  21. Large copies use a software pipelined loop processing BLOCK_SIZE bytes per
  22. iteration. The destination pointer is SZREG-byte aligned to minimize store
  23. unaligned accesses.
  24. The tail is handled with branchless copies. */
  25. #define BLOCK_SIZE (16 * SZREG)
  26. .attribute unaligned_access, 1
  27. ENTRY (__memcpy_noalignment)
  28. beq a2, zero, L(ret)
  29. /* if LEN < SZREG jump to tail handling. */
  30. li a5, SZREG-1
  31. mv a6, a0
  32. bleu a2, a5, L(tail)
  33. /* Copy the first word, align DEST to word, and adjust DEST/SRC/LEN
  34. based on the amount adjusted to align DEST. */
  35. REG_L a3, 0(a1)
  36. andi a5, a0, SZREG-1
  37. addi a2, a2, -SZREG
  38. li a4, SZREG
  39. sub a4, a4, a5
  40. REG_S a3, 0(a0)
  41. add a2, a5, a2
  42. /* If LEN < BLOCK_SIZE jump to word copy. */
  43. li a3, BLOCK_SIZE-1
  44. add a5, a0, a4
  45. add a1, a1, a4
  46. bleu a2, a3, L(word_copy_adjust)
  47. andi a7, a2, -BLOCK_SIZE
  48. add a3, a5, a7
  49. mv a4, a1
  50. L(block_copy):
  51. REG_L a6, 0(a4)
  52. REG_L t0, SZREG(a4)
  53. REG_L t1, (2*SZREG)(a4)
  54. REG_L t2, (3*SZREG)(a4)
  55. REG_L t3, (4*SZREG)(a4)
  56. REG_L t4, (5*SZREG)(a4)
  57. REG_L t5, (6*SZREG)(a4)
  58. REG_L t6, (7*SZREG)(a4)
  59. REG_S a6, 0(a5)
  60. REG_S t0, SZREG(a5)
  61. REG_S t1, (2*SZREG)(a5)
  62. REG_S t2, (3*SZREG)(a5)
  63. REG_S t3, (4*SZREG)(a5)
  64. REG_S t4, (5*SZREG)(a5)
  65. REG_S t5, (6*SZREG)(a5)
  66. REG_S t6, (7*SZREG)(a5)
  67. REG_L a6, (8*SZREG)(a4)
  68. REG_L t0, (9*SZREG)(a4)
  69. REG_L t1, (10*SZREG)(a4)
  70. REG_L t2, (11*SZREG)(a4)
  71. REG_L t3, (12*SZREG)(a4)
  72. REG_L t4, (13*SZREG)(a4)
  73. REG_L t5, (14*SZREG)(a4)
  74. REG_L t6, (15*SZREG)(a4)
  75. addi a4, a4, BLOCK_SIZE
  76. REG_S a6, (8*SZREG)(a5)
  77. REG_S t0, (9*SZREG)(a5)
  78. REG_S t1, (10*SZREG)(a5)
  79. REG_S t2, (11*SZREG)(a5)
  80. REG_S t3, (12*SZREG)(a5)
  81. REG_S t4, (13*SZREG)(a5)
  82. REG_S t5, (14*SZREG)(a5)
  83. REG_S t6, (15*SZREG)(a5)
  84. addi a5, a5, BLOCK_SIZE
  85. bne a5, a3, L(block_copy)
  86. add a1, a1, a7
  87. andi a2, a2, BLOCK_SIZE-1
  88. /* 0 <= a2/LEN < BLOCK_SIZE. */
  89. L(word_copy):
  90. li a5, SZREG-1
  91. /* if LEN < SZREG jump to tail handling. */
  92. bleu a2, a5, L(tail_adjust)
  93. andi a7, a2, -SZREG
  94. add a6, a3, a7
  95. mv a5, a1
  96. L(word_copy_loop):
  97. REG_L a4, 0(a5)
  98. addi a5, a5, SZREG
  99. REG_S a4, 0(a3)
  100. addi a3, a3, SZREG
  101. bne a3, a6, L(word_copy_loop)
  102. add a1, a1, a7
  103. andi a2, a2, SZREG-1
  104. /* Copy the last word unaligned. */
  105. add a1, a1, a2
  106. add a2, a6, a2
  107. REG_L a3, -SZREG(a1)
  108. REG_S a3, -SZREG(a2)
  109. ret
  110. L(tail):
  111. /* Copy 4-7 bytes. */
  112. andi a5, a2, 4
  113. add a3, a1, a2
  114. add a4, a6, a2
  115. beq a5, zero, L(copy_0_3)
  116. lw a2, 0(a1)
  117. lw t1, -4(a3)
  118. sw a2, 0(a6)
  119. sw t1, -4(a4)
  120. ret
  121. /* Copy 0-3 bytes. */
  122. L(copy_0_3):
  123. beq a2, zero, L(ret)
  124. lbu a3, -1(a3)
  125. srli a2, a2, 1
  126. add a5, a1, a2
  127. lbu a1, 0(a1)
  128. sb a3, -1(a4)
  129. lbu a4, 0(a5)
  130. add a2, a6, a2
  131. sb a1, 0(a6)
  132. sb a4, 0(a2)
  133. L(ret):
  134. ret
  135. L(tail_adjust):
  136. mv a6, a3
  137. j L(tail)
  138. L(word_copy_adjust):
  139. mv a3, a5
  140. j L(word_copy)
  141. END (__memcpy_noalignment)