memcmp.S 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206
  1. /* memcmp - compare memory
  2. Copyright (C) 2013-2026 Free Software Foundation, Inc.
  3. This file is part of the GNU C Library.
  4. The GNU C Library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. The GNU C Library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with the GNU C Library. If not, see
  14. <https://www.gnu.org/licenses/>. */
  15. #include <sysdep.h>
  16. /* Assumptions:
  17. *
  18. * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  19. */
  /* Register aliases used throughout memcmp.
     src1, src2 and limit are the three incoming arguments (x0-x2).
     result is the 32-bit view of x0, so writing result destroys src1.  */
  20. #define src1 x0
  21. #define src2 x1
  22. #define limit x2
  23. #define result w0
  /* Scratch registers holding data loaded from the two buffers.  The
     names ending in "w" are the 32-bit views of the same registers.  */
  24. #define data1 x3
  25. #define data1w w3
  26. #define data2 x4
  27. #define data2w w4
  28. #define data3 x5
  29. #define data3w w5
  30. #define data4 x6
  31. #define data4w w6
  /* tmp shares x6 with data4: using tmp overwrites data4, so tmp is only
     safe at points where data4 is no longer needed.  */
  32. #define tmp x6
  /* End pointers: memcmp sets these to src1 + limit and src2 + limit.  */
  33. #define src1end x7
  34. #define src2end x8
  /* memcmp: compare limit bytes of the buffers at src1 and src2.
     In:   src1 (x0), src2 (x1) = buffers, limit (x2) = byte count.
     Out:  result (w0) = 0 when all bytes are equal; otherwise negative or
           positive according to the first differing byte, compared as
           unsigned (the 1-byte tail path returns the byte difference,
           every other path returns -1 or 1).
     Uses: x0-x8, q0-q5 (64-byte loop only) and the condition flags.
           sp is never referenced.  */
  35. ENTRY (memcmp)
  36. cmp limit, 16
  37. b.lo L(less16)	/* 0-15 bytes: small-size ladder below.  */
  38. ldp data1, data3, [src1]	/* First 16 bytes of each buffer.  */
  39. ldp data2, data4, [src2]
  /* The flags still come from "cmp limit, 16".  When limit == 16 the first
     ccmp's condition (ne) fails and it forces NZCV = 0; that makes the
     second ccmp's condition (eq) fail as well, so b.ne is taken and
     L(return2) computes the final answer from the 16 bytes just loaded.
     When limit > 16 the pair tests data1 != data2 || data3 != data4.  */
  40. ccmp data1, data2, 0, ne
  41. ccmp data3, data4, 0, eq
  42. b.ne L(return2)
  43. add src1end, src1, limit	/* One past the last byte of each buffer.  */
  44. add src2end, src2, limit
  45. cmp limit, 32
  46. b.ls L(last_bytes)	/* 17-32 bytes: the final 16 bytes cover the rest.  */
  47. cmp limit, 160
  48. b.hs L(loop_align)	/* >= 160 bytes: use the 64-byte SIMD loop.  */
  49. sub limit, limit, 32	/* limit = bytes located beyond src1 + 32.  */
  50. .p2align 4
  /* Scalar loop, up to 32 bytes per iteration.  On entry all bytes below
     src1 + 16 have compared equal and limit counts the bytes beyond
     src1 + 32.  */
  51. L(loop32):
  52. ldp data1, data3, [src1, 16]
  53. ldp data2, data4, [src2, 16]
  54. cmp data1, data2
  55. ccmp data3, data4, 0, eq
  56. b.ne L(return2)
  57. cmp limit, 16
  58. b.ls L(last_bytes)	/* At most 16 bytes remain beyond src1 + 32.  */
  59. ldp data1, data3, [src1, 32]
  60. ldp data2, data4, [src2, 32]
  61. cmp data1, data2
  62. ccmp data3, data4, 0, eq
  63. b.ne L(return2)
  64. add src1, src1, 32
  65. add src2, src2, 32
  /* Here limit = bytes from src1 to the end of the buffer (also true when
     arriving from L(loop64)).  Loop again while more than 32 remain;
     otherwise at most 16 bytes beyond src1 + 16 are still unchecked.  */
  66. L(last64):
  67. subs limit, limit, 32
  68. b.hi L(loop32)
  69. /* Compare last 1-16 bytes using unaligned access. */
  70. L(last_bytes):
  71. ldp data1, data3, [src1end, -16]	/* May overlap bytes already compared.  */
  72. ldp data2, data4, [src2end, -16]
  /* Select the pair that decides the result: data1/data2 if they differ,
     otherwise data3/data4.  */
  73. L(return2):
  74. cmp data1, data2
  75. csel data1, data1, data3, ne
  76. csel data2, data2, data4, ne
  77. /* Compare data bytes and set return value to 0, -1 or 1. */
  78. L(return):
  /* On little-endian, byte-reverse so the lowest-addressed byte becomes the
     most significant; the unsigned compare below then orders the values by
     their first differing byte in memory.  */
  79. #ifndef __AARCH64EB__
  80. rev data1, data1
  81. rev data2, data2
  82. #endif
  83. cmp data1, data2
  84. cset result, ne	/* 0 if equal, 1 otherwise ...  */
  85. cneg result, result, lo	/* ... negated to -1 if data1 < data2 (unsigned).  */
  86. ret
  87. .p2align 4
  /* Sizes 0-15: dispatch on the bits of limit.  */
  88. L(less16):
  89. add src1end, src1, limit
  90. add src2end, src2, limit
  91. tbz limit, 3, L(less8)	/* Bit 3 clear: limit < 8.  */
  /* 8-15 bytes: first 8 and last 8 bytes (possibly overlapping).  */
  92. ldr data1, [src1]
  93. ldr data2, [src2]
  94. ldr data3, [src1end, -8]
  95. ldr data4, [src2end, -8]
  96. b L(return2)
  97. .p2align 4
  98. L(less8):
  99. tbz limit, 2, L(less4)	/* Bit 2 clear: limit < 4.  */
  /* 4-7 bytes: first 4 and last 4 bytes; the 32-bit loads zero the upper
     halves of the 64-bit registers compared at L(return2).  */
  100. ldr data1w, [src1]
  101. ldr data2w, [src2]
  102. ldr data3w, [src1end, -4]
  103. ldr data4w, [src2end, -4]
  104. b L(return2)
  105. L(less4):
  106. tbz limit, 1, L(less2)	/* Bit 1 clear: limit is 0 or 1.  */
  /* 2-3 bytes: compare the first halfword.  */
  107. ldrh data1w, [src1]
  108. ldrh data2w, [src2]
  109. cmp data1w, data2w
  110. b.ne L(return)
  111. L(less2):
  /* result is w0, the low half of src1, so src1 is lost from here on; the
     last byte is addressed through src1end/src2end instead.  */
  112. mov result, 0
  113. tbz limit, 0, L(return_zero)	/* Even limit: nothing left, return 0.  */
  114. ldrb data1w, [src1end, -1]
  115. ldrb data2w, [src2end, -1]
  116. sub result, data1w, data2w	/* Difference of the zero-extended last bytes.  */
  117. L(return_zero):
  118. ret
  /* Large sizes (limit >= 160).  The first 16 bytes are known equal.  */
  119. L(loop_align):
  120. ldp data1, data3, [src1, 16]	/* Check bytes 16-31 with scalar loads.  */
  121. ldp data2, data4, [src2, 16]
  122. cmp data1, data2
  123. ccmp data3, data4, 0, eq
  124. b.ne L(return2)
  125. /* Align src2 and adjust src1, src2 and limit. */
  /* tmp = (src2 & 15) - 16 lies in [-16, -1].  Subtracting it advances both
     pointers by 1-16 bytes so that src2 becomes 16-byte aligned; the bytes
     skipped were already compared above.  data4 (same register as tmp) is
     dead at this point.  */
  126. and tmp, src2, 15
  127. sub tmp, tmp, 16
  128. sub src2, src2, tmp
  129. add limit, limit, tmp
  130. sub src1, src1, tmp
  131. sub limit, limit, 64 + 16	/* Bias limit for the loop test below.  */
  132. .p2align 4
  /* 64 bytes per iteration, starting at src1 + 16 / src2 + 16.  Each pair of
     16-byte vectors is XORed (nonzero byte = mismatch) and the four XOR
     results are folded with pairwise maxima until the low 8 bytes of v0
     each summarize one 8-byte group of the 64-byte block.  */
  133. L(loop64):
  134. ldr q0, [src1, 16]
  135. ldr q1, [src2, 16]
  136. subs limit, limit, 64	/* Flags are consumed by the ccmp below.  */
  137. ldr q2, [src1, 32]
  138. ldr q3, [src2, 32]
  139. eor v0.16b, v0.16b, v1.16b
  140. eor v1.16b, v2.16b, v3.16b
  141. ldr q2, [src1, 48]
  142. ldr q3, [src2, 48]
  143. umaxp v0.16b, v0.16b, v1.16b
  144. ldr q4, [src1, 64]!	/* Pre-index: load from src1 + 64, then src1 += 64.  */
  145. ldr q5, [src2, 64]!
  146. eor v1.16b, v2.16b, v3.16b
  147. eor v2.16b, v4.16b, v5.16b
  148. umaxp v1.16b, v1.16b, v2.16b
  149. umaxp v0.16b, v0.16b, v1.16b
  150. umaxp v0.16b, v0.16b, v0.16b
  151. fmov tmp, d0	/* tmp = 8-byte mismatch summary of this block.  */
  /* If the subs left limit > 0, test tmp == 0; otherwise force "ne".  The
     loop therefore continues only while data remains and no mismatch was
     seen.  */
  152. ccmp tmp, 0, 0, hi
  153. b.eq L(loop64)
  154. /* If equal, process last 1-64 bytes using scalar loop. */
  155. add limit, limit, 64 + 16	/* Undo the bias: limit = bytes from src1 to the end.  */
  156. cbz tmp, L(last64)
  157. /* Determine the 8-byte aligned offset of the first difference. */
  /* On little-endian, clz of the byte-reversed summary, rounded down to a
     multiple of 8 by the bic, equals 8 * (index of the first nonzero summary
     byte), i.e. the byte offset of the first differing 8-byte group within
     the block.  After the write-back the block starts at src1 - 48, hence
     the subtraction of 48.  */
  158. #ifdef __AARCH64EB__
  159. rev16 tmp, tmp
  160. #endif
  161. rev tmp, tmp
  162. clz tmp, tmp
  163. bic tmp, tmp, 7
  164. sub tmp, tmp, 48
  165. ldr data1, [src1, tmp]	/* Reload the differing 8-byte group.  */
  166. ldr data2, [src2, tmp]
  167. #ifndef __AARCH64EB__
  168. rev data1, data1
  169. rev data2, data2
  170. #endif
  /* The group is known to differ, so the result is 1 or -1, never 0.  */
  171. mov result, 1
  172. cmp data1, data2
  173. cneg result, result, lo
  174. ret
  175. END (memcmp)
  /* Additional names for the routine above.  The macros used here are not
     defined in this file (presumably they come in through <sysdep.h> --
     confirm).  Each #undef removes any preprocessor macro of that name
     before the name is used as a symbol.  */
  176. #undef bcmp
  /* Going by the macro name, bcmp becomes a weak alias of memcmp.  */
  177. weak_alias (memcmp, bcmp)
  178. #undef __memcmpeq
  /* Going by the macro name, __memcmpeq becomes a strong alias of memcmp.
     NOTE(review): this presumably relies on __memcmpeq callers needing only
     a zero / nonzero answer, which memcmp's result satisfies -- confirm.  */
  179. strong_alias (memcmp, __memcmpeq)
  /* NOTE(review): the libc_hidden_* macros look like library-internal
     visibility definitions for the two symbols -- confirm against their
     definitions.  */
  180. libc_hidden_builtin_def (memcmp)
  181. libc_hidden_def (__memcmpeq)