| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206 |
- /* memcmp - compare memory
- Copyright (C) 2013-2026 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <https://www.gnu.org/licenses/>. */
- #include <sysdep.h>
- /* Assumptions:
- *
- * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
- */
- #define src1 x0 /* Pointer into the first buffer (x0). */
- #define src2 x1 /* Pointer into the second buffer (x1). */
- #define limit x2 /* Byte count still to be compared (x2). */
- #define result w0 /* Return value; shares register 0 with src1. */
- #define data1 x3 /* Data loaded from the src1 side. */
- #define data1w w3 /* 32-bit view of data1. */
- #define data2 x4 /* Data loaded from the src2 side. */
- #define data2w w4 /* 32-bit view of data2. */
- #define data3 x5 /* Second word loaded from the src1 side. */
- #define data3w w5 /* 32-bit view of data3. */
- #define data4 x6 /* Second word loaded from the src2 side. */
- #define data4w w6 /* 32-bit view of data4. */
- #define tmp x6 /* Scratch; same register as data4, so the two are never live together. */
- #define src1end x7 /* src1 + limit, one past the end of the first buffer. */
- #define src2end x8 /* src2 + limit, one past the end of the second buffer. */
- ENTRY (memcmp) /* Compare limit (x2) bytes at src1 (x0) with those at src2 (x1).
-    The result is left in w0: 0 when no difference is found, otherwise a
-    negative or positive value according to the unsigned order of the
-    first differing bytes. */
- cmp limit, 16 /* Sizes below 16 take the small-size paths. */
- b.lo L(less16)
- ldp data1, data3, [src1] /* First 16 bytes of each buffer. */
- ldp data2, data4, [src2]
- ccmp data1, data2, 0, ne /* Flags still come from cmp limit, 16: when limit == 16 this sets NZCV to 0, which reads as "ne". */
- ccmp data3, data4, 0, eq /* Only compare the second words if the first words matched. */
- b.ne L(return2) /* Taken on a mismatch, and always when limit == 16; return2 then produces the final result. */
- add src1end, src1, limit /* One-past-the-end pointers, used for the tail compare. */
- add src2end, src2, limit
- cmp limit, 32
- b.ls L(last_bytes) /* 17-32 bytes: the final 16 bytes cover the remainder. */
- cmp limit, 160
- b.hs L(loop_align) /* 160 bytes or more: use the 64-byte SIMD loop. */
- sub limit, limit, 32 /* limit = bytes beyond the first 32. */
- .p2align 4
- L(loop32): /* Scalar loop, up to 32 bytes per iteration; the 16 bytes at src1/src2 have already been compared. */
- ldp data1, data3, [src1, 16]
- ldp data2, data4, [src2, 16]
- cmp data1, data2
- ccmp data3, data4, 0, eq
- b.ne L(return2)
- cmp limit, 16
- b.ls L(last_bytes) /* At most 16 bytes remain past offset 32. */
- ldp data1, data3, [src1, 32]
- ldp data2, data4, [src2, 32]
- cmp data1, data2
- ccmp data3, data4, 0, eq
- b.ne L(return2)
- add src1, src1, 32
- add src2, src2, 32
- L(last64): /* Also entered from the SIMD loop exit when no difference was found. */
- subs limit, limit, 32
- b.hi L(loop32) /* Loop while more than 32 bytes remain past src1/src2. */
- /* Compare last 1-16 bytes using unaligned access. The final 16 bytes
-    are reloaded, so they may overlap data that was already compared. */
- L(last_bytes):
- ldp data1, data3, [src1end, -16]
- ldp data2, data4, [src2end, -16]
- L(return2): /* Select the first pair that differs: data1/data2, or data3/data4 when the first pair matches. */
- cmp data1, data2
- csel data1, data1, data3, ne
- csel data2, data2, data4, ne
- /* Compare data bytes and set return value to 0, -1 or 1. */
- L(return):
- #ifndef __AARCH64EB__
- rev data1, data1 /* Little-endian: byte-reverse so the lowest-addressed byte becomes most significant. */
- rev data2, data2
- #endif
- cmp data1, data2
- cset result, ne /* 1 if different, 0 if equal. */
- cneg result, result, lo /* Negate to -1 when data1 is lower (unsigned). */
- ret
- .p2align 4
- L(less16): /* 0-15 bytes. */
- add src1end, src1, limit
- add src2end, src2, limit
- tbz limit, 3, L(less8) /* Bit 3 set means 8-15 bytes. */
- ldr data1, [src1] /* First 8 and last 8 bytes; the two loads may overlap. */
- ldr data2, [src2]
- ldr data3, [src1end, -8]
- ldr data4, [src2end, -8]
- b L(return2)
- .p2align 4
- L(less8): /* 0-7 bytes. */
- tbz limit, 2, L(less4) /* Bit 2 set means 4-7 bytes. */
- ldr data1w, [src1] /* First 4 and last 4 bytes as 32-bit loads; the two loads may overlap. */
- ldr data2w, [src2]
- ldr data3w, [src1end, -4]
- ldr data4w, [src2end, -4]
- b L(return2)
- L(less4): /* 0-3 bytes. */
- tbz limit, 1, L(less2) /* Bit 1 set means 2-3 bytes. */
- ldrh data1w, [src1] /* Compare the first two bytes. */
- ldrh data2w, [src2]
- cmp data1w, data2w
- b.ne L(return)
- L(less2): /* Only the last byte, if any, is left to check. */
- mov result, 0 /* Overwrites src1 (same register); only src1end/src2end are used below. */
- tbz limit, 0, L(return_zero) /* Even size: nothing left, return 0. */
- ldrb data1w, [src1end, -1]
- ldrb data2w, [src2end, -1]
- sub result, data1w, data2w /* Byte difference; its sign gives the ordering. */
- L(return_zero):
- ret
- L(loop_align): /* Sizes of 160 bytes or more; the first 16 bytes already matched. */
- ldp data1, data3, [src1, 16] /* Check bytes 16-31 before aligning. */
- ldp data2, data4, [src2, 16]
- cmp data1, data2
- ccmp data3, data4, 0, eq
- b.ne L(return2)
- /* Align src2 and adjust src1, src2 and limit. */
- and tmp, src2, 15
- sub tmp, tmp, 16 /* tmp = (src2 & 15) - 16, in the range -16..-1. */
- sub src2, src2, tmp /* src2 advances to the next 16-byte boundary. */
- add limit, limit, tmp /* limit shrinks by the same amount. */
- sub src1, src1, tmp /* src1 advances by the same amount. */
- sub limit, limit, 64 + 16 /* Bias limit for the loop test; undone after the loop. */
- .p2align 4
- L(loop64): /* Compare 64 bytes per iteration, at offsets 16-79 from src1/src2. */
- ldr q0, [src1, 16]
- ldr q1, [src2, 16]
- subs limit, limit, 64 /* Sets the flags consumed by the ccmp below. */
- ldr q2, [src1, 32]
- ldr q3, [src2, 32]
- eor v0.16b, v0.16b, v1.16b /* XOR is nonzero exactly where the bytes differ. */
- eor v1.16b, v2.16b, v3.16b
- ldr q2, [src1, 48]
- ldr q3, [src2, 48]
- umaxp v0.16b, v0.16b, v1.16b /* Pairwise max folds the difference bytes together. */
- ldr q4, [src1, 64]! /* Pre-index writeback: src1 += 64. */
- ldr q5, [src2, 64]! /* Pre-index writeback: src2 += 64. */
- eor v1.16b, v2.16b, v3.16b
- eor v2.16b, v4.16b, v5.16b
- umaxp v1.16b, v1.16b, v2.16b
- umaxp v0.16b, v0.16b, v1.16b
- umaxp v0.16b, v0.16b, v0.16b /* The low 8 bytes of v0 now summarize the 64 bytes, one byte per 8 input bytes. */
- fmov tmp, d0
- ccmp tmp, 0, 0, hi /* If bytes remain (hi from subs), test tmp against 0; otherwise force "ne". */
- b.eq L(loop64) /* Continue while no difference was seen and more data remains. */
- /* If equal, process last 1-64 bytes using scalar loop. */
- add limit, limit, 64 + 16 /* Remove the bias applied before the loop. */
- cbz tmp, L(last64)
- /* Determine the 8-byte aligned offset of the first difference. */
- #ifdef __AARCH64EB__
- rev16 tmp, tmp
- #endif
- rev tmp, tmp
- clz tmp, tmp
- bic tmp, tmp, 7 /* Round the leading-zero count down to a multiple of 8. */
- sub tmp, tmp, 48 /* src1/src2 already advanced by 64, so the block just compared starts at offset -48. */
- ldr data1, [src1, tmp] /* Reload the differing 8-byte chunk from each buffer. */
- ldr data2, [src2, tmp]
- #ifndef __AARCH64EB__
- rev data1, data1
- rev data2, data2
- #endif
- mov result, 1 /* The chunks differ, so the result is 1 or -1. */
- cmp data1, data2
- cneg result, result, lo /* -1 when data1 is lower (unsigned). */
- ret
- END (memcmp)
- #undef bcmp
- weak_alias (memcmp, bcmp) /* NOTE(review): presumably exports bcmp as a weak alias of memcmp; the macro is defined outside this file -- confirm. */
- #undef __memcmpeq
- strong_alias (memcmp, __memcmpeq) /* NOTE(review): presumably makes __memcmpeq a strong alias of memcmp -- confirm against the macro definition. */
- libc_hidden_builtin_def (memcmp) /* NOTE(review): symbol-visibility macro defined outside this file -- confirm its effect. */
- libc_hidden_def (__memcmpeq)
|