memchr.c 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. /* Copyright (C) 2010-2026 Free Software Foundation, Inc.
  2. This file is part of the GNU C Library.
  3. The GNU C Library is free software; you can redistribute it and/or
  4. modify it under the terms of the GNU Lesser General Public
  5. License as published by the Free Software Foundation; either
  6. version 2.1 of the License, or (at your option) any later version.
  7. The GNU C Library is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10. Lesser General Public License for more details.
  11. You should have received a copy of the GNU Lesser General Public
  12. License along with the GNU C Library. If not, see
  13. <https://www.gnu.org/licenses/>. */
  14. #include <string.h>
  15. typedef unsigned long word;
  16. static inline word
  17. ldq_u(const void *s)
  18. {
  19. return *(const word *)((word)s & -8);
  20. }
  21. #define unlikely(X) __builtin_expect ((X), 0)
  22. #define prefetch(X) __builtin_prefetch ((void *)(X), 0)
  23. #define cmpbeq0(X) __builtin_alpha_cmpbge(0, (X))
  24. #define find(X, Y) cmpbeq0 ((X) ^ (Y))
  25. /* Search no more than N bytes of S for C. */
  26. void *
  27. __memchr (const void *s, int xc, size_t n)
  28. {
  29. const word *s_align;
  30. word t, current, found, mask, offset;
  31. if (unlikely (n == 0))
  32. return 0;
  33. current = ldq_u (s);
  34. /* Replicate low byte of XC into all bytes of C. */
  35. t = xc & 0xff; /* 0000000c */
  36. t = (t << 8) | t; /* 000000cc */
  37. t = (t << 16) | t; /* 0000cccc */
  38. const word c = (t << 32) | t; /* cccccccc */
  39. /* Align the source, and decrement the count by the number
  40. of bytes searched in the first word. */
  41. s_align = (const word *)((word)s & -8);
  42. {
  43. size_t inc = n + ((word)s & 7);
  44. n = inc | -(inc < n);
  45. }
  46. /* Deal with misalignment in the first word for the comparison. */
  47. mask = (1ul << ((word)s & 7)) - 1;
  48. /* If the entire string fits within one word, we may need masking
  49. at both the front and the back of the string. */
  50. if (unlikely (n <= 8))
  51. {
  52. mask |= -1ul << n;
  53. goto last_quad;
  54. }
  55. found = find (current, c) & ~mask;
  56. if (unlikely (found))
  57. goto found_it;
  58. s_align++;
  59. n -= 8;
  60. /* If the block is sufficiently large, align to cacheline and prefetch. */
  61. if (unlikely (n >= 256))
  62. {
  63. /* Prefetch 3 cache lines beyond the one we're working on. */
  64. prefetch (s_align + 8);
  65. prefetch (s_align + 16);
  66. prefetch (s_align + 24);
  67. while ((word)s_align & 63)
  68. {
  69. current = *s_align;
  70. found = find (current, c);
  71. if (found)
  72. goto found_it;
  73. s_align++;
  74. n -= 8;
  75. }
  76. /* Within each cacheline, advance the load for the next word
  77. before the test for the previous word is complete. This
  78. allows us to hide the 3 cycle L1 cache load latency. We
  79. only perform this advance load within a cacheline to prevent
  80. reading across page boundary. */
  81. #define CACHELINE_LOOP \
  82. do { \
  83. word i, next = s_align[0]; \
  84. for (i = 0; i < 7; ++i) \
  85. { \
  86. current = next; \
  87. next = s_align[1]; \
  88. found = find (current, c); \
  89. if (unlikely (found)) \
  90. goto found_it; \
  91. s_align++; \
  92. } \
  93. current = next; \
  94. found = find (current, c); \
  95. if (unlikely (found)) \
  96. goto found_it; \
  97. s_align++; \
  98. n -= 64; \
  99. } while (0)
  100. /* While there's still lots more data to potentially be read,
  101. continue issuing prefetches for the 4th cacheline out. */
  102. while (n >= 256)
  103. {
  104. prefetch (s_align + 24);
  105. CACHELINE_LOOP;
  106. }
  107. /* Up to 3 cache lines remaining. Continue issuing advanced
  108. loads, but stop prefetching. */
  109. while (n >= 64)
  110. CACHELINE_LOOP;
  111. /* We may have exhausted the buffer. */
  112. if (n == 0)
  113. return NULL;
  114. }
  115. /* Quadword aligned loop. */
  116. current = *s_align;
  117. while (n > 8)
  118. {
  119. found = find (current, c);
  120. if (unlikely (found))
  121. goto found_it;
  122. current = *++s_align;
  123. n -= 8;
  124. }
  125. /* The last word may need masking at the tail of the compare. */
  126. mask = -1ul << n;
  127. last_quad:
  128. found = find (current, c) & ~mask;
  129. if (found == 0)
  130. return NULL;
  131. found_it:
  132. #ifdef __alpha_cix__
  133. offset = __builtin_alpha_cttz (found);
  134. #else
  135. /* Extract LSB. */
  136. found &= -found;
  137. /* Binary search for the LSB. */
  138. offset = (found & 0x0f ? 0 : 4);
  139. offset += (found & 0x33 ? 0 : 2);
  140. offset += (found & 0x55 ? 0 : 1);
  141. #endif
  142. return (void *)((word)s_align + offset);
  143. }
  144. #ifdef weak_alias
  145. weak_alias (__memchr, memchr)
  146. #endif
  147. libc_hidden_builtin_def (memchr)