memset.S 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. /* Copyright (C) 1996-2026 Free Software Foundation, Inc.
  2. This file is part of the GNU C Library.
  3. The GNU C Library is free software; you can redistribute it and/or
  4. modify it under the terms of the GNU Lesser General Public
  5. License as published by the Free Software Foundation; either
  6. version 2.1 of the License, or (at your option) any later version.
  7. The GNU C Library is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  10. Lesser General Public License for more details.
  11. You should have received a copy of the GNU Lesser General Public
  12. License along with the GNU C Library. If not, see
  13. <https://www.gnu.org/licenses/>. */
  14. /* Fill a block of memory with a character. Optimized for the Alpha
  15. architecture:
  16. - memory accessed as aligned quadwords only
  17. - destination memory not read unless needed for good cache behaviour
  18. - basic blocks arranged to optimize branch prediction for full-quadword
  19. aligned memory blocks.
  20. - partial head and tail quadwords constructed with byte-mask instructions
  21. This is generally scheduled for the EV5 (got to look out for my own
  22. interests :-), but with EV4 needs in mind. There *should* be no more
  23. stalls for the EV4 than there are for the EV5.
  24. */
  /* Assembler setup and symbol declaration for memset.
     The code that follows names AT explicitly (in the PROF prologue) and
     carries hand-written e0/e1 issue-slot annotations; that is presumably
     why noat and noreorder are selected here -- confirm against the GNU as
     Alpha documentation.  memset is declared as a global function symbol,
     and the CFI region opened here is closed by cfi_endproc after the
     final ret of the function.  */
  25. #include <sysdep.h>
  26. .set noat
  27. .set noreorder
  28. .text
  29. .type memset, @function
  30. .globl memset
  31. .usepv memset, USEPV_PROF
  32. cfi_startproc
  /* memset_loop: store whole quadwords of the replicated fill pattern,
     then finish any partial final quadword.  This code is reached only
     by branch from the memset entry point further down (it sits above
     that label, so nothing falls through into it) and it returns
     straight to the caller of memset with ret.  All stores use stq_u,
     in line with the file header note that memory is accessed as
     aligned quadwords only, which is why a0 may still carry a
     misalignment in its low bits here.  */
  33. /* On entry to this basic block:
  34. t3 == loop counter
  35. t4 == bytes in partial final word
  36. a0 == possibly misaligned destination pointer
  37. a1 == replicated source character */
  38. .align 3
  39. memset_loop:
  40. beq t3, $tail # no whole quadwords to store, go straight to the tail
  41. blbc t3, 0f # skip single store if count even
  /* Odd count: peel off one quadword so that the loop at 0: can always
     store two quadwords per iteration.  */
  42. stq_u a1, 0(a0) # e0 : store one word
  43. subq t3, 1, t3 # .. e1 : one quadword fewer to go
  44. addq a0, 8, a0 # e0 : advance destination by one quadword
  45. beq t3, $tail # .. e1 : that was the only whole quadword
  /* Main loop: t3 is even and nonzero whenever control reaches 0:.  */
  46. 0: stq_u a1, 0(a0) # e0 : store two words
  47. subq t3, 2, t3 # .. e1 : two quadwords fewer to go
  48. stq_u a1, 8(a0) # e0 : second quadword of the pair
  49. addq a0, 16, a0 # .. e1 : advance destination by two quadwords
  50. bne t3, 0b # e1 : loop until the counter reaches zero
  51. $tail: bne t4, 1f # is there a tail to do?
  52. ret # no
  53. .align 3
  /* Partial final quadword: combine the first t4 bytes of the fill
     pattern with the untouched remainder of the quadword already in
     memory, then write the merged quadword back.  */
  54. 1: ldq_u t0, 0(a0) # e1 : yes, load original data
  55. mskql a1, t4, t1 # .. e0 : t1 = fill pattern, bytes t4 and above cleared
  56. mskqh t0, t4, t0 # e0 : t0 = original data, bytes below t4 cleared
  57. or t0, t1, t0 # e1 (stall)
  58. stq_u t0, 0(a0) # e0 : write back the merged final quadword
  59. ret # .. e1 :
  /* memset entry point.
     In:   a0 = destination pointer
           a1 = fill character (only the low 8 bits are used)
           a2 = byte count
     Out:  v0 = the original destination pointer
     Uses: t0-t4 as scratch; a0, a1 and a2 are modified.
     The character is replicated into all eight bytes of a1 while,
     interleaved in the other issue slot, the count is split into a
     whole-quadword count (t3) and a tail byte count (t4).  The count is
     first increased by the destination misalignment, so t3 and t4 are
     measured from the start of the quadword containing the first
     destination byte.  A misaligned first quadword is merged with the
     existing memory contents here; everything else is handed to
     memset_loop.  */
  60. memset:
  61. #ifdef PROF
  /* Profiling build: set up gp from pv and call _mcount through AT.  */
  62. ldgp gp, 0(pv)
  63. lda AT, _mcount
  64. jsr AT, (AT), _mcount
  65. #endif
  66. and a1, 0xff, a1 # e0 : zero extend input character
  67. mov a0, v0 # .. e1 : move return value in place
  68. sll a1, 8, t0 # e0 : begin replicating the char
  69. beq a2, $done # .. e1 : early exit for zero-length store
  70. or t0, a1, a1 # e0 : a1 = char in its low 2 bytes
  71. and a0, 7, t1 # .. e1 : dest misalignment
  72. sll a1, 16, t0 # e0 :
  73. addq a2, t1, a2 # .. e1 : add dest misalignment to count
  74. or t0, a1, a1 # e0 : a1 = char in its low 4 bytes
  75. srl a2, 3, t3 # .. e1 : loop = count >> 3
  76. sll a1, 32, t0 # e0 :
  77. and a2, 7, t4 # .. e1 : find number of bytes in tail
  78. or t0, a1, a1 # e0 : character replication done
  79. beq t1, memset_loop # .. e1 : aligned head, jump right in
  /* Misaligned head.  t1 (the misalignment) is no longer needed after
     the branch above and is reused for the masked fill pattern.  */
  80. ldq_u t0, 0(a0) # e1 : load original data to mask into
  81. mskqh a1, a0, t1 # .. e0 : t1 = fill pattern, bytes below the start offset cleared
  /* a2 already includes the misalignment, so a2 < 8 means the last byte
     to set lies in the same quadword as the first one.  */
  82. cmpult a2, 8, t2 # e0 : is this a sub-word set?
  83. bne t2, $oneq # .. e1 (zdb)
  84. mskql t0, a0, t0 # e0 : we span words. finish this partial
  85. subq t3, 1, t3 # .. e1 : the head quadword is one of the counted quadwords
  86. addq a0, 8, a0 # e0 : step past the head quadword
  87. or t0, t1, t0 # .. e1 : original low bytes | fill for the rest
  88. stq_u t0, -8(a0) # e0 : store the head quadword (a0 was already advanced)
  89. br memset_loop # .. e1 : whole quadwords and tail are handled there
  90. .align 3
  /* $oneq: the whole fill lies inside one quadword.  Keep the original
     bytes below the start offset (low bits of a0) and at or above the
     end offset (a2, which is < 8 here); fill only the bytes between.  */
  91. $oneq:
  92. mskql t1, a2, t1 # e0 : entire operation within one word
  93. mskql t0, a0, t2 # e0 : t2 = original bytes below the start offset
  94. mskqh t0, a2, t3 # e0 : t3 = original bytes at and above the end offset
  95. or t1, t2, t0 # .. e1 :
  96. or t0, t3, t0 # e1 : t0 = merged quadword
  97. stq_u t0, 0(a0) # e0 (stall)
  98. $done: ret # v0 was set before the zero-length check, so it is valid on every path
  99. cfi_endproc
  /* libc_hidden_builtin_def is not defined in this file; presumably it
     comes from the glibc headers pulled in by sysdep.h and provides the
     hidden internal alias for memset -- confirm there.  */
  100. libc_hidden_builtin_def (memset)