rqspinlock.h 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. /* SPDX-License-Identifier: GPL-2.0-or-later */
  2. /*
  3. * Resilient Queued Spin Lock
  4. *
  5. * (C) Copyright 2024-2025 Meta Platforms, Inc. and affiliates.
  6. *
  7. * Authors: Kumar Kartikeya Dwivedi <memxor@gmail.com>
  8. */
  9. #ifndef __ASM_GENERIC_RQSPINLOCK_H
  10. #define __ASM_GENERIC_RQSPINLOCK_H
  11. #include <linux/types.h>
  12. #include <vdso/time64.h>
  13. #include <linux/percpu.h>
  14. #ifdef CONFIG_QUEUED_SPINLOCKS
  15. #include <asm/qspinlock.h>
  16. #endif
  17. struct rqspinlock {
  18. union {
  19. atomic_t val;
  20. u32 locked;
  21. };
  22. };
  23. /* Even though this is same as struct rqspinlock, we need to emit a distinct
  24. * type in BTF for BPF programs.
  25. */
  26. struct bpf_res_spin_lock {
  27. u32 val;
  28. } __aligned(__alignof__(struct rqspinlock));
  /* Only a forward declaration is needed for the typedef below. */
  struct qspinlock;

  /*
   * rqspinlock_t is the lock type taken by the res_spin_* helpers in this
   * header: struct qspinlock when CONFIG_QUEUED_SPINLOCKS is enabled,
   * struct rqspinlock otherwise.
   */
  #ifdef CONFIG_QUEUED_SPINLOCKS
  typedef struct qspinlock rqspinlock_t;
  #else
  typedef struct rqspinlock rqspinlock_t;
  #endif
  35. extern int resilient_tas_spin_lock(rqspinlock_t *lock);
  36. #ifdef CONFIG_QUEUED_SPINLOCKS
  37. extern int resilient_queued_spin_lock_slowpath(rqspinlock_t *lock, u32 val);
  38. #endif
  39. #ifndef resilient_virt_spin_lock_enabled
  40. static __always_inline bool resilient_virt_spin_lock_enabled(void)
  41. {
  42. return false;
  43. }
  44. #endif
  45. #ifndef resilient_virt_spin_lock
  46. static __always_inline int resilient_virt_spin_lock(rqspinlock_t *lock)
  47. {
  48. return 0;
  49. }
  50. #endif
  /* Waiting loops time out after 0.25 seconds by default. */
  #define RES_DEF_TIMEOUT (NSEC_PER_SEC / 4)

  /*
   * Number of slots in the locks[] array of struct rqspinlock_held.  31 is
   * chosen as it makes rqspinlock_held cacheline-aligned.
   */
  #define RES_NR_HELD 31
  59. struct rqspinlock_held {
  60. int cnt;
  61. void *locks[RES_NR_HELD];
  62. };
  63. DECLARE_PER_CPU_ALIGNED(struct rqspinlock_held, rqspinlock_held_locks);
  64. static __always_inline void grab_held_lock_entry(void *lock)
  65. {
  66. int cnt = this_cpu_inc_return(rqspinlock_held_locks.cnt);
  67. if (unlikely(cnt > RES_NR_HELD)) {
  68. /* Still keep the inc so we decrement later. */
  69. return;
  70. }
  71. /*
  72. * Implied compiler barrier in per-CPU operations; otherwise we can have
  73. * the compiler reorder inc with write to table, allowing interrupts to
  74. * overwrite and erase our write to the table (as on interrupt exit it
  75. * will be reset to NULL).
  76. *
  77. * It is fine for cnt inc to be reordered wrt remote readers though,
  78. * they won't observe our entry until the cnt update is visible, that's
  79. * all.
  80. */
  81. this_cpu_write(rqspinlock_held_locks.locks[cnt - 1], lock);
  82. }
  83. /*
  84. * We simply don't support out-of-order unlocks, and keep the logic simple here.
  85. * The verifier prevents BPF programs from unlocking out-of-order, and the same
  86. * holds for in-kernel users.
  87. *
  88. * It is possible to run into misdetection scenarios of AA deadlocks on the same
  89. * CPU, and missed ABBA deadlocks on remote CPUs if this function pops entries
  90. * out of order (due to lock A, lock B, unlock A, unlock B) pattern. The correct
  91. * logic to preserve right entries in the table would be to walk the array of
  92. * held locks and swap and clear out-of-order entries, but that's too
  93. * complicated and we don't have a compelling use case for out of order unlocking.
  94. */
  95. static __always_inline void release_held_lock_entry(void)
  96. {
  97. struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
  98. if (unlikely(rqh->cnt > RES_NR_HELD))
  99. goto dec;
  100. WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
  101. dec:
  102. /*
  103. * Reordering of clearing above with inc and its write in
  104. * grab_held_lock_entry that came before us (in same acquisition
  105. * attempt) is ok, we either see a valid entry or NULL when it's
  106. * visible.
  107. *
  108. * But this helper is invoked when we unwind upon failing to acquire the
  109. * lock. Unlike the unlock path which constitutes a release store after
  110. * we clear the entry, we need to emit a write barrier here. Otherwise,
  111. * we may have a situation as follows:
  112. *
  113. * <error> for lock B
  114. * release_held_lock_entry
  115. *
  116. * grab_held_lock_entry
  117. * try_cmpxchg_acquire for lock A
  118. *
  119. * Lack of any ordering means reordering may occur such that dec, inc
  120. * are done before entry is overwritten. This permits a remote lock
  121. * holder of lock B (which this CPU failed to acquire) to now observe it
  122. * as being attempted on this CPU, and may lead to misdetection (if this
  123. * CPU holds a lock it is attempting to acquire, leading to false ABBA
  124. * diagnosis).
  125. *
  126. * The case of unlock is treated differently due to NMI reentrancy, see
  127. * comments in res_spin_unlock.
  128. *
  129. * In theory we don't have a problem if the dec and WRITE_ONCE above get
  130. * reordered with each other, we either notice an empty NULL entry on
  131. * top (if dec succeeds WRITE_ONCE), or a potentially stale entry which
  132. * cannot be observed (if dec precedes WRITE_ONCE).
  133. *
  134. * Emit the write barrier _before_ the dec, this permits dec-inc
  135. * reordering but that is harmless as we'd have new entry set to NULL
  136. * already, i.e. they cannot precede the NULL store above.
  137. */
  138. smp_wmb();
  139. this_cpu_dec(rqspinlock_held_locks.cnt);
  140. }
  141. #ifdef CONFIG_QUEUED_SPINLOCKS
  142. /**
  143. * res_spin_lock - acquire a queued spinlock
  144. * @lock: Pointer to queued spinlock structure
  145. *
  146. * Return:
  147. * * 0 - Lock was acquired successfully.
  148. * * -EDEADLK - Lock acquisition failed because of AA/ABBA deadlock.
  149. * * -ETIMEDOUT - Lock acquisition failed because of timeout.
  150. */
  151. static __always_inline int res_spin_lock(rqspinlock_t *lock)
  152. {
  153. int val = 0;
  154. /*
  155. * Grab the deadlock detection entry before doing the cmpxchg, so that
  156. * reentrancy due to NMIs between the succeeding cmpxchg and creation of
  157. * held lock entry can correctly detect an acquisition attempt in the
  158. * interrupted context.
  159. *
  160. * cmpxchg lock A
  161. * <NMI>
  162. * res_spin_lock(A) --> missed AA, leads to timeout
  163. * </NMI>
  164. * grab_held_lock_entry(A)
  165. */
  166. grab_held_lock_entry(lock);
  167. if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)))
  168. return 0;
  169. return resilient_queued_spin_lock_slowpath(lock, val);
  170. }
  171. #else
  172. #define res_spin_lock(lock) ({ grab_held_lock_entry(lock); resilient_tas_spin_lock(lock); })
  173. #endif /* CONFIG_QUEUED_SPINLOCKS */
  174. static __always_inline void res_spin_unlock(rqspinlock_t *lock)
  175. {
  176. struct rqspinlock_held *rqh = this_cpu_ptr(&rqspinlock_held_locks);
  177. /*
  178. * Release barrier, ensures correct ordering. Perform release store
  179. * instead of queued_spin_unlock, since we use this function for the TAS
  180. * fallback as well. When we have CONFIG_QUEUED_SPINLOCKS=n, we clear
  181. * the full 4-byte lockword.
  182. *
  183. * Perform the smp_store_release before clearing the lock entry so that
  184. * NMIs landing in the unlock path can correctly detect AA issues. The
  185. * opposite order shown below may lead to missed AA checks:
  186. *
  187. * WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL)
  188. * <NMI>
  189. * res_spin_lock(A) --> missed AA, leads to timeout
  190. * </NMI>
  191. * smp_store_release(A->locked, 0)
  192. */
  193. smp_store_release(&lock->locked, 0);
  194. if (likely(rqh->cnt <= RES_NR_HELD))
  195. WRITE_ONCE(rqh->locks[rqh->cnt - 1], NULL);
  196. this_cpu_dec(rqspinlock_held_locks.cnt);
  197. }
  /*
   * Initialize *(lock): it is assigned __ARCH_SPIN_LOCK_UNLOCKED when
   * CONFIG_QUEUED_SPINLOCKS is enabled, and a zero-initialized rqspinlock_t
   * otherwise.
   */
  #ifdef CONFIG_QUEUED_SPINLOCKS
  #define raw_res_spin_lock_init(lock) \
      ({ *(lock) = (rqspinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; })
  #else
  #define raw_res_spin_lock_init(lock) \
      ({ *(lock) = (rqspinlock_t){0}; })
  #endif
  /*
   * Disable preemption and attempt res_spin_lock(lock).  Evaluates to the
   * res_spin_lock() result; when that result is non-zero preemption is
   * re-enabled before the value is yielded, otherwise it stays disabled.
   */
  #define raw_res_spin_lock(lock)                 \
      ({                                          \
          int __res_ret;                          \
          preempt_disable();                      \
          __res_ret = res_spin_lock(lock);        \
          if (__res_ret)                          \
              preempt_enable();                   \
          __res_ret;                              \
      })
  /* Release the lock with res_spin_unlock(), then re-enable preemption. */
  #define raw_res_spin_unlock(lock)               \
      ({                                          \
          res_spin_unlock(lock);                  \
          preempt_enable();                       \
      })
  /*
   * Save and disable local interrupts into @flags, then attempt
   * raw_res_spin_lock(lock).  Evaluates to that result; when it is non-zero
   * the saved interrupt state is restored before the value is yielded.
   */
  #define raw_res_spin_lock_irqsave(lock, flags)  \
      ({                                          \
          int __irq_ret;                          \
          local_irq_save(flags);                  \
          __irq_ret = raw_res_spin_lock(lock);    \
          if (__irq_ret)                          \
              local_irq_restore(flags);           \
          __irq_ret;                              \
      })
  /*
   * Release the lock with raw_res_spin_unlock(), then restore the interrupt
   * state saved in @flags.
   */
  #define raw_res_spin_unlock_irqrestore(lock, flags) \
      ({                                              \
          raw_res_spin_unlock(lock);                  \
          local_irq_restore(flags);                   \
      })
  223. #endif /* __ASM_GENERIC_RQSPINLOCK_H */