mmap_lock.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570
  1. // SPDX-License-Identifier: GPL-2.0
  2. #define CREATE_TRACE_POINTS
  3. #include <trace/events/mmap_lock.h>
  4. #include <linux/mm.h>
  5. #include <linux/cgroup.h>
  6. #include <linux/memcontrol.h>
  7. #include <linux/mmap_lock.h>
  8. #include <linux/mutex.h>
  9. #include <linux/percpu.h>
  10. #include <linux/rcupdate.h>
  11. #include <linux/smp.h>
  12. #include <linux/trace_events.h>
  13. #include <linux/local_lock.h>
  14. EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
  15. EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
  16. EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
  17. #ifdef CONFIG_TRACING
  18. /*
  19. * Trace calls must be in a separate file, as otherwise there's a circular
  20. * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
  21. */
  /*
   * Out-of-line wrapper that fires the mmap_lock_start_locking tracepoint.
   * @mm and @write are handed to the tracepoint unchanged.
   */
  22. void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
  23. {
  24. trace_mmap_lock_start_locking(mm, write);
  25. }
  26. EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
  /*
   * Out-of-line wrapper that fires the mmap_lock_acquire_returned tracepoint.
   * @mm, @write and @success are handed to the tracepoint unchanged.
   */
  27. void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
  28. bool success)
  29. {
  30. trace_mmap_lock_acquire_returned(mm, write, success);
  31. }
  32. EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
  /*
   * Out-of-line wrapper that fires the mmap_lock_released tracepoint.
   * @mm and @write are handed to the tracepoint unchanged.
   */
  33. void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
  34. {
  35. trace_mmap_lock_released(mm, write);
  36. }
  37. EXPORT_SYMBOL(__mmap_lock_do_trace_released);
  38. #endif /* CONFIG_TRACING */
  39. #ifdef CONFIG_MMU
  40. #ifdef CONFIG_PER_VMA_LOCK
  41. /* State shared across __vma_[start, end]_exclude_readers. */
  42. struct vma_exclude_readers_state {
  43. /* Input parameters. */
  /* The VMA whose readers are being excluded. */
  44. struct vm_area_struct *vma;
  /* Sleep state passed to rcuwait_wait_event() while waiting for readers. */
  45. int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */
  /* When true, the reference count waited for has a base of 0 instead of 1. */
  46. bool detaching;
  47. /* Output parameters. */
  /*
   * Set when vm_refcnt was already zero on entry, or when dropping
   * VM_REFCNT_EXCLUDE_READERS_FLAG took it to zero.
   */
  48. bool detached;
  49. bool exclusive; /* Are we exclusively locked? */
  50. };
  51. /*
  52. * Now that all readers have been evicted, mark the VMA as being out of the
  53. * 'exclude readers' state.
  54. */
  55. static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves)
  56. {
  57. struct vm_area_struct *vma = ves->vma;
  /* This is not expected to run for a VMA already recorded as detached. */
  58. VM_WARN_ON_ONCE(ves->detached);
  /*
   * Remove VM_REFCNT_EXCLUDE_READERS_FLAG from vm_refcnt; ves->detached
   * records whether that brought the count down to zero.
   */
  59. ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
  60. &vma->vm_refcnt);
  61. __vma_lockdep_release_exclusive(vma);
  62. }
  63. static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves)
  64. {
  65. const unsigned int tgt = ves->detaching ? 0 : 1;
  66. return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG;
  67. }
  68. /*
  69. * Mark the VMA as being in a state of excluding readers, check to see if any
  70. * VMA read locks are indeed held, and if so wait for them to be released.
  71. *
  72. * Note that this function pairs with vma_refcount_put() which will wake up this
  73. * thread when it detects that the last reader has released its lock.
  74. *
  75. * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases
  76. * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal
  77. * signal is permitted to kill it.
  78. *
  79. * The function sets the ves->exclusive parameter to true if readers were
  80. * excluded, or false if the VMA was detached or an error arose on wait.
  81. *
  82. * If the function indicates an exclusive lock was acquired via ves->exclusive
  83. * the caller is required to invoke __vma_end_exclude_readers() once the
  84. * exclusive state is no longer required.
  85. *
  86. * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the
  87. * function may also return -EINTR to indicate a fatal signal was received while
  88. * waiting. Otherwise, the function returns 0.
  89. */
  /*
   * NOTE(review): ves->exclusive and ves->detached are only ever written to
   * true here (or by __vma_end_exclude_readers()); the "false" cases above
   * rely on the caller having zero-initialised the structure, as both
   * callers in this file do via designated initialisers.
   */
  90. static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves)
  91. {
  92. struct vm_area_struct *vma = ves->vma;
  93. unsigned int tgt_refcnt = get_target_refcnt(ves);
  94. int err = 0;
  95. mmap_assert_write_locked(vma->vm_mm);
  96. /*
  97. * If vma is detached then only vma_mark_attached() can raise the
  98. * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
  99. *
  100. * See the comment describing the vm_area_struct->vm_refcnt field for
  101. * details of possible refcnt values.
  102. */
  103. if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) {
  /* vm_refcnt was zero: report detached and succeed without waiting. */
  104. ves->detached = true;
  105. return 0;
  106. }
  107. __vma_lockdep_acquire_exclusive(vma);
  /* Sleep in ves->state until vm_refcnt equals the target value. */
  108. err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
  109. refcount_read(&vma->vm_refcnt) == tgt_refcnt,
  110. ves->state);
  111. if (err) {
  /* The wait failed: take the flag back off before returning the error. */
  112. __vma_end_exclude_readers(ves);
  113. return err;
  114. }
  115. __vma_lockdep_stat_mark_acquired(vma);
  116. ves->exclusive = true;
  117. return 0;
  118. }
  /*
   * Write-lock @vma: exclude its readers, then record the value obtained from
   * __vma_raw_mm_seqnum() in vma->vm_lock_seq.
   *
   * @state is used as the sleep state while waiting for readers.
   *
   * Returns 0 on success, or the error reported by
   * __vma_start_exclude_readers(), in which case vm_lock_seq is not written.
   */
  119. int __vma_start_write(struct vm_area_struct *vma, int state)
  120. {
  121. const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma);
  /* Unnamed fields (detaching, detached, exclusive) start out as false. */
  122. struct vma_exclude_readers_state ves = {
  123. .vma = vma,
  124. .state = state,
  125. };
  126. int err;
  127. err = __vma_start_exclude_readers(&ves);
  128. if (err) {
  /* A failed wait is not expected to leave the VMA detached. */
  129. WARN_ON_ONCE(ves.detached);
  130. return err;
  131. }
  132. /*
  133. * We should use WRITE_ONCE() here because we can have concurrent reads
  134. * from the early lockless pessimistic check in vma_start_read().
  135. * We don't really care about the correctness of that early check, but
  136. * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
  137. */
  138. WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
  /* Only leave the exclude-readers state if we actually entered it. */
  139. if (ves.exclusive) {
  140. __vma_end_exclude_readers(&ves);
  141. /* VMA should remain attached. */
  142. WARN_ON_ONCE(ves.detached);
  143. }
  144. return 0;
  145. }
  146. EXPORT_SYMBOL_GPL(__vma_start_write);
  /*
   * Exclude readers from @vma on the detach path: waits with
   * TASK_UNINTERRUPTIBLE and with ves.detaching set, then leaves the
   * exclude-readers state again. Warns once if the VMA is not recorded as
   * detached afterwards.
   */
  147. void __vma_exclude_readers_for_detach(struct vm_area_struct *vma)
  148. {
  149. struct vma_exclude_readers_state ves = {
  150. .vma = vma,
  151. .state = TASK_UNINTERRUPTIBLE,
  152. .detaching = true,
  153. };
  154. int err;
  155. /*
  156. * Wait until the VMA is detached with no readers. Since we hold the VMA
  157. * write lock, the only read locks that might be present are those from
  158. * threads trying to acquire the read lock and incrementing the
  159. * reference count before realising the write lock is held and
  160. * decrementing it.
  161. */
  162. err = __vma_start_exclude_readers(&ves);
  163. if (!err && ves.exclusive) {
  164. /*
  165. * Once this is complete, no readers can increment the
  166. * reference count, and the VMA is marked detached.
  167. */
  168. __vma_end_exclude_readers(&ves);
  169. }
  170. /* If an error arose but we were detached anyway, we don't care. */
  171. WARN_ON_ONCE(!ves.detached);
  172. }
  173. /*
  174. * Try to read-lock a vma. The function is allowed to occasionally yield false
  175. * locked result to avoid performance overhead, in which case we fall back to
  176. * using mmap_lock. The function should never yield false unlocked result.
  177. * False locked result is possible if mm_lock_seq overflows or if vma gets
  178. * reused and attached to a different mm before we lock it.
  179. * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN) if
  180. * vma got detached.
  181. *
  182. * IMPORTANT: RCU lock must be held upon entering the function, but upon error
  183. * IT IS RELEASED. The caller must handle this correctly.
  184. */
  185. static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
  186. struct vm_area_struct *vma)
  187. {
  188. struct mm_struct *other_mm;
  189. int oldcnt;
  190. RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
  191. /*
  192. * Check before locking. A race might cause false locked result.
  193. * We can use READ_ONCE() for the mm_lock_seq here, and don't need
  194. * ACQUIRE semantics, because this is just a lockless check whose result
  195. * we don't rely on for anything - the mm_lock_seq read against which we
  196. * need ordering is below.
  197. */
  198. if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
  199. vma = NULL;
  200. goto err;
  201. }
  202. /*
  203. * If VM_REFCNT_EXCLUDE_READERS_FLAG is set,
  204. * __refcount_inc_not_zero_limited_acquire() will fail because
  205. * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG.
  206. *
  207. * Acquire fence is required here to avoid reordering against later
  208. * vm_lock_seq check and checks inside lock_vma_under_rcu().
  209. */
  210. if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
  211. VM_REFCNT_LIMIT))) {
  212. /* return EAGAIN if vma got detached from under us */
  213. vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
  214. goto err;
  215. }
  /* A reference is now held; later failure paths drop it with vma_refcount_put(). */
  216. __vma_lockdep_acquire_read(vma);
  /* The VMA no longer belongs to the mm we were asked to look in. */
  217. if (unlikely(vma->vm_mm != mm))
  218. goto err_unstable;
  219. /*
  220. * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
  221. * False unlocked result is impossible because we modify and check
  222. * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
  223. * modification invalidates all existing locks.
  224. *
  225. * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
  226. * racing with vma_end_write_all(), we only start reading from the VMA
  227. * after it has been unlocked.
  228. * This pairs with RELEASE semantics in vma_end_write_all().
  229. */
  230. if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
  231. vma_refcount_put(vma);
  232. vma = NULL;
  233. goto err;
  234. }
  /* Success: the RCU read lock is still held on this return. */
  235. return vma;
  236. err:
  /* Failure exit: vma is NULL or ERR_PTR(-EAGAIN) and no reference is held. */
  237. rcu_read_unlock();
  238. return vma;
  239. err_unstable:
  240. /*
  241. * If vma got attached to another mm from under us, that mm is not
  242. * stable and can be freed in the narrow window after vma->vm_refcnt
  243. * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
  244. * releasing vma->vm_refcnt.
  245. */
  246. other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */
  247. /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
  248. rcu_read_unlock();
  249. mmgrab(other_mm);
  250. vma_refcount_put(vma);
  251. mmdrop(other_mm);
  252. return NULL;
  253. }
  254. /*
  255. * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
  256. * stable and not isolated. If the VMA is not found or is being modified the
  257. * function returns NULL.
  258. */
  /*
   * The function takes and releases the RCU read lock itself; every return
   * path below leaves it released. A NULL return also bumps the
   * VMA_LOCK_ABORT event counter.
   */
  259. struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
  260. unsigned long address)
  261. {
  262. MA_STATE(mas, &mm->mm_mt, address, address);
  263. struct vm_area_struct *vma;
  264. retry:
  265. rcu_read_lock();
  266. vma = mas_walk(&mas);
  267. if (!vma) {
  268. rcu_read_unlock();
  269. goto inval;
  270. }
  271. vma = vma_start_read(mm, vma);
  272. if (IS_ERR_OR_NULL(vma)) {
  /* vma_start_read() has already dropped the RCU read lock here. */
  273. /* Check if the VMA got isolated after we found it */
  274. if (PTR_ERR(vma) == -EAGAIN) {
  275. count_vm_vma_lock_event(VMA_LOCK_MISS);
  276. /* The area was replaced with another one */
  277. mas_set(&mas, address);
  278. goto retry;
  279. }
  280. /* Failed to lock the VMA */
  281. goto inval;
  282. }
  283. /*
  284. * At this point, we have a stable reference to a VMA: The VMA is
  285. * locked and we know it hasn't already been isolated.
  286. * From here on, we can access the VMA without worrying about which
  287. * fields are accessible for RCU readers.
  288. */
  289. rcu_read_unlock();
  290. /* Check if the vma we locked is the right one. */
  291. if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
  292. vma_end_read(vma);
  293. goto inval;
  294. }
  295. return vma;
  296. inval:
  297. count_vm_vma_lock_event(VMA_LOCK_ABORT);
  298. return NULL;
  299. }
  300. static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
  301. struct vma_iterator *vmi,
  302. unsigned long from_addr)
  303. {
  304. struct vm_area_struct *vma;
  305. int ret;
  306. ret = mmap_read_lock_killable(mm);
  307. if (ret)
  308. return ERR_PTR(ret);
  309. /* Lookup the vma at the last position again under mmap_read_lock */
  310. vma_iter_set(vmi, from_addr);
  311. vma = vma_next(vmi);
  312. if (vma) {
  313. /* Very unlikely vma->vm_refcnt overflow case */
  314. if (unlikely(!vma_start_read_locked(vma)))
  315. vma = ERR_PTR(-EAGAIN);
  316. }
  317. mmap_read_unlock(mm);
  318. return vma;
  319. }
  /*
   * Fetch the next VMA from @vmi and read-lock it, falling back to a lookup
   * under mmap_lock when the lockless attempt cannot be trusted.
   *
   * The RCU read lock must be held on entry and is held again on every return
   * path, although it may be dropped and retaken in between.
   *
   * Returns the read-locked VMA, NULL when the iterator has nothing further,
   * or whatever lock_next_vma_under_mmap_lock() returns on the fallback path.
   */
  320. struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
  321. struct vma_iterator *vmi,
  322. unsigned long from_addr)
  323. {
  324. struct vm_area_struct *vma;
  325. unsigned int mm_wr_seq;
  326. bool mmap_unlocked;
  327. RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
  328. retry:
  329. /* Start mmap_lock speculation in case we need to verify the vma later */
  330. mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
  331. vma = vma_next(vmi);
  332. if (!vma)
  333. return NULL;
  334. vma = vma_start_read(mm, vma);
  335. if (IS_ERR_OR_NULL(vma)) {
  /* vma_start_read() released the RCU read lock on this path. */
  336. /*
  337. * Retry immediately if the vma gets detached from under us.
  338. * Infinite loop should not happen because the vma we find will
  339. * have to be constantly knocked out from under us.
  340. */
  341. if (PTR_ERR(vma) == -EAGAIN) {
  342. /* reset to search from the last address */
  343. rcu_read_lock();
  344. vma_iter_set(vmi, from_addr);
  345. goto retry;
  346. }
  347. goto fallback;
  348. }
  349. /* Verify the vma is not behind the last search position. */
  350. if (unlikely(from_addr >= vma->vm_end))
  351. goto fallback_unlock;
  352. /*
  353. * vma can be ahead of the last search position but we need to verify
  354. * it was not shrunk after we found it and another vma has not been
  355. * installed ahead of it. Otherwise we might observe a gap that should
  356. * not be there.
  357. */
  358. if (from_addr < vma->vm_start) {
  359. /* Verify only if the address space might have changed since vma lookup. */
  360. if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
  361. vma_iter_set(vmi, from_addr);
  362. if (vma != vma_next(vmi))
  363. goto fallback_unlock;
  364. }
  365. }
  366. return vma;
  367. fallback_unlock:
  /* Drop both the RCU read lock and the VMA read lock before falling back. */
  368. rcu_read_unlock();
  369. vma_end_read(vma);
  370. fallback:
  /* Entered with the RCU read lock not held; it is retaken after the lookup. */
  371. vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
  372. rcu_read_lock();
  373. /* Reinitialize the iterator after re-entering rcu read section */
  374. vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);
  375. return vma;
  376. }
  377. #endif /* CONFIG_PER_VMA_LOCK */
  378. #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
  379. #include <linux/extable.h>
  /*
   * Acquire mmap_lock for reading without risking a block on an unexpected
   * kernel fault.
   *
   * A successful trylock wins immediately. Otherwise, when @regs is supplied
   * and does not describe user mode, the blocking acquisition is only
   * attempted if the faulting instruction pointer has an exception table
   * entry.
   *
   * Returns true with mmap_lock held for reading, false with it not held.
   */
  static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
  {
  	if (likely(mmap_read_trylock(mm)))
  		return true;

  	if (regs && !user_mode(regs) &&
  	    !search_exception_tables(exception_ip(regs)))
  		return false;

  	return mmap_read_lock_killable(mm) == 0;
  }
  /*
   * Placeholder for an atomic read-to-write upgrade of mmap_lock. It always
   * returns false, so lock_mm_and_find_vma() always goes through
   * upgrade_mmap_lock_carefully() instead.
   */
  391. static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
  392. {
  393. /*
  394. * We don't have this operation yet.
  395. *
  396. * It should be easy enough to do: it's basically an
  397. * atomic_long_try_cmpxchg_acquire()
  398. * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
  399. * it also needs the proper lockdep magic etc.
  400. */
  401. return false;
  402. }
  /*
   * Swap a held mmap_lock read lock for the write lock by dropping the former
   * and then acquiring the latter (killable).
   *
   * The read lock is released unconditionally. When @regs is supplied and
   * does not describe user mode, the write lock is only attempted if the
   * faulting instruction pointer has an exception table entry.
   *
   * Returns true with mmap_lock held for writing, false with it not held.
   */
  static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
  {
  	mmap_read_unlock(mm);

  	if (regs && !user_mode(regs) &&
  	    !search_exception_tables(exception_ip(regs)))
  		return false;

  	return mmap_write_lock_killable(mm) == 0;
  }
  413. /*
  414. * Helper for page fault handling.
  415. *
  416. * This is kind of equivalent to "mmap_read_lock()" followed
  417. * by "find_extend_vma()", except it's a lot more careful about
  418. * the locking (and will drop the lock on failure).
  419. *
  420. * For example, if we have a kernel bug that causes a page
  421. * fault, we don't want to just use mmap_read_lock() to get
  422. * the mm lock, because that would deadlock if the bug were
  423. * to happen while we're holding the mm lock for writing.
  424. *
  425. * So this checks the exception tables on kernel faults in
  426. * order to only do this all for instructions that are actually
  427. * expected to fault.
  428. *
  429. * We can also actually take the mm lock for writing if we
  430. * need to extend the vma, which helps the VM layer a lot.
  431. */
  /*
   * Returns the VMA with mmap_lock held for reading, or NULL with mmap_lock
   * not held.
   */
  432. struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
  433. unsigned long addr, struct pt_regs *regs)
  434. {
  435. struct vm_area_struct *vma;
  436. if (!get_mmap_lock_carefully(mm, regs))
  437. return NULL;
  438. vma = find_vma(mm, addr);
  /* Common case: the VMA found already starts at or below addr. */
  439. if (likely(vma && (vma->vm_start <= addr)))
  440. return vma;
  441. /*
  442. * Well, dang. We might still be successful, but only
  443. * if we can extend a vma to do so.
  444. */
  445. if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
  446. mmap_read_unlock(mm);
  447. return NULL;
  448. }
  449. /*
  450. * We can try to upgrade the mmap lock atomically,
  451. * in which case we can continue to use the vma
  452. * we already looked up.
  453. *
  454. * Otherwise we'll have to drop the mmap lock and
  455. * re-take it, and also look up the vma again,
  456. * re-checking it.
  457. */
  458. if (!mmap_upgrade_trylock(mm)) {
  /* On failure the read lock is already gone and no write lock is held. */
  459. if (!upgrade_mmap_lock_carefully(mm, regs))
  460. return NULL;
  461. vma = find_vma(mm, addr);
  462. if (!vma)
  463. goto fail;
  464. if (vma->vm_start <= addr)
  465. goto success;
  466. if (!(vma->vm_flags & VM_GROWSDOWN))
  467. goto fail;
  468. }
  469. if (expand_stack_locked(vma, addr))
  470. goto fail;
  471. success:
  /* Reached with mmap_lock held for writing: downgrade it to a read lock. */
  472. mmap_write_downgrade(mm);
  473. return vma;
  474. fail:
  /* Reached with mmap_lock held for writing: release it entirely. */
  475. mmap_write_unlock(mm);
  476. return NULL;
  477. }
  478. #endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
  479. #else /* CONFIG_MMU */
  480. /*
  481. * At least xtensa ends up having protection faults even with no
  482. * MMU.. No stack expansion, at least.
  483. */
  484. struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
  485. unsigned long addr, struct pt_regs *regs)
  486. {
  487. struct vm_area_struct *vma;
  488. mmap_read_lock(mm);
  489. vma = vma_lookup(mm, addr);
  490. if (!vma)
  491. mmap_read_unlock(mm);
  492. return vma;
  493. }
  494. #endif /* CONFIG_MMU */