| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570 |
- // SPDX-License-Identifier: GPL-2.0
- #define CREATE_TRACE_POINTS
- #include <trace/events/mmap_lock.h>
- #include <linux/mm.h>
- #include <linux/cgroup.h>
- #include <linux/memcontrol.h>
- #include <linux/mmap_lock.h>
- #include <linux/mutex.h>
- #include <linux/percpu.h>
- #include <linux/rcupdate.h>
- #include <linux/smp.h>
- #include <linux/trace_events.h>
- #include <linux/local_lock.h>
/* Export the three mmap_lock tracepoint symbols. */
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
- #ifdef CONFIG_TRACING
- /*
- * Trace calls must be in a separate file, as otherwise there's a circular
- * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
- */
- void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
- {
- trace_mmap_lock_start_locking(mm, write);
- }
- EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
- void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
- bool success)
- {
- trace_mmap_lock_acquire_returned(mm, write, success);
- }
- EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
- void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
- {
- trace_mmap_lock_released(mm, write);
- }
- EXPORT_SYMBOL(__mmap_lock_do_trace_released);
- #endif /* CONFIG_TRACING */
- #ifdef CONFIG_MMU
- #ifdef CONFIG_PER_VMA_LOCK
- /* State shared across __vma_[start, end]_exclude_readers. */
- struct vma_exclude_readers_state {
- /* Input parameters. */
- struct vm_area_struct *vma;
- int state; /* TASK_KILLABLE or TASK_UNINTERRUPTIBLE. */
- bool detaching;
- /* Output parameters. */
- bool detached;
- bool exclusive; /* Are we exclusively locked? */
- };
- /*
- * Now that all readers have been evicted, mark the VMA as being out of the
- * 'exclude readers' state.
- */
- static void __vma_end_exclude_readers(struct vma_exclude_readers_state *ves)
- {
- struct vm_area_struct *vma = ves->vma;
- VM_WARN_ON_ONCE(ves->detached);
- ves->detached = refcount_sub_and_test(VM_REFCNT_EXCLUDE_READERS_FLAG,
- &vma->vm_refcnt);
- __vma_lockdep_release_exclusive(vma);
- }
- static unsigned int get_target_refcnt(struct vma_exclude_readers_state *ves)
- {
- const unsigned int tgt = ves->detaching ? 0 : 1;
- return tgt | VM_REFCNT_EXCLUDE_READERS_FLAG;
- }
- /*
- * Mark the VMA as being in a state of excluding readers, check to see if any
- * VMA read locks are indeed held, and if so wait for them to be released.
- *
- * Note that this function pairs with vma_refcount_put() which will wake up this
- * thread when it detects that the last reader has released its lock.
- *
- * The ves->state parameter ought to be set to TASK_UNINTERRUPTIBLE in cases
- * where we wish the thread to sleep uninterruptibly or TASK_KILLABLE if a fatal
- * signal is permitted to kill it.
- *
- * The function sets the ves->exclusive parameter to true if readers were
- * excluded, or false if the VMA was detached or an error arose on wait.
- *
- * If the function indicates an exclusive lock was acquired via ves->exclusive
- * the caller is required to invoke __vma_end_exclude_readers() once the
- * exclusive state is no longer required.
- *
- * If ves->state is set to something other than TASK_UNINTERRUPTIBLE, the
- * function may also return -EINTR to indicate a fatal signal was received while
- * waiting. Otherwise, the function returns 0.
- */
- static int __vma_start_exclude_readers(struct vma_exclude_readers_state *ves)
- {
- struct vm_area_struct *vma = ves->vma;
- unsigned int tgt_refcnt = get_target_refcnt(ves);
- int err = 0;
- mmap_assert_write_locked(vma->vm_mm);
- /*
- * If vma is detached then only vma_mark_attached() can raise the
- * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached().
- *
- * See the comment describing the vm_area_struct->vm_refcnt field for
- * details of possible refcnt values.
- */
- if (!refcount_add_not_zero(VM_REFCNT_EXCLUDE_READERS_FLAG, &vma->vm_refcnt)) {
- ves->detached = true;
- return 0;
- }
- __vma_lockdep_acquire_exclusive(vma);
- err = rcuwait_wait_event(&vma->vm_mm->vma_writer_wait,
- refcount_read(&vma->vm_refcnt) == tgt_refcnt,
- ves->state);
- if (err) {
- __vma_end_exclude_readers(ves);
- return err;
- }
- __vma_lockdep_stat_mark_acquired(vma);
- ves->exclusive = true;
- return 0;
- }
- int __vma_start_write(struct vm_area_struct *vma, int state)
- {
- const unsigned int mm_lock_seq = __vma_raw_mm_seqnum(vma);
- struct vma_exclude_readers_state ves = {
- .vma = vma,
- .state = state,
- };
- int err;
- err = __vma_start_exclude_readers(&ves);
- if (err) {
- WARN_ON_ONCE(ves.detached);
- return err;
- }
- /*
- * We should use WRITE_ONCE() here because we can have concurrent reads
- * from the early lockless pessimistic check in vma_start_read().
- * We don't really care about the correctness of that early check, but
- * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
- */
- WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
- if (ves.exclusive) {
- __vma_end_exclude_readers(&ves);
- /* VMA should remain attached. */
- WARN_ON_ONCE(ves.detached);
- }
- return 0;
- }
- EXPORT_SYMBOL_GPL(__vma_start_write);
- void __vma_exclude_readers_for_detach(struct vm_area_struct *vma)
- {
- struct vma_exclude_readers_state ves = {
- .vma = vma,
- .state = TASK_UNINTERRUPTIBLE,
- .detaching = true,
- };
- int err;
- /*
- * Wait until the VMA is detached with no readers. Since we hold the VMA
- * write lock, the only read locks that might be present are those from
- * threads trying to acquire the read lock and incrementing the
- * reference count before realising the write lock is held and
- * decrementing it.
- */
- err = __vma_start_exclude_readers(&ves);
- if (!err && ves.exclusive) {
- /*
- * Once this is complete, no readers can increment the
- * reference count, and the VMA is marked detached.
- */
- __vma_end_exclude_readers(&ves);
- }
- /* If an error arose but we were detached anyway, we don't care. */
- WARN_ON_ONCE(!ves.detached);
- }
- /*
- * Try to read-lock a vma. The function is allowed to occasionally yield false
- * locked result to avoid performance overhead, in which case we fall back to
- * using mmap_lock. The function should never yield false unlocked result.
- * False locked result is possible if mm_lock_seq overflows or if vma gets
- * reused and attached to a different mm before we lock it.
- * Returns the vma on success, NULL on failure to lock and EAGAIN if vma got
- * detached.
- *
- * IMPORTANT: RCU lock must be held upon entering the function, but upon error
- * IT IS RELEASED. The caller must handle this correctly.
- */
- static inline struct vm_area_struct *vma_start_read(struct mm_struct *mm,
- struct vm_area_struct *vma)
- {
- struct mm_struct *other_mm;
- int oldcnt;
- RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu lock held");
- /*
- * Check before locking. A race might cause false locked result.
- * We can use READ_ONCE() for the mm_lock_seq here, and don't need
- * ACQUIRE semantics, because this is just a lockless check whose result
- * we don't rely on for anything - the mm_lock_seq read against which we
- * need ordering is below.
- */
- if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(mm->mm_lock_seq.sequence)) {
- vma = NULL;
- goto err;
- }
- /*
- * If VM_REFCNT_EXCLUDE_READERS_FLAG is set,
- * __refcount_inc_not_zero_limited_acquire() will fail because
- * VM_REFCNT_LIMIT is less than VM_REFCNT_EXCLUDE_READERS_FLAG.
- *
- * Acquire fence is required here to avoid reordering against later
- * vm_lock_seq check and checks inside lock_vma_under_rcu().
- */
- if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt,
- VM_REFCNT_LIMIT))) {
- /* return EAGAIN if vma got detached from under us */
- vma = oldcnt ? NULL : ERR_PTR(-EAGAIN);
- goto err;
- }
- __vma_lockdep_acquire_read(vma);
- if (unlikely(vma->vm_mm != mm))
- goto err_unstable;
- /*
- * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result.
- * False unlocked result is impossible because we modify and check
- * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq
- * modification invalidates all existing locks.
- *
- * We must use ACQUIRE semantics for the mm_lock_seq so that if we are
- * racing with vma_end_write_all(), we only start reading from the VMA
- * after it has been unlocked.
- * This pairs with RELEASE semantics in vma_end_write_all().
- */
- if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&mm->mm_lock_seq))) {
- vma_refcount_put(vma);
- vma = NULL;
- goto err;
- }
- return vma;
- err:
- rcu_read_unlock();
- return vma;
- err_unstable:
- /*
- * If vma got attached to another mm from under us, that mm is not
- * stable and can be freed in the narrow window after vma->vm_refcnt
- * is dropped and before rcuwait_wake_up(mm) is called. Grab it before
- * releasing vma->vm_refcnt.
- */
- other_mm = vma->vm_mm; /* use a copy as vma can be freed after we drop vm_refcnt */
- /* __mmdrop() is a heavy operation, do it after dropping RCU lock. */
- rcu_read_unlock();
- mmgrab(other_mm);
- vma_refcount_put(vma);
- mmdrop(other_mm);
- return NULL;
- }
- /*
- * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be
- * stable and not isolated. If the VMA is not found or is being modified the
- * function returns NULL.
- */
- struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
- unsigned long address)
- {
- MA_STATE(mas, &mm->mm_mt, address, address);
- struct vm_area_struct *vma;
- retry:
- rcu_read_lock();
- vma = mas_walk(&mas);
- if (!vma) {
- rcu_read_unlock();
- goto inval;
- }
- vma = vma_start_read(mm, vma);
- if (IS_ERR_OR_NULL(vma)) {
- /* Check if the VMA got isolated after we found it */
- if (PTR_ERR(vma) == -EAGAIN) {
- count_vm_vma_lock_event(VMA_LOCK_MISS);
- /* The area was replaced with another one */
- mas_set(&mas, address);
- goto retry;
- }
- /* Failed to lock the VMA */
- goto inval;
- }
- /*
- * At this point, we have a stable reference to a VMA: The VMA is
- * locked and we know it hasn't already been isolated.
- * From here on, we can access the VMA without worrying about which
- * fields are accessible for RCU readers.
- */
- rcu_read_unlock();
- /* Check if the vma we locked is the right one. */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
- vma_end_read(vma);
- goto inval;
- }
- return vma;
- inval:
- count_vm_vma_lock_event(VMA_LOCK_ABORT);
- return NULL;
- }
- static struct vm_area_struct *lock_next_vma_under_mmap_lock(struct mm_struct *mm,
- struct vma_iterator *vmi,
- unsigned long from_addr)
- {
- struct vm_area_struct *vma;
- int ret;
- ret = mmap_read_lock_killable(mm);
- if (ret)
- return ERR_PTR(ret);
- /* Lookup the vma at the last position again under mmap_read_lock */
- vma_iter_set(vmi, from_addr);
- vma = vma_next(vmi);
- if (vma) {
- /* Very unlikely vma->vm_refcnt overflow case */
- if (unlikely(!vma_start_read_locked(vma)))
- vma = ERR_PTR(-EAGAIN);
- }
- mmap_read_unlock(mm);
- return vma;
- }
- struct vm_area_struct *lock_next_vma(struct mm_struct *mm,
- struct vma_iterator *vmi,
- unsigned long from_addr)
- {
- struct vm_area_struct *vma;
- unsigned int mm_wr_seq;
- bool mmap_unlocked;
- RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "no rcu read lock held");
- retry:
- /* Start mmap_lock speculation in case we need to verify the vma later */
- mmap_unlocked = mmap_lock_speculate_try_begin(mm, &mm_wr_seq);
- vma = vma_next(vmi);
- if (!vma)
- return NULL;
- vma = vma_start_read(mm, vma);
- if (IS_ERR_OR_NULL(vma)) {
- /*
- * Retry immediately if the vma gets detached from under us.
- * Infinite loop should not happen because the vma we find will
- * have to be constantly knocked out from under us.
- */
- if (PTR_ERR(vma) == -EAGAIN) {
- /* reset to search from the last address */
- rcu_read_lock();
- vma_iter_set(vmi, from_addr);
- goto retry;
- }
- goto fallback;
- }
- /* Verify the vma is not behind the last search position. */
- if (unlikely(from_addr >= vma->vm_end))
- goto fallback_unlock;
- /*
- * vma can be ahead of the last search position but we need to verify
- * it was not shrunk after we found it and another vma has not been
- * installed ahead of it. Otherwise we might observe a gap that should
- * not be there.
- */
- if (from_addr < vma->vm_start) {
- /* Verify only if the address space might have changed since vma lookup. */
- if (!mmap_unlocked || mmap_lock_speculate_retry(mm, mm_wr_seq)) {
- vma_iter_set(vmi, from_addr);
- if (vma != vma_next(vmi))
- goto fallback_unlock;
- }
- }
- return vma;
- fallback_unlock:
- rcu_read_unlock();
- vma_end_read(vma);
- fallback:
- vma = lock_next_vma_under_mmap_lock(mm, vmi, from_addr);
- rcu_read_lock();
- /* Reinitialize the iterator after re-entering rcu read section */
- vma_iter_set(vmi, IS_ERR_OR_NULL(vma) ? from_addr : vma->vm_end);
- return vma;
- }
- #endif /* CONFIG_PER_VMA_LOCK */
- #ifdef CONFIG_LOCK_MM_AND_FIND_VMA
- #include <linux/extable.h>
- static inline bool get_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
- {
- if (likely(mmap_read_trylock(mm)))
- return true;
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
- return !mmap_read_lock_killable(mm);
- }
- static inline bool mmap_upgrade_trylock(struct mm_struct *mm)
- {
- /*
- * We don't have this operation yet.
- *
- * It should be easy enough to do: it's basically a
- * atomic_long_try_cmpxchg_acquire()
- * from RWSEM_READER_BIAS -> RWSEM_WRITER_LOCKED, but
- * it also needs the proper lockdep magic etc.
- */
- return false;
- }
- static inline bool upgrade_mmap_lock_carefully(struct mm_struct *mm, struct pt_regs *regs)
- {
- mmap_read_unlock(mm);
- if (regs && !user_mode(regs)) {
- unsigned long ip = exception_ip(regs);
- if (!search_exception_tables(ip))
- return false;
- }
- return !mmap_write_lock_killable(mm);
- }
- /*
- * Helper for page fault handling.
- *
- * This is kind of equivalent to "mmap_read_lock()" followed
- * by "find_extend_vma()", except it's a lot more careful about
- * the locking (and will drop the lock on failure).
- *
- * For example, if we have a kernel bug that causes a page
- * fault, we don't want to just use mmap_read_lock() to get
- * the mm lock, because that would deadlock if the bug were
- * to happen while we're holding the mm lock for writing.
- *
- * So this checks the exception tables on kernel faults in
- * order to only do this all for instructions that are actually
- * expected to fault.
- *
- * We can also actually take the mm lock for writing if we
- * need to extend the vma, which helps the VM layer a lot.
- */
- struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
- {
- struct vm_area_struct *vma;
- if (!get_mmap_lock_carefully(mm, regs))
- return NULL;
- vma = find_vma(mm, addr);
- if (likely(vma && (vma->vm_start <= addr)))
- return vma;
- /*
- * Well, dang. We might still be successful, but only
- * if we can extend a vma to do so.
- */
- if (!vma || !(vma->vm_flags & VM_GROWSDOWN)) {
- mmap_read_unlock(mm);
- return NULL;
- }
- /*
- * We can try to upgrade the mmap lock atomically,
- * in which case we can continue to use the vma
- * we already looked up.
- *
- * Otherwise we'll have to drop the mmap lock and
- * re-take it, and also look up the vma again,
- * re-checking it.
- */
- if (!mmap_upgrade_trylock(mm)) {
- if (!upgrade_mmap_lock_carefully(mm, regs))
- return NULL;
- vma = find_vma(mm, addr);
- if (!vma)
- goto fail;
- if (vma->vm_start <= addr)
- goto success;
- if (!(vma->vm_flags & VM_GROWSDOWN))
- goto fail;
- }
- if (expand_stack_locked(vma, addr))
- goto fail;
- success:
- mmap_write_downgrade(mm);
- return vma;
- fail:
- mmap_write_unlock(mm);
- return NULL;
- }
- #endif /* CONFIG_LOCK_MM_AND_FIND_VMA */
- #else /* CONFIG_MMU */
- /*
- * At least xtensa ends up having protection faults even with no
- * MMU.. No stack expansion, at least.
- */
- struct vm_area_struct *lock_mm_and_find_vma(struct mm_struct *mm,
- unsigned long addr, struct pt_regs *regs)
- {
- struct vm_area_struct *vma;
- mmap_read_lock(mm);
- vma = vma_lookup(mm, addr);
- if (!vma)
- mmap_read_unlock(mm);
- return vma;
- }
- #endif /* CONFIG_MMU */
|