mremap.c 54 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * mm/mremap.c
  4. *
  5. * (C) Copyright 1996 Linus Torvalds
  6. *
  7. * Address space accounting code <alan@lxorguk.ukuu.org.uk>
  8. * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
  9. */
  10. #include <linux/mm.h>
  11. #include <linux/mm_inline.h>
  12. #include <linux/hugetlb.h>
  13. #include <linux/shm.h>
  14. #include <linux/ksm.h>
  15. #include <linux/mman.h>
  16. #include <linux/swap.h>
  17. #include <linux/capability.h>
  18. #include <linux/fs.h>
  19. #include <linux/leafops.h>
  20. #include <linux/highmem.h>
  21. #include <linux/security.h>
  22. #include <linux/syscalls.h>
  23. #include <linux/mmu_notifier.h>
  24. #include <linux/uaccess.h>
  25. #include <linux/userfaultfd_k.h>
  26. #include <linux/mempolicy.h>
  27. #include <linux/pgalloc.h>
  28. #include <asm/cacheflush.h>
  29. #include <asm/tlb.h>
  30. #include "internal.h"
  31. /* Classify the kind of remap operation being performed. */
  32. enum mremap_type {
  33. MREMAP_INVALID, /* Initial state. */
  34. MREMAP_NO_RESIZE, /* old_len == new_len, if not moved, do nothing. */
  35. MREMAP_SHRINK, /* old_len > new_len. */
  36. MREMAP_EXPAND, /* old_len < new_len. */
  37. };
/*
 * Describes a VMA mremap() operation and is threaded throughout it.
 *
 * Any of the fields may be mutated by the operation, however these values will
 * always accurately reflect the remap (for instance, we may adjust lengths and
 * delta to account for hugetlb alignment).
 *
 * The one exception is @flags, which is const-qualified and so is fixed at the
 * point the structure is initialised.
 */
struct vma_remap_struct {
	/* User-provided state. */
	unsigned long addr;	/* User-specified address from which we remap. */
	unsigned long old_len;	/* Length of range being remapped. */
	unsigned long new_len;	/* Desired new length of mapping. */
	const unsigned long flags; /* user-specified MREMAP_* flags. */
	unsigned long new_addr;	/* Optionally, desired new address. */

	/*
	 * uffd state.
	 *
	 * NOTE(review): presumably the userfaultfd context and the two unmap
	 * event lists supplied by the syscall entry point - confirm against the
	 * code that fills this structure in.
	 */
	struct vm_userfaultfd_ctx *uf;
	struct list_head *uf_unmap_early;
	struct list_head *uf_unmap;

	/* VMA state, determined in do_mremap(). */
	struct vm_area_struct *vma;

	/* Internal state, determined in do_mremap(). */
	unsigned long delta;		/* Absolute delta of old_len,new_len. */
	bool populate_expand;		/* mlock()'d expanded, must populate. */
	enum mremap_type remap_type;	/* expand, shrink, etc. */
	bool mmap_locked;		/* Is mm currently write-locked? */
	unsigned long charged;		/* If VM_ACCOUNT, # pages to account. */
	bool vmi_needs_invalidate;	/* Is the VMA iterator invalidated? */
};
  66. static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
  67. {
  68. pgd_t *pgd;
  69. p4d_t *p4d;
  70. pud_t *pud;
  71. pgd = pgd_offset(mm, addr);
  72. if (pgd_none_or_clear_bad(pgd))
  73. return NULL;
  74. p4d = p4d_offset(pgd, addr);
  75. if (p4d_none_or_clear_bad(p4d))
  76. return NULL;
  77. pud = pud_offset(p4d, addr);
  78. if (pud_none_or_clear_bad(pud))
  79. return NULL;
  80. return pud;
  81. }
  82. static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
  83. {
  84. pud_t *pud;
  85. pmd_t *pmd;
  86. pud = get_old_pud(mm, addr);
  87. if (!pud)
  88. return NULL;
  89. pmd = pmd_offset(pud, addr);
  90. if (pmd_none(*pmd))
  91. return NULL;
  92. return pmd;
  93. }
  94. static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr)
  95. {
  96. pgd_t *pgd;
  97. p4d_t *p4d;
  98. pgd = pgd_offset(mm, addr);
  99. p4d = p4d_alloc(mm, pgd, addr);
  100. if (!p4d)
  101. return NULL;
  102. return pud_alloc(mm, p4d, addr);
  103. }
  104. static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr)
  105. {
  106. pud_t *pud;
  107. pmd_t *pmd;
  108. pud = alloc_new_pud(mm, addr);
  109. if (!pud)
  110. return NULL;
  111. pmd = pmd_alloc(mm, pud, addr);
  112. if (!pmd)
  113. return NULL;
  114. VM_BUG_ON(pmd_trans_huge(*pmd));
  115. return pmd;
  116. }
  117. static void take_rmap_locks(struct vm_area_struct *vma)
  118. {
  119. if (vma->vm_file)
  120. i_mmap_lock_write(vma->vm_file->f_mapping);
  121. if (vma->anon_vma)
  122. anon_vma_lock_write(vma->anon_vma);
  123. }
  124. static void drop_rmap_locks(struct vm_area_struct *vma)
  125. {
  126. if (vma->anon_vma)
  127. anon_vma_unlock_write(vma->anon_vma);
  128. if (vma->vm_file)
  129. i_mmap_unlock_write(vma->vm_file->f_mapping);
  130. }
  131. static pte_t move_soft_dirty_pte(pte_t pte)
  132. {
  133. if (pte_none(pte))
  134. return pte;
  135. /*
  136. * Set soft dirty bit so we can notice
  137. * in userspace the ptes were moved.
  138. */
  139. if (pgtable_supports_soft_dirty()) {
  140. if (pte_present(pte))
  141. pte = pte_mksoft_dirty(pte);
  142. else
  143. pte = pte_swp_mksoft_dirty(pte);
  144. }
  145. return pte;
  146. }
  147. static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr,
  148. pte_t *ptep, pte_t pte, int max_nr)
  149. {
  150. struct folio *folio;
  151. if (max_nr == 1)
  152. return 1;
  153. /* Avoid expensive folio lookup if we stand no chance of benefit. */
  154. if (pte_batch_hint(ptep, pte) == 1)
  155. return 1;
  156. folio = vm_normal_folio(vma, addr, pte);
  157. if (!folio || !folio_test_large(folio))
  158. return 1;
  159. return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE);
  160. }
/*
 * Move the PTEs covering [pmc->old_addr, pmc->old_addr + extent) from the PTE
 * page table under @old_pmd to the one under @new_pmd, starting at
 * pmc->new_addr in the destination.
 *
 * The addresses are copied into locals; @pmc itself is not advanced here.
 * The rmap locks of pmc->old are held across the move when
 * pmc->need_rmap_locks is set.
 *
 * Returns 0 on success, or -EAGAIN if either PTE page table could not be
 * mapped. Both failure points are reached before any PTE has been touched.
 */
static int move_ptes(struct pagetable_move_control *pmc,
		unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd)
{
	struct vm_area_struct *vma = pmc->old;
	bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma);
	struct mm_struct *mm = vma->vm_mm;
	pte_t *old_ptep, *new_ptep;
	pte_t old_pte, pte;
	pmd_t dummy_pmdval;
	spinlock_t *old_ptl, *new_ptl;
	bool force_flush = false;
	unsigned long old_addr = pmc->old_addr;
	unsigned long new_addr = pmc->new_addr;
	unsigned long old_end = old_addr + extent;
	/* Remembered so the flush range can be rebuilt once old_addr has moved. */
	unsigned long len = old_end - old_addr;
	int max_nr_ptes;
	int nr_ptes;
	int err = 0;

	/*
	 * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
	 * locks to ensure that rmap will always observe either the old or the
	 * new ptes. This is the easiest way to avoid races with
	 * truncate_pagecache(), page migration, etc...
	 *
	 * When need_rmap_locks is false, we use other ways to avoid
	 * such races:
	 *
	 * - During exec() shift_arg_pages(), we use a specially tagged vma
	 *   which rmap call sites look for using vma_is_temporary_stack().
	 *
	 * - During mremap(), new_vma is often known to be placed after vma
	 *   in rmap traversal order. This ensures rmap will always observe
	 *   either the old pte, or the new pte, or both (the page table locks
	 *   serialize access to individual ptes, but only rmap traversal
	 *   order guarantees that we won't miss both the old and new ptes).
	 */
	if (pmc->need_rmap_locks)
		take_rmap_locks(vma);

	/*
	 * We don't have to worry about the ordering of src and dst
	 * pte locks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptep = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
	if (!old_ptep) {
		err = -EAGAIN;
		goto out;
	}
	/*
	 * Now new_pte is none, so hpage_collapse_scan_file() path can not find
	 * this by traversing file->f_mapping, so there is no concurrency with
	 * retract_page_tables(). In addition, we already hold the exclusive
	 * mmap_lock, so this new_pte page is stable, so there is no need to get
	 * pmdval and do pmd_same() check.
	 */
	new_ptep = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval,
					   &new_ptl);
	if (!new_ptep) {
		/* Undo the source mapping/lock before bailing out. */
		pte_unmap_unlock(old_ptep, old_ptl);
		err = -EAGAIN;
		goto out;
	}
	/* Source and destination may share one lock; only take it once. */
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
	flush_tlb_batched_pending(vma->vm_mm);
	lazy_mmu_mode_enable();

	/*
	 * nr_ptes is assigned at the top of every iteration (before the
	 * 'continue'), so the increment expressions always see a valid step.
	 */
	for (; old_addr < old_end; old_ptep += nr_ptes, old_addr += nr_ptes * PAGE_SIZE,
		new_ptep += nr_ptes, new_addr += nr_ptes * PAGE_SIZE) {
		VM_WARN_ON_ONCE(!pte_none(*new_ptep));

		nr_ptes = 1;
		/* Upper bound for a batch: the PTEs left in this extent. */
		max_nr_ptes = (old_end - old_addr) >> PAGE_SHIFT;
		old_pte = ptep_get(old_ptep);
		if (pte_none(old_pte))
			continue;

		/*
		 * If we are remapping a valid PTE, make sure
		 * to flush TLB before we drop the PTL for the
		 * PTE.
		 *
		 * NOTE! Both old and new PTL matter: the old one
		 * for racing with folio_mkclean(), the new one to
		 * make sure the physical page stays valid until
		 * the TLB entry for the old mapping has been
		 * flushed.
		 */
		if (pte_present(old_pte)) {
			nr_ptes = mremap_folio_pte_batch(vma, old_addr, old_ptep,
							 old_pte, max_nr_ptes);
			force_flush = true;
		}
		pte = get_and_clear_ptes(mm, old_addr, old_ptep, nr_ptes);
		pte = move_pte(pte, old_addr, new_addr);
		pte = move_soft_dirty_pte(pte);

		/*
		 * When uffd-wp state must not follow the mapping, a uffd-wp
		 * marker is dropped entirely (the destination is cleared) and
		 * any other PTE has its uffd-wp bit cleared before being
		 * installed.
		 */
		if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte))
			pte_clear(mm, new_addr, new_ptep);
		else {
			if (need_clear_uffd_wp) {
				if (pte_present(pte))
					pte = pte_clear_uffd_wp(pte);
				else
					pte = pte_swp_clear_uffd_wp(pte);
			}
			set_ptes(mm, new_addr, new_ptep, pte, nr_ptes);
		}
	}

	lazy_mmu_mode_disable();
	/* old_addr has been advanced by the loop; old_end - len is the start. */
	if (force_flush)
		flush_tlb_range(vma, old_end - len, old_end);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	/*
	 * The loop leaves both pointers past the last entry processed, so step
	 * back by one - presumably so the unmap helpers are handed a pointer
	 * that still lies within the mapped page table.
	 */
	pte_unmap(new_ptep - 1);
	pte_unmap_unlock(old_ptep - 1, old_ptl);
out:
	if (pmc->need_rmap_locks)
		drop_rmap_locks(vma);
	return err;
}
  277. #ifndef arch_supports_page_table_move
  278. #define arch_supports_page_table_move arch_supports_page_table_move
  279. static inline bool arch_supports_page_table_move(void)
  280. {
  281. return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
  282. IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
  283. }
  284. #endif
  285. static inline bool uffd_supports_page_table_move(struct pagetable_move_control *pmc)
  286. {
  287. /*
  288. * If we are moving a VMA that has uffd-wp registered but with
  289. * remap events disabled (new VMA will not be registered with uffd), we
  290. * need to ensure that the uffd-wp state is cleared from all pgtables.
  291. * This means recursing into lower page tables in move_page_tables().
  292. *
  293. * We might get called with VMAs reversed when recovering from a
  294. * failed page table move. In that case, the
  295. * "old"-but-actually-"originally new" VMA during recovery will not have
  296. * a uffd context. Recursing into lower page tables during the original
  297. * move but not during the recovery move will cause trouble, because we
  298. * run into already-existing page tables. So check both VMAs.
  299. */
  300. return !vma_has_uffd_without_event_remap(pmc->old) &&
  301. !vma_has_uffd_without_event_remap(pmc->new);
  302. }
#ifdef CONFIG_HAVE_MOVE_PMD
/*
 * Try to move everything mapped below @old_pmd in one go, by detaching the
 * PTE page table it points to and attaching that table to @new_pmd, rather
 * than moving the PTEs one by one.
 *
 * Returns true if the page table was moved. Returns false, leaving both
 * entries as they were, if the architecture or the uffd state rules out page
 * table moves, if @new_pmd is already populated, or if @old_pmd turns out to
 * be non-present or a leaf once the locks are held.
 */
static bool move_normal_pmd(struct pagetable_move_control *pmc,
			pmd_t *old_pmd, pmd_t *new_pmd)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	bool res = false;
	pmd_t pmd;

	if (!arch_supports_page_table_move())
		return false;
	if (!uffd_supports_page_table_move(pmc))
		return false;
	/*
	 * The destination pmd shouldn't be established, free_pgtables()
	 * should have released it.
	 *
	 * However, there's a case during execve() where we use mremap
	 * to move the initial stack, and in that case the target area
	 * may overlap the source area (always moving down).
	 *
	 * If everything is PMD-aligned, that works fine, as moving
	 * each pmd down will clear the source pmd. But if we first
	 * have a few 4kB-only pages that get moved down, and then
	 * hit the "now the rest is PMD-aligned, let's do everything
	 * one pmd at a time", we will still have the old (now empty
	 * of any 4kB pages, but still there) PMD in the page table
	 * tree.
	 *
	 * Warn on it once - because we really should try to figure
	 * out how to do this better - but then say "I won't move
	 * this pmd".
	 *
	 * One alternative might be to just unmap the target pmd at
	 * this point, and verify that it really is empty. We'll see.
	 */
	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pmd_lock(mm, old_pmd);
	new_ptl = pmd_lockptr(mm, new_pmd);
	/* Both entries may be covered by the same lock; take it only once. */
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Snapshot the source entry now that it is locked. */
	pmd = *old_pmd;

	/* Racing with collapse? */
	if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd)))
		goto out_unlock;
	/* Clear the pmd */
	pmd_clear(old_pmd);
	res = true;

	VM_BUG_ON(!pmd_none(*new_pmd));

	/* Hook the detached PTE page table up at the destination. */
	pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE);
out_unlock:
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return res;
}
#else
/* Without CONFIG_HAVE_MOVE_PMD a normal PMD is never moved as a unit. */
static inline bool move_normal_pmd(struct pagetable_move_control *pmc,
		pmd_t *old_pmd, pmd_t *new_pmd)
{
	return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
/*
 * Try to move everything mapped below @old_pud in one go, by detaching the
 * PMD page table it points to and attaching that table to @new_pud.
 *
 * Returns true if the page table was moved. Returns false, leaving both
 * entries as they were, if the architecture or the uffd state rules out page
 * table moves, or if @new_pud is already populated.
 */
static bool move_normal_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	if (!arch_supports_page_table_move())
		return false;
	if (!uffd_supports_page_table_move(pmc))
		return false;
	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	/* Both entries may be covered by the same lock; take it only once. */
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	/* Hook the detached PMD page table up at the destination. */
	pud_populate(mm, new_pud, pud_pgtable(pud));
	flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
/* In this configuration a normal PUD is never moved as a unit. */
static inline bool move_normal_pud(struct pagetable_move_control *pmc,
		pud_t *old_pud, pud_t *new_pud)
{
	return false;
}
#endif
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/*
 * Move a huge PUD entry from @old_pud to @new_pud by clearing the source and
 * installing the same entry value at pmc->new_addr with set_pud_at().
 *
 * Returns true if the entry was moved, or false (after a one-time warning) if
 * @new_pud is already populated.
 */
static bool move_huge_pud(struct pagetable_move_control *pmc,
			  pud_t *old_pud, pud_t *new_pud)
{
	spinlock_t *old_ptl, *new_ptl;
	struct vm_area_struct *vma = pmc->old;
	struct mm_struct *mm = vma->vm_mm;
	pud_t pud;

	/*
	 * The destination pud shouldn't be established, free_pgtables()
	 * should have released it.
	 */
	if (WARN_ON_ONCE(!pud_none(*new_pud)))
		return false;

	/*
	 * We don't have to worry about the ordering of src and dst
	 * ptlocks because exclusive mmap_lock prevents deadlock.
	 */
	old_ptl = pud_lock(mm, old_pud);
	new_ptl = pud_lockptr(mm, new_pud);
	/* Both entries may be covered by the same lock; take it only once. */
	if (new_ptl != old_ptl)
		spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);

	/* Clear the pud */
	pud = *old_pud;
	pud_clear(old_pud);

	VM_BUG_ON(!pud_none(*new_pud));

	/* Set the new pud */
	/* mark soft_dirty when we add pud level soft dirty support */
	set_pud_at(mm, pmc->new_addr, new_pud, pud);
	flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE);
	if (new_ptl != old_ptl)
		spin_unlock(new_ptl);
	spin_unlock(old_ptl);

	return true;
}
#else
/*
 * Stub for configurations without huge PUD support: being called at all is
 * unexpected, so warn once and report that nothing was moved.
 */
static bool move_huge_pud(struct pagetable_move_control *pmc,
			  pud_t *old_pud, pud_t *new_pud)
{
	WARN_ON_ONCE(1);
	return false;
}
#endif
  459. enum pgt_entry {
  460. NORMAL_PMD,
  461. HPAGE_PMD,
  462. NORMAL_PUD,
  463. HPAGE_PUD,
  464. };
  465. /*
  466. * Returns an extent of the corresponding size for the pgt_entry specified if
  467. * valid. Else returns a smaller extent bounded by the end of the source and
  468. * destination pgt_entry.
  469. */
  470. static __always_inline unsigned long get_extent(enum pgt_entry entry,
  471. struct pagetable_move_control *pmc)
  472. {
  473. unsigned long next, extent, mask, size;
  474. unsigned long old_addr = pmc->old_addr;
  475. unsigned long old_end = pmc->old_end;
  476. unsigned long new_addr = pmc->new_addr;
  477. switch (entry) {
  478. case HPAGE_PMD:
  479. case NORMAL_PMD:
  480. mask = PMD_MASK;
  481. size = PMD_SIZE;
  482. break;
  483. case HPAGE_PUD:
  484. case NORMAL_PUD:
  485. mask = PUD_MASK;
  486. size = PUD_SIZE;
  487. break;
  488. default:
  489. BUILD_BUG();
  490. break;
  491. }
  492. next = (old_addr + size) & mask;
  493. /* even if next overflowed, extent below will be ok */
  494. extent = next - old_addr;
  495. if (extent > old_end - old_addr)
  496. extent = old_end - old_addr;
  497. next = (new_addr + size) & mask;
  498. if (extent > next - new_addr)
  499. extent = next - new_addr;
  500. return extent;
  501. }
  502. /*
  503. * Should move_pgt_entry() acquire the rmap locks? This is either expressed in
  504. * the PMC, or overridden in the case of normal, larger page tables.
  505. */
  506. static bool should_take_rmap_locks(struct pagetable_move_control *pmc,
  507. enum pgt_entry entry)
  508. {
  509. switch (entry) {
  510. case NORMAL_PMD:
  511. case NORMAL_PUD:
  512. return true;
  513. default:
  514. return pmc->need_rmap_locks;
  515. }
  516. }
  517. /*
  518. * Attempts to speedup the move by moving entry at the level corresponding to
  519. * pgt_entry. Returns true if the move was successful, else false.
  520. */
  521. static bool move_pgt_entry(struct pagetable_move_control *pmc,
  522. enum pgt_entry entry, void *old_entry, void *new_entry)
  523. {
  524. bool moved = false;
  525. bool need_rmap_locks = should_take_rmap_locks(pmc, entry);
  526. /* See comment in move_ptes() */
  527. if (need_rmap_locks)
  528. take_rmap_locks(pmc->old);
  529. switch (entry) {
  530. case NORMAL_PMD:
  531. moved = move_normal_pmd(pmc, old_entry, new_entry);
  532. break;
  533. case NORMAL_PUD:
  534. moved = move_normal_pud(pmc, old_entry, new_entry);
  535. break;
  536. case HPAGE_PMD:
  537. moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
  538. move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry,
  539. new_entry);
  540. break;
  541. case HPAGE_PUD:
  542. moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
  543. move_huge_pud(pmc, old_entry, new_entry);
  544. break;
  545. default:
  546. WARN_ON_ONCE(1);
  547. break;
  548. }
  549. if (need_rmap_locks)
  550. drop_rmap_locks(pmc->old);
  551. return moved;
  552. }
  553. /*
  554. * A helper to check if aligning down is OK. The aligned address should fall
  555. * on *no mapping*. For the stack moving down, that's a special move within
  556. * the VMA that is created to span the source and destination of the move,
  557. * so we make an exception for it.
  558. */
  559. static bool can_align_down(struct pagetable_move_control *pmc,
  560. struct vm_area_struct *vma, unsigned long addr_to_align,
  561. unsigned long mask)
  562. {
  563. unsigned long addr_masked = addr_to_align & mask;
  564. /*
  565. * If @addr_to_align of either source or destination is not the beginning
  566. * of the corresponding VMA, we can't align down or we will destroy part
  567. * of the current mapping.
  568. */
  569. if (!pmc->for_stack && vma->vm_start != addr_to_align)
  570. return false;
  571. /* In the stack case we explicitly permit in-VMA alignment. */
  572. if (pmc->for_stack && addr_masked >= vma->vm_start)
  573. return true;
  574. /*
  575. * Make sure the realignment doesn't cause the address to fall on an
  576. * existing mapping.
  577. */
  578. return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL;
  579. }
  580. /*
  581. * Determine if are in fact able to realign for efficiency to a higher page
  582. * table boundary.
  583. */
  584. static bool can_realign_addr(struct pagetable_move_control *pmc,
  585. unsigned long pagetable_mask)
  586. {
  587. unsigned long align_mask = ~pagetable_mask;
  588. unsigned long old_align = pmc->old_addr & align_mask;
  589. unsigned long new_align = pmc->new_addr & align_mask;
  590. unsigned long pagetable_size = align_mask + 1;
  591. unsigned long old_align_next = pagetable_size - old_align;
  592. /*
  593. * We don't want to have to go hunting for VMAs from the end of the old
  594. * VMA to the next page table boundary, also we want to make sure the
  595. * operation is worthwhile.
  596. *
  597. * So ensure that we only perform this realignment if the end of the
  598. * range being copied reaches or crosses the page table boundary.
  599. *
  600. * boundary boundary
  601. * .<- old_align -> .
  602. * . |----------------.-----------|
  603. * . | vma . |
  604. * . |----------------.-----------|
  605. * . <----------------.----------->
  606. * . len_in
  607. * <------------------------------->
  608. * . pagetable_size .
  609. * . <---------------->
  610. * . old_align_next .
  611. */
  612. if (pmc->len_in < old_align_next)
  613. return false;
  614. /* Skip if the addresses are already aligned. */
  615. if (old_align == 0)
  616. return false;
  617. /* Only realign if the new and old addresses are mutually aligned. */
  618. if (old_align != new_align)
  619. return false;
  620. /* Ensure realignment doesn't cause overlap with existing mappings. */
  621. if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) ||
  622. !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask))
  623. return false;
  624. return true;
  625. }
  626. /*
  627. * Opportunistically realign to specified boundary for faster copy.
  628. *
  629. * Consider an mremap() of a VMA with page table boundaries as below, and no
  630. * preceding VMAs from the lower page table boundary to the start of the VMA,
  631. * with the end of the range reaching or crossing the page table boundary.
  632. *
  633. * boundary boundary
  634. * . |----------------.-----------|
  635. * . | vma . |
  636. * . |----------------.-----------|
  637. * . pmc->old_addr . pmc->old_end
  638. * . <---------------------------->
  639. * . move these page tables
  640. *
  641. * If we proceed with moving page tables in this scenario, we will have a lot of
  642. * work to do traversing old page tables and establishing new ones in the
  643. * destination across multiple lower level page tables.
  644. *
  645. * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the
  646. * page table boundary, so we can simply copy a single page table entry for the
  647. * aligned portion of the VMA instead:
  648. *
  649. * boundary boundary
  650. * . |----------------.-----------|
  651. * . | vma . |
  652. * . |----------------.-----------|
  653. * pmc->old_addr . pmc->old_end
  654. * <------------------------------------------->
  655. * . move these page tables
  656. */
  657. static void try_realign_addr(struct pagetable_move_control *pmc,
  658. unsigned long pagetable_mask)
  659. {
  660. if (!can_realign_addr(pmc, pagetable_mask))
  661. return;
  662. /*
  663. * Simply align to page table boundaries. Note that we do NOT update the
  664. * pmc->old_end value, and since the move_page_tables() operation spans
  665. * from [old_addr, old_end) (offsetting new_addr as it is performed),
  666. * this simply changes the start of the copy, not the end.
  667. */
  668. pmc->old_addr &= pagetable_mask;
  669. pmc->new_addr &= pagetable_mask;
  670. }
  671. /* Is the page table move operation done? */
  672. static bool pmc_done(struct pagetable_move_control *pmc)
  673. {
  674. return pmc->old_addr >= pmc->old_end;
  675. }
  676. /* Advance to the next page table, offset by extent bytes. */
  677. static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent)
  678. {
  679. pmc->old_addr += extent;
  680. pmc->new_addr += extent;
  681. }
  682. /*
  683. * Determine how many bytes in the specified input range have had their page
  684. * tables moved so far.
  685. */
  686. static unsigned long pmc_progress(struct pagetable_move_control *pmc)
  687. {
  688. unsigned long orig_old_addr = pmc->old_end - pmc->len_in;
  689. unsigned long old_addr = pmc->old_addr;
  690. /*
  691. * Prevent negative return values when {old,new}_addr was realigned but
  692. * we broke out of the loop in move_page_tables() for the first PMD
  693. * itself.
  694. */
  695. return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr;
  696. }
  697. unsigned long move_page_tables(struct pagetable_move_control *pmc)
  698. {
  699. unsigned long extent;
  700. struct mmu_notifier_range range;
  701. pmd_t *old_pmd, *new_pmd;
  702. pud_t *old_pud, *new_pud;
  703. struct mm_struct *mm = pmc->old->vm_mm;
  704. if (!pmc->len_in)
  705. return 0;
  706. if (is_vm_hugetlb_page(pmc->old))
  707. return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr,
  708. pmc->new_addr, pmc->len_in);
  709. /*
  710. * If possible, realign addresses to PMD boundary for faster copy.
  711. * Only realign if the mremap copying hits a PMD boundary.
  712. */
  713. try_realign_addr(pmc, PMD_MASK);
  714. flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end);
  715. mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm,
  716. pmc->old_addr, pmc->old_end);
  717. mmu_notifier_invalidate_range_start(&range);
  718. for (; !pmc_done(pmc); pmc_next(pmc, extent)) {
  719. cond_resched();
  720. /*
  721. * If extent is PUD-sized try to speed up the move by moving at the
  722. * PUD level if possible.
  723. */
  724. extent = get_extent(NORMAL_PUD, pmc);
  725. old_pud = get_old_pud(mm, pmc->old_addr);
  726. if (!old_pud)
  727. continue;
  728. new_pud = alloc_new_pud(mm, pmc->new_addr);
  729. if (!new_pud)
  730. break;
  731. if (pud_trans_huge(*old_pud)) {
  732. if (extent == HPAGE_PUD_SIZE) {
  733. move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud);
  734. /* We ignore and continue on error? */
  735. continue;
  736. }
  737. } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
  738. if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud))
  739. continue;
  740. }
  741. extent = get_extent(NORMAL_PMD, pmc);
  742. old_pmd = get_old_pmd(mm, pmc->old_addr);
  743. if (!old_pmd)
  744. continue;
  745. new_pmd = alloc_new_pmd(mm, pmc->new_addr);
  746. if (!new_pmd)
  747. break;
  748. again:
  749. if (pmd_is_huge(*old_pmd)) {
  750. if (extent == HPAGE_PMD_SIZE &&
  751. move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd))
  752. continue;
  753. split_huge_pmd(pmc->old, old_pmd, pmc->old_addr);
  754. } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
  755. extent == PMD_SIZE) {
  756. /*
  757. * If the extent is PMD-sized, try to speed the move by
  758. * moving at the PMD level if possible.
  759. */
  760. if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd))
  761. continue;
  762. }
  763. if (pmd_none(*old_pmd))
  764. continue;
  765. if (pte_alloc(pmc->new->vm_mm, new_pmd))
  766. break;
  767. if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0)
  768. goto again;
  769. }
  770. mmu_notifier_invalidate_range_end(&range);
  771. return pmc_progress(pmc);
  772. }
  773. /* Set vrm->delta to the difference in VMA size specified by user. */
  774. static void vrm_set_delta(struct vma_remap_struct *vrm)
  775. {
  776. vrm->delta = abs_diff(vrm->old_len, vrm->new_len);
  777. }
  778. /* Determine what kind of remap this is - shrink, expand or no resize at all. */
  779. static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm)
  780. {
  781. if (vrm->delta == 0)
  782. return MREMAP_NO_RESIZE;
  783. if (vrm->old_len > vrm->new_len)
  784. return MREMAP_SHRINK;
  785. return MREMAP_EXPAND;
  786. }
  787. /*
  788. * When moving a VMA to vrm->new_adr, does this result in the new and old VMAs
  789. * overlapping?
  790. */
  791. static bool vrm_overlaps(struct vma_remap_struct *vrm)
  792. {
  793. unsigned long start_old = vrm->addr;
  794. unsigned long start_new = vrm->new_addr;
  795. unsigned long end_old = vrm->addr + vrm->old_len;
  796. unsigned long end_new = vrm->new_addr + vrm->new_len;
  797. /*
  798. * start_old end_old
  799. * |-----------|
  800. * | |
  801. * |-----------|
  802. * |-------------|
  803. * | |
  804. * |-------------|
  805. * start_new end_new
  806. */
  807. if (end_old > start_new && end_new > start_old)
  808. return true;
  809. return false;
  810. }
  811. /*
  812. * Will a new address definitely be assigned? This either if the user specifies
  813. * it via MREMAP_FIXED, or if MREMAP_DONTUNMAP is used, indicating we will
  814. * always determine a target address.
  815. */
  816. static bool vrm_implies_new_addr(struct vma_remap_struct *vrm)
  817. {
  818. return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP);
  819. }
  820. /*
  821. * Find an unmapped area for the requested vrm->new_addr.
  822. *
  823. * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only
  824. * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to
  825. * mmap(), otherwise this is equivalent to mmap() specifying a NULL address.
  826. *
  827. * Returns 0 on success (with vrm->new_addr updated), or an error code upon
  828. * failure.
  829. */
  830. static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm)
  831. {
  832. struct vm_area_struct *vma = vrm->vma;
  833. unsigned long map_flags = 0;
  834. /* Page Offset _into_ the VMA. */
  835. pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT;
  836. pgoff_t pgoff = vma->vm_pgoff + internal_pgoff;
  837. unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0;
  838. unsigned long res;
  839. if (vrm->flags & MREMAP_FIXED)
  840. map_flags |= MAP_FIXED;
  841. if (vma->vm_flags & VM_MAYSHARE)
  842. map_flags |= MAP_SHARED;
  843. res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff,
  844. map_flags);
  845. if (IS_ERR_VALUE(res))
  846. return res;
  847. vrm->new_addr = res;
  848. return 0;
  849. }
  850. /*
  851. * Keep track of pages which have been added to the memory mapping. If the VMA
  852. * is accounted, also check to see if there is sufficient memory.
  853. *
  854. * Returns true on success, false if insufficient memory to charge.
  855. */
  856. static bool vrm_calc_charge(struct vma_remap_struct *vrm)
  857. {
  858. unsigned long charged;
  859. if (!(vrm->vma->vm_flags & VM_ACCOUNT))
  860. return true;
  861. /*
  862. * If we don't unmap the old mapping, then we account the entirety of
  863. * the length of the new one. Otherwise it's just the delta in size.
  864. */
  865. if (vrm->flags & MREMAP_DONTUNMAP)
  866. charged = vrm->new_len >> PAGE_SHIFT;
  867. else
  868. charged = vrm->delta >> PAGE_SHIFT;
  869. /* This accounts 'charged' pages of memory. */
  870. if (security_vm_enough_memory_mm(current->mm, charged))
  871. return false;
  872. vrm->charged = charged;
  873. return true;
  874. }
  875. /*
  876. * an error has occurred so we will not be using vrm->charged memory. Unaccount
  877. * this memory if the VMA is accounted.
  878. */
  879. static void vrm_uncharge(struct vma_remap_struct *vrm)
  880. {
  881. if (!(vrm->vma->vm_flags & VM_ACCOUNT))
  882. return;
  883. vm_unacct_memory(vrm->charged);
  884. vrm->charged = 0;
  885. }
  886. /*
  887. * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to
  888. * account for 'bytes' memory used, and if locked, indicate this in the VRM so
  889. * we can handle this correctly later.
  890. */
  891. static void vrm_stat_account(struct vma_remap_struct *vrm,
  892. unsigned long bytes)
  893. {
  894. unsigned long pages = bytes >> PAGE_SHIFT;
  895. struct mm_struct *mm = current->mm;
  896. struct vm_area_struct *vma = vrm->vma;
  897. vm_stat_account(mm, vma->vm_flags, pages);
  898. if (vma->vm_flags & VM_LOCKED)
  899. mm->locked_vm += pages;
  900. }
  901. /*
  902. * Perform checks before attempting to write a VMA prior to it being
  903. * moved.
  904. */
  905. static unsigned long prep_move_vma(struct vma_remap_struct *vrm)
  906. {
  907. unsigned long err = 0;
  908. struct vm_area_struct *vma = vrm->vma;
  909. unsigned long old_addr = vrm->addr;
  910. unsigned long old_len = vrm->old_len;
  911. vm_flags_t dummy = vma->vm_flags;
  912. /*
  913. * We'd prefer to avoid failure later on in do_munmap:
  914. * which may split one vma into three before unmapping.
  915. */
  916. if (current->mm->map_count >= sysctl_max_map_count - 3)
  917. return -ENOMEM;
  918. if (vma->vm_ops && vma->vm_ops->may_split) {
  919. if (vma->vm_start != old_addr)
  920. err = vma->vm_ops->may_split(vma, old_addr);
  921. if (!err && vma->vm_end != old_addr + old_len)
  922. err = vma->vm_ops->may_split(vma, old_addr + old_len);
  923. if (err)
  924. return err;
  925. }
  926. /*
  927. * Advise KSM to break any KSM pages in the area to be moved:
  928. * it would be confusing if they were to turn up at the new
  929. * location, where they happen to coincide with different KSM
  930. * pages recently unmapped. But leave vma->vm_flags as it was,
  931. * so KSM can come around to merge on vma and new_vma afterwards.
  932. */
  933. err = ksm_madvise(vma, old_addr, old_addr + old_len,
  934. MADV_UNMERGEABLE, &dummy);
  935. if (err)
  936. return err;
  937. return 0;
  938. }
  939. /*
  940. * Unmap source VMA for VMA move, turning it from a copy to a move, being
  941. * careful to ensure we do not underflow memory account while doing so if an
  942. * accountable move.
  943. *
  944. * This is best effort, if we fail to unmap then we simply try to correct
  945. * accounting and exit.
  946. */
  947. static void unmap_source_vma(struct vma_remap_struct *vrm)
  948. {
  949. struct mm_struct *mm = current->mm;
  950. unsigned long addr = vrm->addr;
  951. unsigned long len = vrm->old_len;
  952. struct vm_area_struct *vma = vrm->vma;
  953. VMA_ITERATOR(vmi, mm, addr);
  954. int err;
  955. unsigned long vm_start;
  956. unsigned long vm_end;
  957. /*
  958. * It might seem odd that we check for MREMAP_DONTUNMAP here, given this
  959. * function implies that we unmap the original VMA, which seems
  960. * contradictory.
  961. *
  962. * However, this occurs when this operation was attempted and an error
  963. * arose, in which case we _do_ wish to unmap the _new_ VMA, which means
  964. * we actually _do_ want it be unaccounted.
  965. */
  966. bool accountable_move = (vma->vm_flags & VM_ACCOUNT) &&
  967. !(vrm->flags & MREMAP_DONTUNMAP);
  968. /*
  969. * So we perform a trick here to prevent incorrect accounting. Any merge
  970. * or new VMA allocation performed in copy_vma() does not adjust
  971. * accounting, it is expected that callers handle this.
  972. *
  973. * And indeed we already have, accounting appropriately in the case of
  974. * both in vrm_charge().
  975. *
  976. * However, when we unmap the existing VMA (to effect the move), this
  977. * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount
  978. * removed pages.
  979. *
  980. * To avoid this we temporarily clear this flag, reinstating on any
  981. * portions of the original VMA that remain.
  982. */
  983. if (accountable_move) {
  984. vm_flags_clear(vma, VM_ACCOUNT);
  985. /* We are about to split vma, so store the start/end. */
  986. vm_start = vma->vm_start;
  987. vm_end = vma->vm_end;
  988. }
  989. err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false);
  990. vrm->vma = NULL; /* Invalidated. */
  991. vrm->vmi_needs_invalidate = true;
  992. if (err) {
  993. /* OOM: unable to split vma, just get accounts right */
  994. vm_acct_memory(len >> PAGE_SHIFT);
  995. return;
  996. }
  997. /*
  998. * If we mremap() from a VMA like this:
  999. *
  1000. * addr end
  1001. * | |
  1002. * v v
  1003. * |-------------|
  1004. * | |
  1005. * |-------------|
  1006. *
  1007. * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above
  1008. * we'll end up with:
  1009. *
  1010. * addr end
  1011. * | |
  1012. * v v
  1013. * |---| |---|
  1014. * | A | | B |
  1015. * |---| |---|
  1016. *
  1017. * The VMI is still pointing at addr, so vma_prev() will give us A, and
  1018. * a subsequent or lone vma_next() will give as B.
  1019. *
  1020. * do_vmi_munmap() will have restored the VMI back to addr.
  1021. */
  1022. if (accountable_move) {
  1023. unsigned long end = addr + len;
  1024. if (vm_start < addr) {
  1025. struct vm_area_struct *prev = vma_prev(&vmi);
  1026. vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */
  1027. }
  1028. if (vm_end > end) {
  1029. struct vm_area_struct *next = vma_next(&vmi);
  1030. vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */
  1031. }
  1032. }
  1033. }
  1034. /*
  1035. * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the
  1036. * process. Additionally handle an error occurring on moving of page tables,
  1037. * where we reset vrm state to cause unmapping of the new VMA.
  1038. *
  1039. * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an
  1040. * error code.
  1041. */
  1042. static int copy_vma_and_data(struct vma_remap_struct *vrm,
  1043. struct vm_area_struct **new_vma_ptr)
  1044. {
  1045. unsigned long internal_offset = vrm->addr - vrm->vma->vm_start;
  1046. unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT;
  1047. unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff;
  1048. unsigned long moved_len;
  1049. struct vm_area_struct *vma = vrm->vma;
  1050. struct vm_area_struct *new_vma;
  1051. int err = 0;
  1052. PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len);
  1053. new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff,
  1054. &pmc.need_rmap_locks);
  1055. if (!new_vma) {
  1056. vrm_uncharge(vrm);
  1057. *new_vma_ptr = NULL;
  1058. return -ENOMEM;
  1059. }
  1060. /* By merging, we may have invalidated any iterator in use. */
  1061. if (vma != vrm->vma)
  1062. vrm->vmi_needs_invalidate = true;
  1063. vrm->vma = vma;
  1064. pmc.old = vma;
  1065. pmc.new = new_vma;
  1066. moved_len = move_page_tables(&pmc);
  1067. if (moved_len < vrm->old_len)
  1068. err = -ENOMEM;
  1069. else if (vma->vm_ops && vma->vm_ops->mremap)
  1070. err = vma->vm_ops->mremap(new_vma);
  1071. if (unlikely(err)) {
  1072. PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr,
  1073. vrm->addr, moved_len);
  1074. /*
  1075. * On error, move entries back from new area to old,
  1076. * which will succeed since page tables still there,
  1077. * and then proceed to unmap new area instead of old.
  1078. */
  1079. pmc_revert.need_rmap_locks = true;
  1080. move_page_tables(&pmc_revert);
  1081. vrm->vma = new_vma;
  1082. vrm->old_len = vrm->new_len;
  1083. vrm->addr = vrm->new_addr;
  1084. } else {
  1085. mremap_userfaultfd_prep(new_vma, vrm->uf);
  1086. }
  1087. fixup_hugetlb_reservations(vma);
  1088. *new_vma_ptr = new_vma;
  1089. return err;
  1090. }
  1091. /*
  1092. * Perform final tasks for MADV_DONTUNMAP operation, clearing mlock() flag on
  1093. * remaining VMA by convention (it cannot be mlock()'d any longer, as pages in
  1094. * range are no longer mapped), and removing anon_vma_chain links from it if the
  1095. * entire VMA was copied over.
  1096. */
  1097. static void dontunmap_complete(struct vma_remap_struct *vrm,
  1098. struct vm_area_struct *new_vma)
  1099. {
  1100. unsigned long start = vrm->addr;
  1101. unsigned long end = vrm->addr + vrm->old_len;
  1102. unsigned long old_start = vrm->vma->vm_start;
  1103. unsigned long old_end = vrm->vma->vm_end;
  1104. /* We always clear VM_LOCKED[ONFAULT] on the old VMA. */
  1105. vm_flags_clear(vrm->vma, VM_LOCKED_MASK);
  1106. /*
  1107. * anon_vma links of the old vma is no longer needed after its page
  1108. * table has been moved.
  1109. */
  1110. if (new_vma != vrm->vma && start == old_start && end == old_end)
  1111. unlink_anon_vmas(vrm->vma);
  1112. /* Because we won't unmap we don't need to touch locked_vm. */
  1113. }
  1114. static unsigned long move_vma(struct vma_remap_struct *vrm)
  1115. {
  1116. struct mm_struct *mm = current->mm;
  1117. struct vm_area_struct *new_vma;
  1118. unsigned long hiwater_vm;
  1119. int err;
  1120. err = prep_move_vma(vrm);
  1121. if (err)
  1122. return err;
  1123. /*
  1124. * If accounted, determine the number of bytes the operation will
  1125. * charge.
  1126. */
  1127. if (!vrm_calc_charge(vrm))
  1128. return -ENOMEM;
  1129. /* We don't want racing faults. */
  1130. vma_start_write(vrm->vma);
  1131. /* Perform copy step. */
  1132. err = copy_vma_and_data(vrm, &new_vma);
  1133. /*
  1134. * If we established the copied-to VMA, we attempt to recover from the
  1135. * error by setting the destination VMA to the source VMA and unmapping
  1136. * it below.
  1137. */
  1138. if (err && !new_vma)
  1139. return err;
  1140. /*
  1141. * If we failed to move page tables we still do total_vm increment
  1142. * since do_munmap() will decrement it by old_len == new_len.
  1143. *
  1144. * Since total_vm is about to be raised artificially high for a
  1145. * moment, we need to restore high watermark afterwards: if stats
  1146. * are taken meanwhile, total_vm and hiwater_vm appear too high.
  1147. * If this were a serious issue, we'd add a flag to do_munmap().
  1148. */
  1149. hiwater_vm = mm->hiwater_vm;
  1150. vrm_stat_account(vrm, vrm->new_len);
  1151. if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP)))
  1152. dontunmap_complete(vrm, new_vma);
  1153. else
  1154. unmap_source_vma(vrm);
  1155. mm->hiwater_vm = hiwater_vm;
  1156. return err ? (unsigned long)err : vrm->new_addr;
  1157. }
  1158. /*
  1159. * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so
  1160. * execute this, optionally dropping the mmap lock when we do so.
  1161. *
  1162. * In both cases this invalidates the VMA, however if we don't drop the lock,
  1163. * then load the correct VMA into vrm->vma afterwards.
  1164. */
  1165. static unsigned long shrink_vma(struct vma_remap_struct *vrm,
  1166. bool drop_lock)
  1167. {
  1168. struct mm_struct *mm = current->mm;
  1169. unsigned long unmap_start = vrm->addr + vrm->new_len;
  1170. unsigned long unmap_bytes = vrm->delta;
  1171. unsigned long res;
  1172. VMA_ITERATOR(vmi, mm, unmap_start);
  1173. VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK);
  1174. res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes,
  1175. vrm->uf_unmap, drop_lock);
  1176. vrm->vma = NULL; /* Invalidated. */
  1177. if (res)
  1178. return res;
  1179. /*
  1180. * If we've not dropped the lock, then we should reload the VMA to
  1181. * replace the invalidated VMA with the one that may have now been
  1182. * split.
  1183. */
  1184. if (drop_lock) {
  1185. vrm->mmap_locked = false;
  1186. } else {
  1187. vrm->vma = vma_lookup(mm, vrm->addr);
  1188. if (!vrm->vma)
  1189. return -EFAULT;
  1190. }
  1191. return 0;
  1192. }
  1193. /*
  1194. * mremap_to() - remap a vma to a new location.
  1195. * Returns: The new address of the vma or an error.
  1196. */
  1197. static unsigned long mremap_to(struct vma_remap_struct *vrm)
  1198. {
  1199. struct mm_struct *mm = current->mm;
  1200. unsigned long err;
  1201. if (vrm->flags & MREMAP_FIXED) {
  1202. /*
  1203. * In mremap_to().
  1204. * VMA is moved to dst address, and munmap dst first.
  1205. * do_munmap will check if dst is sealed.
  1206. */
  1207. err = do_munmap(mm, vrm->new_addr, vrm->new_len,
  1208. vrm->uf_unmap_early);
  1209. vrm->vma = NULL; /* Invalidated. */
  1210. vrm->vmi_needs_invalidate = true;
  1211. if (err)
  1212. return err;
  1213. /*
  1214. * If we remap a portion of a VMA elsewhere in the same VMA,
  1215. * this can invalidate the old VMA. Reset.
  1216. */
  1217. vrm->vma = vma_lookup(mm, vrm->addr);
  1218. if (!vrm->vma)
  1219. return -EFAULT;
  1220. }
  1221. if (vrm->remap_type == MREMAP_SHRINK) {
  1222. err = shrink_vma(vrm, /* drop_lock= */false);
  1223. if (err)
  1224. return err;
  1225. /* Set up for the move now shrink has been executed. */
  1226. vrm->old_len = vrm->new_len;
  1227. }
  1228. /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */
  1229. if (vrm->flags & MREMAP_DONTUNMAP) {
  1230. vm_flags_t vm_flags = vrm->vma->vm_flags;
  1231. unsigned long pages = vrm->old_len >> PAGE_SHIFT;
  1232. if (!may_expand_vm(mm, vm_flags, pages))
  1233. return -ENOMEM;
  1234. }
  1235. err = vrm_set_new_addr(vrm);
  1236. if (err)
  1237. return err;
  1238. return move_vma(vrm);
  1239. }
  1240. static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
  1241. {
  1242. unsigned long end = vma->vm_end + delta;
  1243. if (end < vma->vm_end) /* overflow */
  1244. return 0;
  1245. if (find_vma_intersection(vma->vm_mm, vma->vm_end, end))
  1246. return 0;
  1247. if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
  1248. 0, MAP_FIXED) & ~PAGE_MASK)
  1249. return 0;
  1250. return 1;
  1251. }
  1252. /* Determine whether we are actually able to execute an in-place expansion. */
  1253. static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm)
  1254. {
  1255. /* Number of bytes from vrm->addr to end of VMA. */
  1256. unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr;
  1257. /* If end of range aligns to end of VMA, we can just expand in-place. */
  1258. if (suffix_bytes != vrm->old_len)
  1259. return false;
  1260. /* Check whether this is feasible. */
  1261. if (!vma_expandable(vrm->vma, vrm->delta))
  1262. return false;
  1263. return true;
  1264. }
  1265. /*
  1266. * We know we can expand the VMA in-place by delta pages, so do so.
  1267. *
  1268. * If we discover the VMA is locked, update mm_struct statistics accordingly and
  1269. * indicate so to the caller.
  1270. */
  1271. static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm)
  1272. {
  1273. struct mm_struct *mm = current->mm;
  1274. struct vm_area_struct *vma = vrm->vma;
  1275. VMA_ITERATOR(vmi, mm, vma->vm_end);
  1276. if (!vrm_calc_charge(vrm))
  1277. return -ENOMEM;
  1278. /*
  1279. * Function vma_merge_extend() is called on the
  1280. * extension we are adding to the already existing vma,
  1281. * vma_merge_extend() will merge this extension with the
  1282. * already existing vma (expand operation itself) and
  1283. * possibly also with the next vma if it becomes
  1284. * adjacent to the expanded vma and otherwise
  1285. * compatible.
  1286. */
  1287. vma = vma_merge_extend(&vmi, vma, vrm->delta);
  1288. if (!vma) {
  1289. vrm_uncharge(vrm);
  1290. return -ENOMEM;
  1291. }
  1292. vrm->vma = vma;
  1293. vrm_stat_account(vrm, vrm->delta);
  1294. return 0;
  1295. }
  1296. static bool align_hugetlb(struct vma_remap_struct *vrm)
  1297. {
  1298. struct hstate *h __maybe_unused = hstate_vma(vrm->vma);
  1299. vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h));
  1300. vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h));
  1301. /* addrs must be huge page aligned */
  1302. if (vrm->addr & ~huge_page_mask(h))
  1303. return false;
  1304. if (vrm->new_addr & ~huge_page_mask(h))
  1305. return false;
  1306. /*
  1307. * Don't allow remap expansion, because the underlying hugetlb
  1308. * reservation is not yet capable to handle split reservation.
  1309. */
  1310. if (vrm->new_len > vrm->old_len)
  1311. return false;
  1312. return true;
  1313. }
  1314. /*
  1315. * We are mremap()'ing without specifying a fixed address to move to, but are
  1316. * requesting that the VMA's size be increased.
  1317. *
  1318. * Try to do so in-place, if this fails, then move the VMA to a new location to
  1319. * action the change.
  1320. */
  1321. static unsigned long expand_vma(struct vma_remap_struct *vrm)
  1322. {
  1323. unsigned long err;
  1324. /*
  1325. * [addr, old_len) spans precisely to the end of the VMA, so try to
  1326. * expand it in-place.
  1327. */
  1328. if (vrm_can_expand_in_place(vrm)) {
  1329. err = expand_vma_in_place(vrm);
  1330. if (err)
  1331. return err;
  1332. /* OK we're done! */
  1333. return vrm->addr;
  1334. }
  1335. /*
  1336. * We weren't able to just expand or shrink the area,
  1337. * we need to create a new one and move it.
  1338. */
  1339. /* We're not allowed to move the VMA, so error out. */
  1340. if (!(vrm->flags & MREMAP_MAYMOVE))
  1341. return -ENOMEM;
  1342. /* Find a new location to move the VMA to. */
  1343. err = vrm_set_new_addr(vrm);
  1344. if (err)
  1345. return err;
  1346. return move_vma(vrm);
  1347. }
  1348. /*
  1349. * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the
  1350. * first available address to perform the operation.
  1351. */
  1352. static unsigned long mremap_at(struct vma_remap_struct *vrm)
  1353. {
  1354. unsigned long res;
  1355. switch (vrm->remap_type) {
  1356. case MREMAP_INVALID:
  1357. break;
  1358. case MREMAP_NO_RESIZE:
  1359. /* NO-OP CASE - resizing to the same size. */
  1360. return vrm->addr;
  1361. case MREMAP_SHRINK:
  1362. /*
  1363. * SHRINK CASE. Can always be done in-place.
  1364. *
  1365. * Simply unmap the shrunken portion of the VMA. This does all
  1366. * the needed commit accounting, and we indicate that the mmap
  1367. * lock should be dropped.
  1368. */
  1369. res = shrink_vma(vrm, /* drop_lock= */true);
  1370. if (res)
  1371. return res;
  1372. return vrm->addr;
  1373. case MREMAP_EXPAND:
  1374. return expand_vma(vrm);
  1375. }
  1376. /* Should not be possible. */
  1377. WARN_ON_ONCE(1);
  1378. return -EINVAL;
  1379. }
  1380. /*
  1381. * Will this operation result in the VMA being expanded or moved and thus need
  1382. * to map a new portion of virtual address space?
  1383. */
  1384. static bool vrm_will_map_new(struct vma_remap_struct *vrm)
  1385. {
  1386. if (vrm->remap_type == MREMAP_EXPAND)
  1387. return true;
  1388. if (vrm_implies_new_addr(vrm))
  1389. return true;
  1390. return false;
  1391. }
  1392. /* Does this remap ONLY move mappings? */
  1393. static bool vrm_move_only(struct vma_remap_struct *vrm)
  1394. {
  1395. if (!(vrm->flags & MREMAP_FIXED))
  1396. return false;
  1397. if (vrm->old_len != vrm->new_len)
  1398. return false;
  1399. return true;
  1400. }
  1401. static void notify_uffd(struct vma_remap_struct *vrm, bool failed)
  1402. {
  1403. struct mm_struct *mm = current->mm;
  1404. /* Regardless of success/failure, we always notify of any unmaps. */
  1405. userfaultfd_unmap_complete(mm, vrm->uf_unmap_early);
  1406. if (failed)
  1407. mremap_userfaultfd_fail(vrm->uf);
  1408. else
  1409. mremap_userfaultfd_complete(vrm->uf, vrm->addr,
  1410. vrm->new_addr, vrm->old_len);
  1411. userfaultfd_unmap_complete(mm, vrm->uf_unmap);
  1412. }
  1413. static bool vma_multi_allowed(struct vm_area_struct *vma)
  1414. {
  1415. struct file *file = vma->vm_file;
  1416. /*
  1417. * We can't support moving multiple uffd VMAs as notify requires
  1418. * mmap lock to be dropped.
  1419. */
  1420. if (userfaultfd_armed(vma))
  1421. return false;
  1422. /*
  1423. * Custom get unmapped area might result in MREMAP_FIXED not
  1424. * being obeyed.
  1425. */
  1426. if (!file || !file->f_op->get_unmapped_area)
  1427. return true;
  1428. /* Known good. */
  1429. if (vma_is_shmem(vma))
  1430. return true;
  1431. if (is_vm_hugetlb_page(vma))
  1432. return true;
  1433. if (file->f_op->get_unmapped_area == thp_get_unmapped_area)
  1434. return true;
  1435. return false;
  1436. }
  1437. static int check_prep_vma(struct vma_remap_struct *vrm)
  1438. {
  1439. struct vm_area_struct *vma = vrm->vma;
  1440. struct mm_struct *mm = current->mm;
  1441. unsigned long addr = vrm->addr;
  1442. unsigned long old_len, new_len, pgoff;
  1443. if (!vma)
  1444. return -EFAULT;
  1445. /* If mseal()'d, mremap() is prohibited. */
  1446. if (vma_is_sealed(vma))
  1447. return -EPERM;
  1448. /* Align to hugetlb page size, if required. */
  1449. if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm))
  1450. return -EINVAL;
  1451. vrm_set_delta(vrm);
  1452. vrm->remap_type = vrm_remap_type(vrm);
  1453. /* For convenience, we set new_addr even if VMA won't move. */
  1454. if (!vrm_implies_new_addr(vrm))
  1455. vrm->new_addr = addr;
  1456. /* Below only meaningful if we expand or move a VMA. */
  1457. if (!vrm_will_map_new(vrm))
  1458. return 0;
  1459. old_len = vrm->old_len;
  1460. new_len = vrm->new_len;
  1461. /*
  1462. * !old_len is a special case where an attempt is made to 'duplicate'
  1463. * a mapping. This makes no sense for private mappings as it will
  1464. * instead create a fresh/new mapping unrelated to the original. This
  1465. * is contrary to the basic idea of mremap which creates new mappings
  1466. * based on the original. There are no known use cases for this
  1467. * behavior. As a result, fail such attempts.
  1468. */
  1469. if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) {
  1470. pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n",
  1471. current->comm, current->pid);
  1472. return -EINVAL;
  1473. }
  1474. if ((vrm->flags & MREMAP_DONTUNMAP) &&
  1475. (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
  1476. return -EINVAL;
  1477. /*
  1478. * We permit crossing of boundaries for the range being unmapped due to
  1479. * a shrink.
  1480. */
  1481. if (vrm->remap_type == MREMAP_SHRINK)
  1482. old_len = new_len;
  1483. /*
  1484. * We can't remap across the end of VMAs, as another VMA may be
  1485. * adjacent:
  1486. *
  1487. * addr vma->vm_end
  1488. * |-----.----------|
  1489. * | . |
  1490. * |-----.----------|
  1491. * .<--------->xxx>
  1492. * old_len
  1493. *
  1494. * We also require that vma->vm_start <= addr < vma->vm_end.
  1495. */
  1496. if (old_len > vma->vm_end - addr)
  1497. return -EFAULT;
  1498. if (new_len == old_len)
  1499. return 0;
  1500. /* We are expanding and the VMA is mlock()'d so we need to populate. */
  1501. if (vma->vm_flags & VM_LOCKED)
  1502. vrm->populate_expand = true;
  1503. /* Need to be careful about a growing mapping */
  1504. pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
  1505. pgoff += vma->vm_pgoff;
  1506. if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
  1507. return -EINVAL;
  1508. if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))
  1509. return -EFAULT;
  1510. if (!mlock_future_ok(mm, vma->vm_flags & VM_LOCKED, vrm->delta))
  1511. return -EAGAIN;
  1512. if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT))
  1513. return -ENOMEM;
  1514. return 0;
  1515. }
  1516. /*
  1517. * Are the parameters passed to mremap() valid? If so return 0, otherwise return
  1518. * error.
  1519. */
  1520. static unsigned long check_mremap_params(struct vma_remap_struct *vrm)
  1521. {
  1522. unsigned long addr = vrm->addr;
  1523. unsigned long flags = vrm->flags;
  1524. /* Ensure no unexpected flag values. */
  1525. if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
  1526. return -EINVAL;
  1527. /* Start address must be page-aligned. */
  1528. if (offset_in_page(addr))
  1529. return -EINVAL;
  1530. /*
  1531. * We allow a zero old-len as a special case
  1532. * for DOS-emu "duplicate shm area" thing. But
  1533. * a zero new-len is nonsensical.
  1534. */
  1535. if (!vrm->new_len)
  1536. return -EINVAL;
  1537. /* Is the new length silly? */
  1538. if (vrm->new_len > TASK_SIZE)
  1539. return -EINVAL;
  1540. /* Remainder of checks are for cases with specific new_addr. */
  1541. if (!vrm_implies_new_addr(vrm))
  1542. return 0;
  1543. /* Is the new address silly? */
  1544. if (vrm->new_addr > TASK_SIZE - vrm->new_len)
  1545. return -EINVAL;
  1546. /* The new address must be page-aligned. */
  1547. if (offset_in_page(vrm->new_addr))
  1548. return -EINVAL;
  1549. /* A fixed address implies a move. */
  1550. if (!(flags & MREMAP_MAYMOVE))
  1551. return -EINVAL;
  1552. /* MREMAP_DONTUNMAP does not allow resizing in the process. */
  1553. if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len)
  1554. return -EINVAL;
  1555. /* Target VMA must not overlap source VMA. */
  1556. if (vrm_overlaps(vrm))
  1557. return -EINVAL;
  1558. /*
  1559. * move_vma() need us to stay 4 maps below the threshold, otherwise
  1560. * it will bail out at the very beginning.
  1561. * That is a problem if we have already unmapped the regions here
  1562. * (new_addr, and old_addr), because userspace will not know the
  1563. * state of the vma's after it gets -ENOMEM.
  1564. * So, to avoid such scenario we can pre-compute if the whole
  1565. * operation has high chances to success map-wise.
  1566. * Worst-scenario case is when both vma's (new_addr and old_addr) get
  1567. * split in 3 before unmapping it.
  1568. * That means 2 more maps (1 for each) to the ones we already hold.
  1569. * Check whether current map count plus 2 still leads us to 4 maps below
  1570. * the threshold, otherwise return -ENOMEM here to be more safe.
  1571. */
  1572. if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3)
  1573. return -ENOMEM;
  1574. return 0;
  1575. }
  1576. static unsigned long remap_move(struct vma_remap_struct *vrm)
  1577. {
  1578. struct vm_area_struct *vma;
  1579. unsigned long start = vrm->addr;
  1580. unsigned long end = vrm->addr + vrm->old_len;
  1581. unsigned long new_addr = vrm->new_addr;
  1582. unsigned long target_addr = new_addr;
  1583. unsigned long res = -EFAULT;
  1584. unsigned long last_end;
  1585. bool seen_vma = false;
  1586. VMA_ITERATOR(vmi, current->mm, start);
  1587. /*
  1588. * When moving VMAs we allow for batched moves across multiple VMAs,
  1589. * with all VMAs in the input range [addr, addr + old_len) being moved
  1590. * (and split as necessary).
  1591. */
  1592. for_each_vma_range(vmi, vma, end) {
  1593. /* Account for start, end not aligned with VMA start, end. */
  1594. unsigned long addr = max(vma->vm_start, start);
  1595. unsigned long len = min(end, vma->vm_end) - addr;
  1596. unsigned long offset, res_vma;
  1597. bool multi_allowed;
  1598. /* No gap permitted at the start of the range. */
  1599. if (!seen_vma && start < vma->vm_start)
  1600. return -EFAULT;
  1601. /*
  1602. * To sensibly move multiple VMAs, accounting for the fact that
  1603. * get_unmapped_area() may align even MAP_FIXED moves, we simply
  1604. * attempt to move such that the gaps between source VMAs remain
  1605. * consistent in destination VMAs, e.g.:
  1606. *
  1607. * X Y X Y
  1608. * <---> <-> <---> <->
  1609. * |-------| |-----| |-----| |-------| |-----| |-----|
  1610. * | A | | B | | C | ---> | A' | | B' | | C' |
  1611. * |-------| |-----| |-----| |-------| |-----| |-----|
  1612. * new_addr
  1613. *
  1614. * So we map B' at A'->vm_end + X, and C' at B'->vm_end + Y.
  1615. */
  1616. offset = seen_vma ? vma->vm_start - last_end : 0;
  1617. last_end = vma->vm_end;
  1618. vrm->vma = vma;
  1619. vrm->addr = addr;
  1620. vrm->new_addr = target_addr + offset;
  1621. vrm->old_len = vrm->new_len = len;
  1622. multi_allowed = vma_multi_allowed(vma);
  1623. if (!multi_allowed) {
  1624. /* This is not the first VMA, abort immediately. */
  1625. if (seen_vma)
  1626. return -EFAULT;
  1627. /* This is the first, but there are more, abort. */
  1628. if (vma->vm_end < end)
  1629. return -EFAULT;
  1630. }
  1631. res_vma = check_prep_vma(vrm);
  1632. if (!res_vma)
  1633. res_vma = mremap_to(vrm);
  1634. if (IS_ERR_VALUE(res_vma))
  1635. return res_vma;
  1636. if (!seen_vma) {
  1637. VM_WARN_ON_ONCE(multi_allowed && res_vma != new_addr);
  1638. res = res_vma;
  1639. }
  1640. /* mmap lock is only dropped on shrink. */
  1641. VM_WARN_ON_ONCE(!vrm->mmap_locked);
  1642. /* This is a move, no expand should occur. */
  1643. VM_WARN_ON_ONCE(vrm->populate_expand);
  1644. if (vrm->vmi_needs_invalidate) {
  1645. vma_iter_invalidate(&vmi);
  1646. vrm->vmi_needs_invalidate = false;
  1647. }
  1648. seen_vma = true;
  1649. target_addr = res_vma + vrm->new_len;
  1650. }
  1651. return res;
  1652. }
  1653. static unsigned long do_mremap(struct vma_remap_struct *vrm)
  1654. {
  1655. struct mm_struct *mm = current->mm;
  1656. unsigned long res;
  1657. bool failed;
  1658. vrm->old_len = PAGE_ALIGN(vrm->old_len);
  1659. vrm->new_len = PAGE_ALIGN(vrm->new_len);
  1660. res = check_mremap_params(vrm);
  1661. if (res)
  1662. return res;
  1663. if (mmap_write_lock_killable(mm))
  1664. return -EINTR;
  1665. vrm->mmap_locked = true;
  1666. if (vrm_move_only(vrm)) {
  1667. res = remap_move(vrm);
  1668. } else {
  1669. vrm->vma = vma_lookup(current->mm, vrm->addr);
  1670. res = check_prep_vma(vrm);
  1671. if (res)
  1672. goto out;
  1673. /* Actually execute mremap. */
  1674. res = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm);
  1675. }
  1676. out:
  1677. failed = IS_ERR_VALUE(res);
  1678. if (vrm->mmap_locked)
  1679. mmap_write_unlock(mm);
  1680. /* VMA mlock'd + was expanded, so populated expanded region. */
  1681. if (!failed && vrm->populate_expand)
  1682. mm_populate(vrm->new_addr + vrm->old_len, vrm->delta);
  1683. notify_uffd(vrm, failed);
  1684. return res;
  1685. }
  1686. /*
  1687. * Expand (or shrink) an existing mapping, potentially moving it at the
  1688. * same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
  1689. *
  1690. * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
  1691. * This option implies MREMAP_MAYMOVE.
  1692. */
  1693. SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
  1694. unsigned long, new_len, unsigned long, flags,
  1695. unsigned long, new_addr)
  1696. {
  1697. struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
  1698. LIST_HEAD(uf_unmap_early);
  1699. LIST_HEAD(uf_unmap);
  1700. /*
  1701. * There is a deliberate asymmetry here: we strip the pointer tag
  1702. * from the old address but leave the new address alone. This is
  1703. * for consistency with mmap(), where we prevent the creation of
  1704. * aliasing mappings in userspace by leaving the tag bits of the
  1705. * mapping address intact. A non-zero tag will cause the subsequent
  1706. * range checks to reject the address as invalid.
  1707. *
  1708. * See Documentation/arch/arm64/tagged-address-abi.rst for more
  1709. * information.
  1710. */
  1711. struct vma_remap_struct vrm = {
  1712. .addr = untagged_addr(addr),
  1713. .old_len = old_len,
  1714. .new_len = new_len,
  1715. .flags = flags,
  1716. .new_addr = new_addr,
  1717. .uf = &uf,
  1718. .uf_unmap_early = &uf_unmap_early,
  1719. .uf_unmap = &uf_unmap,
  1720. .remap_type = MREMAP_INVALID, /* We set later. */
  1721. };
  1722. return do_mremap(&vrm);
  1723. }