pagewalk.c 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/pagewalk.h>
  3. #include <linux/highmem.h>
  4. #include <linux/sched.h>
  5. #include <linux/hugetlb.h>
  6. #include <linux/mmu_context.h>
  7. #include <linux/swap.h>
  8. #include <linux/leafops.h>
  9. #include <asm/tlbflush.h>
  10. #include "internal.h"
  11. /*
  12. * We want to know the real level where a entry is located ignoring any
  13. * folding of levels which may be happening. For example if p4d is folded then
  14. * a missing entry found at level 1 (p4d) is actually at level 0 (pgd).
  15. */
  16. static int real_depth(int depth)
  17. {
  18. if (depth == 3 && PTRS_PER_PMD == 1)
  19. depth = 2;
  20. if (depth == 2 && PTRS_PER_PUD == 1)
  21. depth = 1;
  22. if (depth == 1 && PTRS_PER_P4D == 1)
  23. depth = 0;
  24. return depth;
  25. }
  26. static int walk_pte_range_inner(pte_t *pte, unsigned long addr,
  27. unsigned long end, struct mm_walk *walk)
  28. {
  29. const struct mm_walk_ops *ops = walk->ops;
  30. int err = 0;
  31. for (;;) {
  32. if (ops->install_pte && pte_none(ptep_get(pte))) {
  33. pte_t new_pte;
  34. err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte,
  35. walk);
  36. if (err)
  37. break;
  38. set_pte_at(walk->mm, addr, pte, new_pte);
  39. /* Non-present before, so for arches that need it. */
  40. if (!WARN_ON_ONCE(walk->no_vma))
  41. update_mmu_cache(walk->vma, addr, pte);
  42. } else {
  43. err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
  44. if (err)
  45. break;
  46. }
  47. if (addr >= end - PAGE_SIZE)
  48. break;
  49. addr += PAGE_SIZE;
  50. pte++;
  51. }
  52. return err;
  53. }
  54. static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
  55. struct mm_walk *walk)
  56. {
  57. pte_t *pte;
  58. int err = 0;
  59. spinlock_t *ptl;
  60. if (walk->no_vma) {
  61. /*
  62. * pte_offset_map() might apply user-specific validation.
  63. * Indeed, on x86_64 the pmd entries set up by init_espfix_ap()
  64. * fit its pmd_bad() check (_PAGE_NX set and _PAGE_RW clear),
  65. * and CONFIG_EFI_PGT_DUMP efi_mm goes so far as to walk them.
  66. */
  67. if (walk->mm == &init_mm || addr >= TASK_SIZE)
  68. pte = pte_offset_kernel(pmd, addr);
  69. else
  70. pte = pte_offset_map(pmd, addr);
  71. if (pte) {
  72. err = walk_pte_range_inner(pte, addr, end, walk);
  73. if (walk->mm != &init_mm && addr < TASK_SIZE)
  74. pte_unmap(pte);
  75. }
  76. } else {
  77. pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
  78. if (pte) {
  79. err = walk_pte_range_inner(pte, addr, end, walk);
  80. pte_unmap_unlock(pte, ptl);
  81. }
  82. }
  83. if (!pte)
  84. walk->action = ACTION_AGAIN;
  85. return err;
  86. }
  87. static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
  88. struct mm_walk *walk)
  89. {
  90. pud_t pudval = pudp_get(pud);
  91. pmd_t *pmd;
  92. unsigned long next;
  93. const struct mm_walk_ops *ops = walk->ops;
  94. bool has_handler = ops->pte_entry;
  95. bool has_install = ops->install_pte;
  96. int err = 0;
  97. int depth = real_depth(3);
  98. /*
  99. * For PTE handling, pte_offset_map_lock() takes care of checking
  100. * whether there actually is a page table. But it also has to be
  101. * very careful about concurrent page table reclaim.
  102. *
  103. * Similarly, we have to be careful here - a PUD entry that points
  104. * to a PMD table cannot go away, so we can just walk it. But if
  105. * it's something else, we need to ensure we didn't race something,
  106. * so need to retry.
  107. *
  108. * A pertinent example of this is a PUD refault after PUD split -
  109. * we will need to split again or risk accessing invalid memory.
  110. */
  111. if (!pud_present(pudval) || pud_leaf(pudval)) {
  112. walk->action = ACTION_AGAIN;
  113. return 0;
  114. }
  115. pmd = pmd_offset(pud, addr);
  116. do {
  117. again:
  118. next = pmd_addr_end(addr, end);
  119. if (pmd_none(*pmd)) {
  120. if (has_install)
  121. err = __pte_alloc(walk->mm, pmd);
  122. else if (ops->pte_hole)
  123. err = ops->pte_hole(addr, next, depth, walk);
  124. if (err)
  125. break;
  126. if (!has_install)
  127. continue;
  128. }
  129. walk->action = ACTION_SUBTREE;
  130. /*
  131. * This implies that each ->pmd_entry() handler
  132. * needs to know about pmd_trans_huge() pmds
  133. */
  134. if (ops->pmd_entry)
  135. err = ops->pmd_entry(pmd, addr, next, walk);
  136. if (err)
  137. break;
  138. if (walk->action == ACTION_AGAIN)
  139. goto again;
  140. if (walk->action == ACTION_CONTINUE)
  141. continue;
  142. if (!has_handler) { /* No handlers for lower page tables. */
  143. if (!has_install)
  144. continue; /* Nothing to do. */
  145. /*
  146. * We are ONLY installing, so avoid unnecessarily
  147. * splitting a present huge page.
  148. */
  149. if (pmd_present(*pmd) && pmd_trans_huge(*pmd))
  150. continue;
  151. }
  152. if (walk->vma)
  153. split_huge_pmd(walk->vma, pmd, addr);
  154. else if (pmd_leaf(*pmd) || !pmd_present(*pmd))
  155. continue; /* Nothing to do. */
  156. err = walk_pte_range(pmd, addr, next, walk);
  157. if (err)
  158. break;
  159. if (walk->action == ACTION_AGAIN)
  160. goto again;
  161. } while (pmd++, addr = next, addr != end);
  162. return err;
  163. }
  164. static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
  165. struct mm_walk *walk)
  166. {
  167. pud_t *pud;
  168. unsigned long next;
  169. const struct mm_walk_ops *ops = walk->ops;
  170. bool has_handler = ops->pmd_entry || ops->pte_entry;
  171. bool has_install = ops->install_pte;
  172. int err = 0;
  173. int depth = real_depth(2);
  174. pud = pud_offset(p4d, addr);
  175. do {
  176. again:
  177. next = pud_addr_end(addr, end);
  178. if (pud_none(*pud)) {
  179. if (has_install)
  180. err = __pmd_alloc(walk->mm, pud, addr);
  181. else if (ops->pte_hole)
  182. err = ops->pte_hole(addr, next, depth, walk);
  183. if (err)
  184. break;
  185. if (!has_install)
  186. continue;
  187. }
  188. walk->action = ACTION_SUBTREE;
  189. if (ops->pud_entry)
  190. err = ops->pud_entry(pud, addr, next, walk);
  191. if (err)
  192. break;
  193. if (walk->action == ACTION_AGAIN)
  194. goto again;
  195. if (walk->action == ACTION_CONTINUE)
  196. continue;
  197. if (!has_handler) { /* No handlers for lower page tables. */
  198. if (!has_install)
  199. continue; /* Nothing to do. */
  200. /*
  201. * We are ONLY installing, so avoid unnecessarily
  202. * splitting a present huge page.
  203. */
  204. if (pud_present(*pud) && pud_trans_huge(*pud))
  205. continue;
  206. }
  207. if (walk->vma)
  208. split_huge_pud(walk->vma, pud, addr);
  209. else if (pud_leaf(*pud) || !pud_present(*pud))
  210. continue; /* Nothing to do. */
  211. err = walk_pmd_range(pud, addr, next, walk);
  212. if (err)
  213. break;
  214. if (walk->action == ACTION_AGAIN)
  215. goto again;
  216. } while (pud++, addr = next, addr != end);
  217. return err;
  218. }
  219. static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
  220. struct mm_walk *walk)
  221. {
  222. p4d_t *p4d;
  223. unsigned long next;
  224. const struct mm_walk_ops *ops = walk->ops;
  225. bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry;
  226. bool has_install = ops->install_pte;
  227. int err = 0;
  228. int depth = real_depth(1);
  229. p4d = p4d_offset(pgd, addr);
  230. do {
  231. next = p4d_addr_end(addr, end);
  232. if (p4d_none_or_clear_bad(p4d)) {
  233. if (has_install)
  234. err = __pud_alloc(walk->mm, p4d, addr);
  235. else if (ops->pte_hole)
  236. err = ops->pte_hole(addr, next, depth, walk);
  237. if (err)
  238. break;
  239. if (!has_install)
  240. continue;
  241. }
  242. if (ops->p4d_entry) {
  243. err = ops->p4d_entry(p4d, addr, next, walk);
  244. if (err)
  245. break;
  246. }
  247. if (has_handler || has_install)
  248. err = walk_pud_range(p4d, addr, next, walk);
  249. if (err)
  250. break;
  251. } while (p4d++, addr = next, addr != end);
  252. return err;
  253. }
  254. static int walk_pgd_range(unsigned long addr, unsigned long end,
  255. struct mm_walk *walk)
  256. {
  257. pgd_t *pgd;
  258. unsigned long next;
  259. const struct mm_walk_ops *ops = walk->ops;
  260. bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry ||
  261. ops->pte_entry;
  262. bool has_install = ops->install_pte;
  263. int err = 0;
  264. if (walk->pgd)
  265. pgd = walk->pgd + pgd_index(addr);
  266. else
  267. pgd = pgd_offset(walk->mm, addr);
  268. do {
  269. next = pgd_addr_end(addr, end);
  270. if (pgd_none_or_clear_bad(pgd)) {
  271. if (has_install)
  272. err = __p4d_alloc(walk->mm, pgd, addr);
  273. else if (ops->pte_hole)
  274. err = ops->pte_hole(addr, next, 0, walk);
  275. if (err)
  276. break;
  277. if (!has_install)
  278. continue;
  279. }
  280. if (ops->pgd_entry) {
  281. err = ops->pgd_entry(pgd, addr, next, walk);
  282. if (err)
  283. break;
  284. }
  285. if (has_handler || has_install)
  286. err = walk_p4d_range(pgd, addr, next, walk);
  287. if (err)
  288. break;
  289. } while (pgd++, addr = next, addr != end);
  290. return err;
  291. }
  292. #ifdef CONFIG_HUGETLB_PAGE
  /*
   * Return the end of the huge page entry containing @addr, clamped so that
   * it never exceeds @end.
   */
  static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
  				       unsigned long end)
  {
  	unsigned long entry_start = addr & huge_page_mask(h);
  	unsigned long entry_end = entry_start + huge_page_size(h);

  	return entry_end < end ? entry_end : end;
  }
  299. static int walk_hugetlb_range(unsigned long addr, unsigned long end,
  300. struct mm_walk *walk)
  301. {
  302. struct vm_area_struct *vma = walk->vma;
  303. struct hstate *h = hstate_vma(vma);
  304. unsigned long next;
  305. unsigned long hmask = huge_page_mask(h);
  306. unsigned long sz = huge_page_size(h);
  307. pte_t *pte;
  308. const struct mm_walk_ops *ops = walk->ops;
  309. int err = 0;
  310. hugetlb_vma_lock_read(vma);
  311. do {
  312. next = hugetlb_entry_end(h, addr, end);
  313. pte = hugetlb_walk(vma, addr & hmask, sz);
  314. if (pte)
  315. err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
  316. else if (ops->pte_hole)
  317. err = ops->pte_hole(addr, next, -1, walk);
  318. if (err)
  319. break;
  320. } while (addr = next, addr != end);
  321. hugetlb_vma_unlock_read(vma);
  322. return err;
  323. }
  324. #else /* CONFIG_HUGETLB_PAGE */
  /* Without hugetlb support there is nothing to walk: report success. */
  static int walk_hugetlb_range(unsigned long addr, unsigned long end,
  			      struct mm_walk *walk)
  {
  	return 0;
  }
  330. #endif /* CONFIG_HUGETLB_PAGE */
  331. /*
  332. * Decide whether we really walk over the current vma on [@start, @end)
  333. * or skip it via the returned value. Return 0 if we do walk over the
  334. * current vma, and return 1 if we skip the vma. Negative values means
  335. * error, where we abort the current walk.
  336. */
  337. static int walk_page_test(unsigned long start, unsigned long end,
  338. struct mm_walk *walk)
  339. {
  340. struct vm_area_struct *vma = walk->vma;
  341. const struct mm_walk_ops *ops = walk->ops;
  342. if (ops->test_walk)
  343. return ops->test_walk(start, end, walk);
  344. /*
  345. * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
  346. * range, so we don't walk over it as we do for normal vmas. However,
  347. * Some callers are interested in handling hole range and they don't
  348. * want to just ignore any single address range. Such users certainly
  349. * define their ->pte_hole() callbacks, so let's delegate them to handle
  350. * vma(VM_PFNMAP).
  351. */
  352. if (vma->vm_flags & VM_PFNMAP) {
  353. int err = 1;
  354. if (ops->pte_hole)
  355. err = ops->pte_hole(start, end, -1, walk);
  356. return err ? err : 1;
  357. }
  358. return 0;
  359. }
  360. static int __walk_page_range(unsigned long start, unsigned long end,
  361. struct mm_walk *walk)
  362. {
  363. int err = 0;
  364. struct vm_area_struct *vma = walk->vma;
  365. const struct mm_walk_ops *ops = walk->ops;
  366. bool is_hugetlb = is_vm_hugetlb_page(vma);
  367. /* We do not support hugetlb PTE installation. */
  368. if (ops->install_pte && is_hugetlb)
  369. return -EINVAL;
  370. if (ops->pre_vma) {
  371. err = ops->pre_vma(start, end, walk);
  372. if (err)
  373. return err;
  374. }
  375. if (is_hugetlb) {
  376. if (ops->hugetlb_entry)
  377. err = walk_hugetlb_range(start, end, walk);
  378. } else
  379. err = walk_pgd_range(start, end, walk);
  380. if (ops->post_vma)
  381. ops->post_vma(walk);
  382. return err;
  383. }
  384. static inline void process_mm_walk_lock(struct mm_struct *mm,
  385. enum page_walk_lock walk_lock)
  386. {
  387. if (walk_lock == PGWALK_RDLOCK)
  388. mmap_assert_locked(mm);
  389. else if (walk_lock != PGWALK_VMA_RDLOCK_VERIFY)
  390. mmap_assert_write_locked(mm);
  391. }
  392. static inline void process_vma_walk_lock(struct vm_area_struct *vma,
  393. enum page_walk_lock walk_lock)
  394. {
  395. #ifdef CONFIG_PER_VMA_LOCK
  396. switch (walk_lock) {
  397. case PGWALK_WRLOCK:
  398. vma_start_write(vma);
  399. break;
  400. case PGWALK_WRLOCK_VERIFY:
  401. vma_assert_write_locked(vma);
  402. break;
  403. case PGWALK_VMA_RDLOCK_VERIFY:
  404. vma_assert_locked(vma);
  405. break;
  406. case PGWALK_RDLOCK:
  407. /* PGWALK_RDLOCK is handled by process_mm_walk_lock */
  408. break;
  409. }
  410. #endif
  411. }
  412. /*
  413. * See the comment for walk_page_range(), this performs the heavy lifting of the
  414. * operation, only sets no restrictions on how the walk proceeds.
  415. *
  416. * We usually restrict the ability to install PTEs, but this functionality is
  417. * available to internal memory management code and provided in mm/internal.h.
  418. */
  419. int walk_page_range_mm_unsafe(struct mm_struct *mm, unsigned long start,
  420. unsigned long end, const struct mm_walk_ops *ops,
  421. void *private)
  422. {
  423. int err = 0;
  424. unsigned long next;
  425. struct vm_area_struct *vma;
  426. struct mm_walk walk = {
  427. .ops = ops,
  428. .mm = mm,
  429. .private = private,
  430. };
  431. if (start >= end)
  432. return -EINVAL;
  433. if (!walk.mm)
  434. return -EINVAL;
  435. process_mm_walk_lock(walk.mm, ops->walk_lock);
  436. vma = find_vma(walk.mm, start);
  437. do {
  438. if (!vma) { /* after the last vma */
  439. walk.vma = NULL;
  440. next = end;
  441. if (ops->pte_hole)
  442. err = ops->pte_hole(start, next, -1, &walk);
  443. } else if (start < vma->vm_start) { /* outside vma */
  444. walk.vma = NULL;
  445. next = min(end, vma->vm_start);
  446. if (ops->pte_hole)
  447. err = ops->pte_hole(start, next, -1, &walk);
  448. } else { /* inside vma */
  449. process_vma_walk_lock(vma, ops->walk_lock);
  450. walk.vma = vma;
  451. next = min(end, vma->vm_end);
  452. vma = find_vma(mm, vma->vm_end);
  453. err = walk_page_test(start, next, &walk);
  454. if (err > 0) {
  455. /*
  456. * positive return values are purely for
  457. * controlling the pagewalk, so should never
  458. * be passed to the callers.
  459. */
  460. err = 0;
  461. continue;
  462. }
  463. if (err < 0)
  464. break;
  465. err = __walk_page_range(start, next, &walk);
  466. }
  467. if (err)
  468. break;
  469. } while (start = next, start < end);
  470. return err;
  471. }
  472. /*
  473. * Determine if the walk operations specified are permitted to be used for a
  474. * page table walk.
  475. *
  476. * This check is performed on all functions which are parameterised by walk
  477. * operations and exposed in include/linux/pagewalk.h.
  478. *
  479. * Internal memory management code can use *_unsafe() functions to be able to
  480. * use all page walking operations.
  481. */
  482. static bool check_ops_safe(const struct mm_walk_ops *ops)
  483. {
  484. /*
  485. * The installation of PTEs is solely under the control of memory
  486. * management logic and subject to many subtle locking, security and
  487. * cache considerations so we cannot permit other users to do so, and
  488. * certainly not for exported symbols.
  489. */
  490. if (ops->install_pte)
  491. return false;
  492. return true;
  493. }
  494. /**
  495. * walk_page_range - walk page table with caller specific callbacks
  496. * @mm: mm_struct representing the target process of page table walk
  497. * @start: start address of the virtual address range
  498. * @end: end address of the virtual address range
  499. * @ops: operation to call during the walk
  500. * @private: private data for callbacks' usage
  501. *
  502. * Recursively walk the page table tree of the process represented by @mm
  503. * within the virtual address range [@start, @end). During walking, we can do
  504. * some caller-specific works for each entry, by setting up pmd_entry(),
  505. * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
  506. * callbacks, the associated entries/pages are just ignored.
  507. * The return values of these callbacks are commonly defined like below:
  508. *
  509. * - 0 : succeeded to handle the current entry, and if you don't reach the
  510. * end address yet, continue to walk.
  511. * - >0 : succeeded to handle the current entry, and return to the caller
  512. * with caller specific value.
  513. * - <0 : failed to handle the current entry, and return to the caller
  514. * with error code.
  515. *
  516. * Before starting to walk page table, some callers want to check whether
  517. * they really want to walk over the current vma, typically by checking
  518. * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
  519. * purpose.
  520. *
  521. * If operations need to be staged before and committed after a vma is walked,
  522. * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(),
  523. * since it is intended to handle commit-type operations, can't return any
  524. * errors.
  525. *
  526. * struct mm_walk keeps current values of some common data like vma and pmd,
  527. * which are useful for the access from callbacks. If you want to pass some
  528. * caller-specific data to callbacks, @private should be helpful.
  529. *
  530. * Locking:
  531. * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
  532. * because these function traverse vma list and/or access to vma's data.
  533. */
  534. int walk_page_range(struct mm_struct *mm, unsigned long start,
  535. unsigned long end, const struct mm_walk_ops *ops,
  536. void *private)
  537. {
  538. if (!check_ops_safe(ops))
  539. return -EINVAL;
  540. return walk_page_range_mm_unsafe(mm, start, end, ops, private);
  541. }
  542. /**
  543. * walk_kernel_page_table_range - walk a range of kernel pagetables.
  544. * @start: start address of the virtual address range
  545. * @end: end address of the virtual address range
  546. * @ops: operation to call during the walk
  547. * @pgd: pgd to walk if different from mm->pgd
  548. * @private: private data for callbacks' usage
  549. *
  550. * Similar to walk_page_range() but can walk any page tables even if they are
  551. * not backed by VMAs. Because 'unusual' entries may be walked this function
  552. * will also not lock the PTEs for the pte_entry() callback. This is useful for
  553. * walking kernel pages tables or page tables for firmware.
  554. *
  555. * Note: Be careful to walk the kernel pages tables, the caller may be need to
  556. * take other effective approaches (mmap lock may be insufficient) to prevent
  557. * the intermediate kernel page tables belonging to the specified address range
  558. * from being freed (e.g. memory hot-remove).
  559. */
  560. int walk_kernel_page_table_range(unsigned long start, unsigned long end,
  561. const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
  562. {
  563. /*
  564. * Kernel intermediate page tables are usually not freed, so the mmap
  565. * read lock is sufficient. But there are some exceptions.
  566. * E.g. memory hot-remove. In which case, the mmap lock is insufficient
  567. * to prevent the intermediate kernel pages tables belonging to the
  568. * specified address range from being freed. The caller should take
  569. * other actions to prevent this race.
  570. */
  571. mmap_assert_locked(&init_mm);
  572. return walk_kernel_page_table_range_lockless(start, end, ops, pgd,
  573. private);
  574. }
  575. /*
  576. * Use this function to walk the kernel page tables locklessly. It should be
  577. * guaranteed that the caller has exclusive access over the range they are
  578. * operating on - that there should be no concurrent access, for example,
  579. * changing permissions for vmalloc objects.
  580. */
  581. int walk_kernel_page_table_range_lockless(unsigned long start, unsigned long end,
  582. const struct mm_walk_ops *ops, pgd_t *pgd, void *private)
  583. {
  584. struct mm_walk walk = {
  585. .ops = ops,
  586. .mm = &init_mm,
  587. .pgd = pgd,
  588. .private = private,
  589. .no_vma = true
  590. };
  591. if (start >= end)
  592. return -EINVAL;
  593. if (!check_ops_safe(ops))
  594. return -EINVAL;
  595. return walk_pgd_range(start, end, &walk);
  596. }
  597. /**
  598. * walk_page_range_debug - walk a range of pagetables not backed by a vma
  599. * @mm: mm_struct representing the target process of page table walk
  600. * @start: start address of the virtual address range
  601. * @end: end address of the virtual address range
  602. * @ops: operation to call during the walk
  603. * @pgd: pgd to walk if different from mm->pgd
  604. * @private: private data for callbacks' usage
  605. *
  606. * Similar to walk_page_range() but can walk any page tables even if they are
  607. * not backed by VMAs. Because 'unusual' entries may be walked this function
  608. * will also not lock the PTEs for the pte_entry() callback.
  609. *
  610. * This is for debugging purposes ONLY.
  611. */
  612. int walk_page_range_debug(struct mm_struct *mm, unsigned long start,
  613. unsigned long end, const struct mm_walk_ops *ops,
  614. pgd_t *pgd, void *private)
  615. {
  616. struct mm_walk walk = {
  617. .ops = ops,
  618. .mm = mm,
  619. .pgd = pgd,
  620. .private = private,
  621. .no_vma = true
  622. };
  623. /* For convenience, we allow traversal of kernel mappings. */
  624. if (mm == &init_mm)
  625. return walk_kernel_page_table_range(start, end, ops,
  626. pgd, private);
  627. if (start >= end || !walk.mm)
  628. return -EINVAL;
  629. if (!check_ops_safe(ops))
  630. return -EINVAL;
  631. /*
  632. * The mmap lock protects the page walker from changes to the page
  633. * tables during the walk. However a read lock is insufficient to
  634. * protect those areas which don't have a VMA as munmap() detaches
  635. * the VMAs before downgrading to a read lock and actually tearing
  636. * down PTEs/page tables. In which case, the mmap write lock should
  637. * be held.
  638. */
  639. mmap_assert_write_locked(mm);
  640. return walk_pgd_range(start, end, &walk);
  641. }
  642. int walk_page_range_vma_unsafe(struct vm_area_struct *vma, unsigned long start,
  643. unsigned long end, const struct mm_walk_ops *ops, void *private)
  644. {
  645. struct mm_walk walk = {
  646. .ops = ops,
  647. .mm = vma->vm_mm,
  648. .vma = vma,
  649. .private = private,
  650. };
  651. if (start >= end || !walk.mm)
  652. return -EINVAL;
  653. if (start < vma->vm_start || end > vma->vm_end)
  654. return -EINVAL;
  655. process_mm_walk_lock(walk.mm, ops->walk_lock);
  656. process_vma_walk_lock(vma, ops->walk_lock);
  657. return __walk_page_range(start, end, &walk);
  658. }
  659. int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
  660. unsigned long end, const struct mm_walk_ops *ops,
  661. void *private)
  662. {
  663. if (!check_ops_safe(ops))
  664. return -EINVAL;
  665. return walk_page_range_vma_unsafe(vma, start, end, ops, private);
  666. }
  667. int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
  668. void *private)
  669. {
  670. struct mm_walk walk = {
  671. .ops = ops,
  672. .mm = vma->vm_mm,
  673. .vma = vma,
  674. .private = private,
  675. };
  676. if (!walk.mm)
  677. return -EINVAL;
  678. if (!check_ops_safe(ops))
  679. return -EINVAL;
  680. process_mm_walk_lock(walk.mm, ops->walk_lock);
  681. process_vma_walk_lock(vma, ops->walk_lock);
  682. return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
  683. }
  684. /**
  685. * walk_page_mapping - walk all memory areas mapped into a struct address_space.
  686. * @mapping: Pointer to the struct address_space
  687. * @first_index: First page offset in the address_space
  688. * @nr: Number of incremental page offsets to cover
  689. * @ops: operation to call during the walk
  690. * @private: private data for callbacks' usage
  691. *
  692. * This function walks all memory areas mapped into a struct address_space.
  693. * The walk is limited to only the given page-size index range, but if
  694. * the index boundaries cross a huge page-table entry, that entry will be
  695. * included.
  696. *
  697. * Also see walk_page_range() for additional information.
  698. *
  699. * Locking:
  700. * This function can't require that the struct mm_struct::mmap_lock is held,
  701. * since @mapping may be mapped by multiple processes. Instead
  702. * @mapping->i_mmap_rwsem must be held. This might have implications in the
  703. * callbacks, and it's up tho the caller to ensure that the
  704. * struct mm_struct::mmap_lock is not needed.
  705. *
  706. * Also this means that a caller can't rely on the struct
  707. * vm_area_struct::vm_flags to be constant across a call,
  708. * except for immutable flags. Callers requiring this shouldn't use
  709. * this function.
  710. *
  711. * Return: 0 on success, negative error code on failure, positive number on
  712. * caller defined premature termination.
  713. */
  714. int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
  715. pgoff_t nr, const struct mm_walk_ops *ops,
  716. void *private)
  717. {
  718. struct mm_walk walk = {
  719. .ops = ops,
  720. .private = private,
  721. };
  722. struct vm_area_struct *vma;
  723. pgoff_t vba, vea, cba, cea;
  724. unsigned long start_addr, end_addr;
  725. int err = 0;
  726. if (!check_ops_safe(ops))
  727. return -EINVAL;
  728. lockdep_assert_held(&mapping->i_mmap_rwsem);
  729. vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index,
  730. first_index + nr - 1) {
  731. /* Clip to the vma */
  732. vba = vma->vm_pgoff;
  733. vea = vba + vma_pages(vma);
  734. cba = first_index;
  735. cba = max(cba, vba);
  736. cea = first_index + nr;
  737. cea = min(cea, vea);
  738. start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start;
  739. end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start;
  740. if (start_addr >= end_addr)
  741. continue;
  742. walk.vma = vma;
  743. walk.mm = vma->vm_mm;
  744. err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
  745. if (err > 0) {
  746. err = 0;
  747. break;
  748. } else if (err < 0)
  749. break;
  750. err = __walk_page_range(start_addr, end_addr, &walk);
  751. if (err)
  752. break;
  753. }
  754. return err;
  755. }
  756. /**
  757. * folio_walk_start - walk the page tables to a folio
  758. * @fw: filled with information on success.
  759. * @vma: the VMA.
  760. * @addr: the virtual address to use for the page table walk.
  761. * @flags: flags modifying which folios to walk to.
       *         The body tests FW_ZEROPAGE and FW_MIGRATION.
  762. *
  763. * Walk the page tables using @addr in a given @vma to a mapped folio and
  764. * return the folio, making sure that the page table entry referenced by
  765. * @addr cannot change until folio_walk_end() was called.
  766. *
  767. * By default, this function returns only folios that are not special (e.g., not
  768. * the zeropage) and never returns folios that are supposed to be ignored by the
  769. * VM as documented by vm_normal_page(). If requested, zeropages will be
  770. * returned as well.
  771. *
  772. * By default, this function only considers present page table entries.
  773. * If requested, it will also consider migration entries.
  774. *
  775. * If this function returns NULL it might either indicate "there is nothing" or
  776. * "there is nothing suitable".
       *
       * On the NULL return paths this function has already released any page
       * table lock it explicitly took and has called vma_pgtable_walk_end().
       * Note that @fw may nevertheless have been partially written (level,
       * entry pointer and entry value) and must not be interpreted in that case.
  777. *
  778. * On success, @fw is filled and the function returns the folio while the PTL
  779. * is still held and folio_walk_end() must be called to clean up,
  780. * releasing any held locks. The returned folio must *not* be used after the
  781. * call to folio_walk_end(), unless a short-term folio reference is taken before
  782. * that call.
  783. *
  784. * @fw->page will correspond to the page that is effectively referenced by
  785. * @addr. However, for migration entries and shared zeropages @fw->page is
  786. * set to NULL. Note that large folios might be mapped by multiple page table
  787. * entries, and this function will always only lookup a single entry as
  788. * specified by @addr, which might or might not cover more than a single page of
  789. * the returned folio.
  790. *
  791. * This function must *not* be used as a naive replacement for
  792. * get_user_pages() / pin_user_pages(), especially not to perform DMA or
  793. * to carelessly modify page content. This function may *only* be used to grab
  794. * short-term folio references, never to grab long-term folio references.
  795. *
  796. * Using the page table entry pointers in @fw for reading or modifying the
  797. * entry should be avoided where possible: however, there might be valid
  798. * use cases.
  799. *
  800. * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
  801. * For example, PMD page table sharing might require prior unsharing. Also,
  802. * logical hugetlb entries might span multiple physical page table entries,
  803. * which *must* be modified in a single operation (set_huge_pte_at(),
  804. * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
  805. * not correspond to the first physical entry of a logical hugetlb entry.
  806. *
  807. * The mmap lock must be held in read mode.
  808. *
  809. * Return: folio pointer on success, otherwise NULL.
  810. */
  811. struct folio *folio_walk_start(struct folio_walk *fw,
  812. struct vm_area_struct *vma, unsigned long addr,
  813. folio_walk_flags_t flags)
  814. {
       /* Size of the entry the walk stopped at: PUD_SIZE, PMD_SIZE or PAGE_SIZE. */
  815. unsigned long entry_size;
       /* Cleared for zeropages and migration entries so that fw->page ends up NULL. */
  816. bool expose_page = true;
  817. struct page *page;
  818. pud_t *pudp, pud;
  819. pmd_t *pmdp, pmd;
  820. pte_t *ptep, pte;
  821. spinlock_t *ptl;
  822. pgd_t *pgdp;
  823. p4d_t *p4dp;
       /*
        * Assert the locking precondition, then open the walk on @vma. The
        * matching vma_pgtable_walk_end() in this function runs only on the
        * not_found path.
        */
  824. mmap_assert_locked(vma->vm_mm);
  825. vma_pgtable_walk_begin(vma);
       /* @addr outside [vm_start, vm_end): warn once and return NULL. */
  826. if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
  827. goto not_found;
       /*
        * Descend PGD -> P4D -> PUD, giving up as soon as a helper reports
        * the level as none (or bad).
        */
  828. pgdp = pgd_offset(vma->vm_mm, addr);
  829. if (pgd_none_or_clear_bad(pgdp))
  830. goto not_found;
  831. p4dp = p4d_offset(pgdp, addr);
  832. if (p4d_none_or_clear_bad(p4dp))
  833. goto not_found;
  834. pudp = pud_offset(p4dp, addr);
  835. pud = pudp_get(pudp);
  836. if (pud_none(pud))
  837. goto not_found;
       /*
        * With CONFIG_PGTABLE_HAS_HUGE_LEAVES, a PUD that is non-present or a
        * leaf is handled at this level, under the PUD lock.
        */
  838. if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
  839. (!pud_present(pud) || pud_leaf(pud))) {
  840. ptl = pud_lock(vma->vm_mm, pudp);
       /* Re-read under the lock: the value above was read without it. */
  841. pud = pudp_get(pudp);
       /* Record this level in @fw up front, even if we bail out or descend. */
  842. entry_size = PUD_SIZE;
  843. fw->level = FW_LEVEL_PUD;
  844. fw->pudp = pudp;
  845. fw->pud = pud;
       /*
        * Classify the locked value:
        *  - none: the entry went away since the unlocked read;
        *  - present non-leaf: it is a table, continue at the PMD level;
        *  - present leaf: succeed only if vm_normal_page_pud() yields a
        *    page, in which case the lock stays held.
        * Everything else falls through to unlock + not_found.
        */
  846. if (pud_none(pud)) {
  847. spin_unlock(ptl);
  848. goto not_found;
  849. } else if (pud_present(pud) && !pud_leaf(pud)) {
  850. spin_unlock(ptl);
  851. goto pmd_table;
  852. } else if (pud_present(pud)) {
  853. page = vm_normal_page_pud(vma, addr, pud);
  854. if (page)
  855. goto found;
  856. }
  857. /*
  858. * TODO: FW_MIGRATION support for PUD migration entries
  859. * once there are relevant users.
  860. */
  861. spin_unlock(ptl);
  862. goto not_found;
  863. }
  864. pmd_table:
       /* From here on the PUD is expected to be a present, non-leaf entry. */
  865. VM_WARN_ON_ONCE(!pud_present(pud) || pud_leaf(pud));
  866. pmdp = pmd_offset(pudp, addr);
       /* First look at the PMD without taking its lock. */
  867. pmd = pmdp_get_lockless(pmdp);
  868. if (pmd_none(pmd))
  869. goto not_found;
       /* Same scheme as for the PUD: lock, re-read, record in @fw, classify. */
  870. if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) &&
  871. (!pmd_present(pmd) || pmd_leaf(pmd))) {
  872. ptl = pmd_lock(vma->vm_mm, pmdp);
  873. pmd = pmdp_get(pmdp);
  874. entry_size = PMD_SIZE;
  875. fw->level = FW_LEVEL_PMD;
  876. fw->pmdp = pmdp;
  877. fw->pmd = pmd;
  878. if (pmd_none(pmd)) {
  879. spin_unlock(ptl);
  880. goto not_found;
  881. } else if (pmd_present(pmd) && !pmd_leaf(pmd)) {
       /* It is a table: drop the PMD lock and continue at the PTE level. */
  882. spin_unlock(ptl);
  883. goto pte_table;
  884. } else if (pmd_present(pmd)) {
       /*
        * Present leaf: prefer a normal page. Without one, the huge
        * zeropage is returned only when FW_ZEROPAGE was passed, and
        * then fw->page will be NULL.
        */
  885. page = vm_normal_page_pmd(vma, addr, pmd);
  886. if (page) {
  887. goto found;
  888. } else if ((flags & FW_ZEROPAGE) &&
  889. is_huge_zero_pmd(pmd)) {
  890. page = pfn_to_page(pmd_pfn(pmd));
  891. expose_page = false;
  892. goto found;
  893. }
  894. } else if ((flags & FW_MIGRATION) &&
  895. pmd_is_migration_entry(pmd)) {
       /*
        * Non-present PMD: with FW_MIGRATION, a migration entry
        * resolves to its page; fw->page will be NULL.
        */
  896. const softleaf_t entry = softleaf_from_pmd(pmd);
  897. page = softleaf_to_page(entry);
  898. expose_page = false;
  899. goto found;
  900. }
       /* Nothing suitable at the PMD level. */
  901. spin_unlock(ptl);
  902. goto not_found;
  903. }
  904. pte_table:
       /* From here on the PMD is expected to be a present, non-leaf entry. */
  905. VM_WARN_ON_ONCE(!pmd_present(pmd) || pmd_leaf(pmd));
       /*
        * Map and lock the PTE for @addr. A NULL result is treated as "not
        * found"; no unlock is issued on that path (presumably no lock is
        * held when the helper fails - confirm against its definition).
        */
  906. ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
  907. if (!ptep)
  908. goto not_found;
       /* Snapshot the PTE and record the PTE level in @fw. */
  909. pte = ptep_get(ptep);
  910. entry_size = PAGE_SIZE;
  911. fw->level = FW_LEVEL_PTE;
  912. fw->ptep = ptep;
  913. fw->pte = pte;
       /*
        * Present PTE: normal page first, zeropage only with FW_ZEROPAGE.
        * Non-present, non-none PTE: only a migration entry with FW_MIGRATION
        * is accepted. In the zeropage and migration cases fw->page is NULL.
        */
  914. if (pte_present(pte)) {
  915. page = vm_normal_page(vma, addr, pte);
  916. if (page)
  917. goto found;
  918. if ((flags & FW_ZEROPAGE) &&
  919. is_zero_pfn(pte_pfn(pte))) {
  920. page = pfn_to_page(pte_pfn(pte));
  921. expose_page = false;
  922. goto found;
  923. }
  924. } else if (!pte_none(pte)) {
  925. const softleaf_t entry = softleaf_from_pte(pte);
  926. if ((flags & FW_MIGRATION) && softleaf_is_migration(entry)) {
  927. page = softleaf_to_page(entry);
  928. expose_page = false;
  929. goto found;
  930. }
  931. }
       /* Nothing suitable: unmap and unlock the PTE, then fall into not_found. */
  932. pte_unmap_unlock(ptep, ptl);
  933. not_found:
       /* Failure exit: close the walk and report that no folio was found. */
  934. vma_pgtable_walk_end(vma);
  935. return NULL;
       /*
        * Success exit: the lock in ptl is still held and is handed to the
        * caller through fw->ptl; vma_pgtable_walk_end() is not called here.
        * For an exposed page, (addr & (entry_size - 1)) >> PAGE_SHIFT is the
        * index of the page that @addr selects within the entry.
        */
  936. found:
  937. if (expose_page)
  938. /* Note: Offset from the mapped page, not the folio start. */
  939. fw->page = page + ((addr & (entry_size - 1)) >> PAGE_SHIFT);
  940. else
  941. fw->page = NULL;
  942. fw->ptl = ptl;
  943. return page_folio(page);
  944. }