mmu_gather.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/gfp.h>
  3. #include <linux/highmem.h>
  4. #include <linux/kernel.h>
  5. #include <linux/mmdebug.h>
  6. #include <linux/mm_types.h>
  7. #include <linux/mm_inline.h>
  8. #include <linux/pagemap.h>
  9. #include <linux/rcupdate.h>
  10. #include <linux/smp.h>
  11. #include <linux/swap.h>
  12. #include <linux/rmap.h>
  13. #include <linux/pgalloc.h>
  14. #include <linux/hugetlb.h>
  15. #include <asm/tlb.h>
  16. #ifndef CONFIG_MMU_GATHER_NO_GATHER
  17. static bool tlb_next_batch(struct mmu_gather *tlb)
  18. {
  19. struct mmu_gather_batch *batch;
  20. /* Limit batching if we have delayed rmaps pending */
  21. if (tlb->delayed_rmap && tlb->active != &tlb->local)
  22. return false;
  23. batch = tlb->active;
  24. if (batch->next) {
  25. tlb->active = batch->next;
  26. return true;
  27. }
  28. if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
  29. return false;
  30. batch = (void *)__get_free_page(GFP_NOWAIT);
  31. if (!batch)
  32. return false;
  33. tlb->batch_count++;
  34. batch->next = NULL;
  35. batch->nr = 0;
  36. batch->max = MAX_GATHER_BATCH;
  37. tlb->active->next = batch;
  38. tlb->active = batch;
  39. return true;
  40. }
  41. #ifdef CONFIG_SMP
  42. static void tlb_flush_rmap_batch(struct mmu_gather_batch *batch, struct vm_area_struct *vma)
  43. {
  44. struct encoded_page **pages = batch->encoded_pages;
  45. for (int i = 0; i < batch->nr; i++) {
  46. struct encoded_page *enc = pages[i];
  47. if (encoded_page_flags(enc) & ENCODED_PAGE_BIT_DELAY_RMAP) {
  48. struct page *page = encoded_page_ptr(enc);
  49. unsigned int nr_pages = 1;
  50. if (unlikely(encoded_page_flags(enc) &
  51. ENCODED_PAGE_BIT_NR_PAGES_NEXT))
  52. nr_pages = encoded_nr_pages(pages[++i]);
  53. folio_remove_rmap_ptes(page_folio(page), page, nr_pages,
  54. vma);
  55. }
  56. }
  57. }
  58. /**
  59. * tlb_flush_rmaps - do pending rmap removals after we have flushed the TLB
  60. * @tlb: the current mmu_gather
  61. * @vma: The memory area from which the pages are being removed.
  62. *
  63. * Note that because of how tlb_next_batch() above works, we will
  64. * never start multiple new batches with pending delayed rmaps, so
  65. * we only need to walk through the current active batch and the
  66. * original local one.
  67. */
  68. void tlb_flush_rmaps(struct mmu_gather *tlb, struct vm_area_struct *vma)
  69. {
  70. if (!tlb->delayed_rmap)
  71. return;
  72. tlb_flush_rmap_batch(&tlb->local, vma);
  73. if (tlb->active != &tlb->local)
  74. tlb_flush_rmap_batch(tlb->active, vma);
  75. tlb->delayed_rmap = 0;
  76. }
  77. #endif
  78. /*
  79. * We might end up freeing a lot of pages. Reschedule on a regular
  80. * basis to avoid soft lockups in configurations without full
  81. * preemption enabled. The magic number of 512 folios seems to work.
  82. */
  83. #define MAX_NR_FOLIOS_PER_FREE 512
  84. static void __tlb_batch_free_encoded_pages(struct mmu_gather_batch *batch)
  85. {
  86. struct encoded_page **pages = batch->encoded_pages;
  87. unsigned int nr, nr_pages;
  88. while (batch->nr) {
  89. if (!page_poisoning_enabled_static() && !want_init_on_free()) {
  90. nr = min(MAX_NR_FOLIOS_PER_FREE, batch->nr);
  91. /*
  92. * Make sure we cover page + nr_pages, and don't leave
  93. * nr_pages behind when capping the number of entries.
  94. */
  95. if (unlikely(encoded_page_flags(pages[nr - 1]) &
  96. ENCODED_PAGE_BIT_NR_PAGES_NEXT))
  97. nr++;
  98. } else {
  99. /*
  100. * With page poisoning and init_on_free, the time it
  101. * takes to free memory grows proportionally with the
  102. * actual memory size. Therefore, limit based on the
  103. * actual memory size and not the number of involved
  104. * folios.
  105. */
  106. for (nr = 0, nr_pages = 0;
  107. nr < batch->nr && nr_pages < MAX_NR_FOLIOS_PER_FREE;
  108. nr++) {
  109. if (unlikely(encoded_page_flags(pages[nr]) &
  110. ENCODED_PAGE_BIT_NR_PAGES_NEXT))
  111. nr_pages += encoded_nr_pages(pages[++nr]);
  112. else
  113. nr_pages++;
  114. }
  115. }
  116. free_pages_and_swap_cache(pages, nr);
  117. pages += nr;
  118. batch->nr -= nr;
  119. cond_resched();
  120. }
  121. }
  122. static void tlb_batch_pages_flush(struct mmu_gather *tlb)
  123. {
  124. struct mmu_gather_batch *batch;
  125. for (batch = &tlb->local; batch && batch->nr; batch = batch->next)
  126. __tlb_batch_free_encoded_pages(batch);
  127. tlb->active = &tlb->local;
  128. }
  129. static void tlb_batch_list_free(struct mmu_gather *tlb)
  130. {
  131. struct mmu_gather_batch *batch, *next;
  132. for (batch = tlb->local.next; batch; batch = next) {
  133. next = batch->next;
  134. free_pages((unsigned long)batch, 0);
  135. }
  136. tlb->local.next = NULL;
  137. }
  138. static bool __tlb_remove_folio_pages_size(struct mmu_gather *tlb,
  139. struct page *page, unsigned int nr_pages, bool delay_rmap,
  140. int page_size)
  141. {
  142. int flags = delay_rmap ? ENCODED_PAGE_BIT_DELAY_RMAP : 0;
  143. struct mmu_gather_batch *batch;
  144. VM_BUG_ON(!tlb->end);
  145. #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
  146. VM_WARN_ON(tlb->page_size != page_size);
  147. VM_WARN_ON_ONCE(nr_pages != 1 && page_size != PAGE_SIZE);
  148. VM_WARN_ON_ONCE(page_folio(page) != page_folio(page + nr_pages - 1));
  149. #endif
  150. batch = tlb->active;
  151. /*
  152. * Add the page and check if we are full. If so
  153. * force a flush.
  154. */
  155. if (likely(nr_pages == 1)) {
  156. batch->encoded_pages[batch->nr++] = encode_page(page, flags);
  157. } else {
  158. flags |= ENCODED_PAGE_BIT_NR_PAGES_NEXT;
  159. batch->encoded_pages[batch->nr++] = encode_page(page, flags);
  160. batch->encoded_pages[batch->nr++] = encode_nr_pages(nr_pages);
  161. }
  162. /*
  163. * Make sure that we can always add another "page" + "nr_pages",
  164. * requiring two entries instead of only a single one.
  165. */
  166. if (batch->nr >= batch->max - 1) {
  167. if (!tlb_next_batch(tlb))
  168. return true;
  169. batch = tlb->active;
  170. }
  171. VM_BUG_ON_PAGE(batch->nr > batch->max - 1, page);
  172. return false;
  173. }
  174. bool __tlb_remove_folio_pages(struct mmu_gather *tlb, struct page *page,
  175. unsigned int nr_pages, bool delay_rmap)
  176. {
  177. return __tlb_remove_folio_pages_size(tlb, page, nr_pages, delay_rmap,
  178. PAGE_SIZE);
  179. }
  180. bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
  181. {
  182. return __tlb_remove_folio_pages_size(tlb, page, 1, false, page_size);
  183. }
  184. #endif /* MMU_GATHER_NO_GATHER */
  185. #ifdef CONFIG_MMU_GATHER_TABLE_FREE
  186. static void __tlb_remove_table_free(struct mmu_table_batch *batch)
  187. {
  188. int i;
  189. for (i = 0; i < batch->nr; i++)
  190. __tlb_remove_table(batch->tables[i]);
  191. free_page((unsigned long)batch);
  192. }
  193. #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
/*
 * Semi RCU freeing of the page directories.
 *
 * This is needed by some architectures to implement software pagetable walkers.
 *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 *
 * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 * since we unlink the page, flush TLBs, free the page. Since the disabling of
 * IRQs delays the completion of the TLB flush we can never observe an already
 * freed page.
 *
 * Not all systems IPI every CPU for this purpose:
 *
 * - Some architectures have HW support for cross-CPU synchronisation of TLB
 *   flushes, so there's no IPI at all.
 *
 * - Paravirt guests can do this TLB flushing in the hypervisor, or coordinate
 *   with the hypervisor to defer flushing on preempted vCPUs.
 *
 * Such systems need to delay the freeing by some other means; the code below
 * is that means.
 *
 * What we do is batch the freed directory pages (tables) and RCU free them.
 * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 * holds off grace periods.
 *
 * However, in order to batch these pages we need to allocate storage. This
 * allocation is deep inside the MM code and can thus easily fail under memory
 * pressure. To guarantee progress we fall back to single table freeing, see
 * the implementation of tlb_remove_table_one().
 *
 */
  230. static void tlb_remove_table_smp_sync(void *arg)
  231. {
  232. /* Simply deliver the interrupt */
  233. }
  234. void tlb_remove_table_sync_one(void)
  235. {
  236. /*
  237. * This isn't an RCU grace period and hence the page-tables cannot be
  238. * assumed to be actually RCU-freed.
  239. *
  240. * It is however sufficient for software page-table walkers that rely on
  241. * IRQ disabling.
  242. */
  243. smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
  244. }
  245. static void tlb_remove_table_rcu(struct rcu_head *head)
  246. {
  247. __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
  248. }
  249. static void tlb_remove_table_free(struct mmu_table_batch *batch)
  250. {
  251. call_rcu(&batch->rcu, tlb_remove_table_rcu);
  252. }
  253. #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
  254. static void tlb_remove_table_free(struct mmu_table_batch *batch)
  255. {
  256. __tlb_remove_table_free(batch);
  257. }
  258. #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
  259. /*
  260. * If we want tlb_remove_table() to imply TLB invalidates.
  261. */
  262. static inline void tlb_table_invalidate(struct mmu_gather *tlb)
  263. {
  264. if (tlb_needs_table_invalidate()) {
  265. /*
  266. * Invalidate page-table caches used by hardware walkers. Then
  267. * we still need to RCU-sched wait while freeing the pages
  268. * because software walkers can still be in-flight.
  269. */
  270. tlb_flush_mmu_tlbonly(tlb);
  271. }
  272. }
  273. #ifdef CONFIG_PT_RECLAIM
  274. static inline void __tlb_remove_table_one_rcu(struct rcu_head *head)
  275. {
  276. struct ptdesc *ptdesc;
  277. ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
  278. __tlb_remove_table(ptdesc);
  279. }
  280. static inline void __tlb_remove_table_one(void *table)
  281. {
  282. struct ptdesc *ptdesc;
  283. ptdesc = table;
  284. call_rcu(&ptdesc->pt_rcu_head, __tlb_remove_table_one_rcu);
  285. }
  286. #else
  287. static inline void __tlb_remove_table_one(void *table)
  288. {
  289. tlb_remove_table_sync_one();
  290. __tlb_remove_table(table);
  291. }
  292. #endif /* CONFIG_PT_RECLAIM */
  293. static void tlb_remove_table_one(void *table)
  294. {
  295. __tlb_remove_table_one(table);
  296. }
  297. static void tlb_table_flush(struct mmu_gather *tlb)
  298. {
  299. struct mmu_table_batch **batch = &tlb->batch;
  300. if (*batch) {
  301. tlb_table_invalidate(tlb);
  302. tlb_remove_table_free(*batch);
  303. *batch = NULL;
  304. }
  305. }
  306. void tlb_remove_table(struct mmu_gather *tlb, void *table)
  307. {
  308. struct mmu_table_batch **batch = &tlb->batch;
  309. if (*batch == NULL) {
  310. *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT);
  311. if (*batch == NULL) {
  312. tlb_table_invalidate(tlb);
  313. tlb_remove_table_one(table);
  314. return;
  315. }
  316. (*batch)->nr = 0;
  317. }
  318. (*batch)->tables[(*batch)->nr++] = table;
  319. if ((*batch)->nr == MAX_TABLE_BATCH)
  320. tlb_table_flush(tlb);
  321. }
  322. static inline void tlb_table_init(struct mmu_gather *tlb)
  323. {
  324. tlb->batch = NULL;
  325. }
  326. #else /* !CONFIG_MMU_GATHER_TABLE_FREE */
  327. static inline void tlb_table_flush(struct mmu_gather *tlb) { }
  328. static inline void tlb_table_init(struct mmu_gather *tlb) { }
  329. #endif /* CONFIG_MMU_GATHER_TABLE_FREE */
  330. static void tlb_flush_mmu_free(struct mmu_gather *tlb)
  331. {
  332. tlb_table_flush(tlb);
  333. #ifndef CONFIG_MMU_GATHER_NO_GATHER
  334. tlb_batch_pages_flush(tlb);
  335. #endif
  336. }
  337. void tlb_flush_mmu(struct mmu_gather *tlb)
  338. {
  339. tlb_flush_mmu_tlbonly(tlb);
  340. tlb_flush_mmu_free(tlb);
  341. }
  342. static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
  343. bool fullmm)
  344. {
  345. tlb->mm = mm;
  346. tlb->fullmm = fullmm;
  347. #ifndef CONFIG_MMU_GATHER_NO_GATHER
  348. tlb->need_flush_all = 0;
  349. tlb->local.next = NULL;
  350. tlb->local.nr = 0;
  351. tlb->local.max = ARRAY_SIZE(tlb->__pages);
  352. tlb->active = &tlb->local;
  353. tlb->batch_count = 0;
  354. #endif
  355. tlb->delayed_rmap = 0;
  356. tlb_table_init(tlb);
  357. #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
  358. tlb->page_size = 0;
  359. #endif
  360. tlb->vma_pfn = 0;
  361. tlb->fully_unshared_tables = 0;
  362. __tlb_reset_range(tlb);
  363. inc_tlb_flush_pending(tlb->mm);
  364. }
  365. /**
  366. * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
  367. * @tlb: the mmu_gather structure to initialize
  368. * @mm: the mm_struct of the target address space
  369. *
  370. * Called to initialize an (on-stack) mmu_gather structure for page-table
  371. * tear-down from @mm.
  372. */
  373. void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
  374. {
  375. __tlb_gather_mmu(tlb, mm, false);
  376. }
  377. /**
  378. * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
  379. * @tlb: the mmu_gather structure to initialize
  380. * @mm: the mm_struct of the target address space
  381. *
  382. * In this case, @mm is without users and we're going to destroy the
  383. * full address space (exit/execve).
  384. *
  385. * Called to initialize an (on-stack) mmu_gather structure for page-table
  386. * tear-down from @mm.
  387. */
  388. void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
  389. {
  390. __tlb_gather_mmu(tlb, mm, true);
  391. }
  392. /**
  393. * tlb_gather_mmu_vma - initialize an mmu_gather structure for operating on a
  394. * single VMA
  395. * @tlb: the mmu_gather structure to initialize
  396. * @vma: the vm_area_struct
  397. *
  398. * Called to initialize an (on-stack) mmu_gather structure for operating on
  399. * a single VMA. In contrast to tlb_gather_mmu(), calling this function will
  400. * not require another call to tlb_start_vma(). In contrast to tlb_start_vma(),
  401. * this function will *not* call flush_cache_range().
  402. *
  403. * For hugetlb VMAs, this function will also initialize the mmu_gather
  404. * page_size accordingly, not requiring a separate call to
  405. * tlb_change_page_size().
  406. *
  407. */
  408. void tlb_gather_mmu_vma(struct mmu_gather *tlb, struct vm_area_struct *vma)
  409. {
  410. tlb_gather_mmu(tlb, vma->vm_mm);
  411. tlb_update_vma_flags(tlb, vma);
  412. if (is_vm_hugetlb_page(vma))
  413. /* All entries have the same size. */
  414. tlb_change_page_size(tlb, huge_page_size(hstate_vma(vma)));
  415. }
  416. /**
  417. * tlb_finish_mmu - finish an mmu_gather structure
  418. * @tlb: the mmu_gather structure to finish
  419. *
  420. * Called at the end of the shootdown operation to free up any resources that
  421. * were required.
  422. */
  423. void tlb_finish_mmu(struct mmu_gather *tlb)
  424. {
  425. /*
  426. * We expect an earlier huge_pmd_unshare_flush() call to sort this out,
  427. * due to complicated locking requirements with page table unsharing.
  428. */
  429. VM_WARN_ON_ONCE(tlb->fully_unshared_tables);
  430. /*
  431. * If there are parallel threads are doing PTE changes on same range
  432. * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
  433. * flush by batching, one thread may end up seeing inconsistent PTEs
  434. * and result in having stale TLB entries. So flush TLB forcefully
  435. * if we detect parallel PTE batching threads.
  436. *
  437. * However, some syscalls, e.g. munmap(), may free page tables, this
  438. * needs force flush everything in the given range. Otherwise this
  439. * may result in having stale TLB entries for some architectures,
  440. * e.g. aarch64, that could specify flush what level TLB.
  441. */
  442. if (mm_tlb_flush_nested(tlb->mm)) {
  443. /*
  444. * The aarch64 yields better performance with fullmm by
  445. * avoiding multiple CPUs spamming TLBI messages at the
  446. * same time.
  447. *
  448. * On x86 non-fullmm doesn't yield significant difference
  449. * against fullmm.
  450. */
  451. tlb->fullmm = 1;
  452. __tlb_reset_range(tlb);
  453. tlb->freed_tables = 1;
  454. }
  455. tlb_flush_mmu(tlb);
  456. #ifndef CONFIG_MMU_GATHER_NO_GATHER
  457. tlb_batch_list_free(tlb);
  458. #endif
  459. dec_tlb_flush_pending(tlb->mm);
  460. }