hugetlb_vmemmap.c 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * HugeTLB Vmemmap Optimization (HVO)
  4. *
  5. * Copyright (c) 2020, ByteDance. All rights reserved.
  6. *
  7. * Author: Muchun Song <songmuchun@bytedance.com>
  8. *
  9. * See Documentation/mm/vmemmap_dedup.rst
  10. */
  11. #define pr_fmt(fmt) "HugeTLB: " fmt
  12. #include <linux/pgtable.h>
  13. #include <linux/moduleparam.h>
  14. #include <linux/bootmem_info.h>
  15. #include <linux/mmdebug.h>
  16. #include <linux/pagewalk.h>
  17. #include <linux/pgalloc.h>
  18. #include <asm/tlbflush.h>
  19. #include "hugetlb_vmemmap.h"
  20. /**
  21. * struct vmemmap_remap_walk - walk vmemmap page table
  22. *
  23. * @remap_pte: called for each lowest-level entry (PTE).
  24. * @nr_walked: the number of walked pte.
  25. * @reuse_page: the page which is reused for the tail vmemmap pages.
  26. * @reuse_addr: the virtual address of the @reuse_page page.
  27. * @vmemmap_pages: the list head of the vmemmap pages that can be freed
  28. * or is mapped from.
  29. * @flags: used to modify behavior in vmemmap page table walking
  30. * operations.
  31. */
  32. struct vmemmap_remap_walk {
  33. void (*remap_pte)(pte_t *pte, unsigned long addr,
  34. struct vmemmap_remap_walk *walk);
  35. unsigned long nr_walked;
  36. struct page *reuse_page;
  37. unsigned long reuse_addr;
  38. struct list_head *vmemmap_pages;
  39. /* Skip the TLB flush when we split the PMD */
  40. #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0)
  41. /* Skip the TLB flush when we remap the PTE */
  42. #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1)
  43. /* synchronize_rcu() to avoid writes from page_ref_add_unless() */
  44. #define VMEMMAP_SYNCHRONIZE_RCU BIT(2)
  45. unsigned long flags;
  46. };
  47. static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
  48. struct vmemmap_remap_walk *walk)
  49. {
  50. pmd_t __pmd;
  51. int i;
  52. unsigned long addr = start;
  53. pte_t *pgtable;
  54. pgtable = pte_alloc_one_kernel(&init_mm);
  55. if (!pgtable)
  56. return -ENOMEM;
  57. pmd_populate_kernel(&init_mm, &__pmd, pgtable);
  58. for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
  59. pte_t entry, *pte;
  60. pgprot_t pgprot = PAGE_KERNEL;
  61. entry = mk_pte(head + i, pgprot);
  62. pte = pte_offset_kernel(&__pmd, addr);
  63. set_pte_at(&init_mm, addr, pte, entry);
  64. }
  65. spin_lock(&init_mm.page_table_lock);
  66. if (likely(pmd_leaf(*pmd))) {
  67. /*
  68. * Higher order allocations from buddy allocator must be able to
  69. * be treated as independent small pages (as they can be freed
  70. * individually).
  71. */
  72. if (!PageReserved(head))
  73. split_page(head, get_order(PMD_SIZE));
  74. /* Make pte visible before pmd. See comment in pmd_install(). */
  75. smp_wmb();
  76. pmd_populate_kernel(&init_mm, pmd, pgtable);
  77. if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
  78. flush_tlb_kernel_range(start, start + PMD_SIZE);
  79. } else {
  80. pte_free_kernel(&init_mm, pgtable);
  81. }
  82. spin_unlock(&init_mm.page_table_lock);
  83. return 0;
  84. }
  85. static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
  86. unsigned long next, struct mm_walk *walk)
  87. {
  88. int ret = 0;
  89. struct page *head;
  90. struct vmemmap_remap_walk *vmemmap_walk = walk->private;
  91. /* Only splitting, not remapping the vmemmap pages. */
  92. if (!vmemmap_walk->remap_pte)
  93. walk->action = ACTION_CONTINUE;
  94. spin_lock(&init_mm.page_table_lock);
  95. head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
  96. /*
  97. * Due to HugeTLB alignment requirements and the vmemmap
  98. * pages being at the start of the hotplugged memory
  99. * region in memory_hotplug.memmap_on_memory case. Checking
  100. * the vmemmap page associated with the first vmemmap page
  101. * if it is self-hosted is sufficient.
  102. *
  103. * [ hotplugged memory ]
  104. * [ section ][...][ section ]
  105. * [ vmemmap ][ usable memory ]
  106. * ^ | ^ |
  107. * +--+ | |
  108. * +------------------------+
  109. */
  110. if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
  111. struct page *page = head ? head + pte_index(addr) :
  112. pte_page(ptep_get(pte_offset_kernel(pmd, addr)));
  113. if (PageVmemmapSelfHosted(page))
  114. ret = -ENOTSUPP;
  115. }
  116. spin_unlock(&init_mm.page_table_lock);
  117. if (!head || ret)
  118. return ret;
  119. return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
  120. }
  121. static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
  122. unsigned long next, struct mm_walk *walk)
  123. {
  124. struct vmemmap_remap_walk *vmemmap_walk = walk->private;
  125. /*
  126. * The reuse_page is found 'first' in page table walking before
  127. * starting remapping.
  128. */
  129. if (!vmemmap_walk->reuse_page)
  130. vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
  131. else
  132. vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
  133. vmemmap_walk->nr_walked++;
  134. return 0;
  135. }
  136. static const struct mm_walk_ops vmemmap_remap_ops = {
  137. .pmd_entry = vmemmap_pmd_entry,
  138. .pte_entry = vmemmap_pte_entry,
  139. };
  140. static int vmemmap_remap_range(unsigned long start, unsigned long end,
  141. struct vmemmap_remap_walk *walk)
  142. {
  143. int ret;
  144. VM_BUG_ON(!PAGE_ALIGNED(start | end));
  145. mmap_read_lock(&init_mm);
  146. ret = walk_kernel_page_table_range(start, end, &vmemmap_remap_ops,
  147. NULL, walk);
  148. mmap_read_unlock(&init_mm);
  149. if (ret)
  150. return ret;
  151. if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
  152. flush_tlb_kernel_range(start, end);
  153. return 0;
  154. }
  155. /*
  156. * Free a vmemmap page. A vmemmap page can be allocated from the memblock
  157. * allocator or buddy allocator. If the PG_reserved flag is set, it means
  158. * that it allocated from the memblock allocator, just free it via the
  159. * free_bootmem_page(). Otherwise, use __free_page().
  160. */
  161. static inline void free_vmemmap_page(struct page *page)
  162. {
  163. if (PageReserved(page)) {
  164. memmap_boot_pages_add(-1);
  165. free_bootmem_page(page);
  166. } else {
  167. memmap_pages_add(-1);
  168. __free_page(page);
  169. }
  170. }
  171. /* Free a list of the vmemmap pages */
  172. static void free_vmemmap_page_list(struct list_head *list)
  173. {
  174. struct page *page, *next;
  175. list_for_each_entry_safe(page, next, list, lru)
  176. free_vmemmap_page(page);
  177. }
  178. static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
  179. struct vmemmap_remap_walk *walk)
  180. {
  181. /*
  182. * Remap the tail pages as read-only to catch illegal write operation
  183. * to the tail pages.
  184. */
  185. pgprot_t pgprot = PAGE_KERNEL_RO;
  186. struct page *page = pte_page(ptep_get(pte));
  187. pte_t entry;
  188. /* Remapping the head page requires r/w */
  189. if (unlikely(addr == walk->reuse_addr)) {
  190. pgprot = PAGE_KERNEL;
  191. list_del(&walk->reuse_page->lru);
  192. /*
  193. * Makes sure that preceding stores to the page contents from
  194. * vmemmap_remap_free() become visible before the set_pte_at()
  195. * write.
  196. */
  197. smp_wmb();
  198. }
  199. entry = mk_pte(walk->reuse_page, pgprot);
  200. list_add(&page->lru, walk->vmemmap_pages);
  201. set_pte_at(&init_mm, addr, pte, entry);
  202. }
  203. /*
  204. * How many struct page structs need to be reset. When we reuse the head
  205. * struct page, the special metadata (e.g. page->flags or page->mapping)
  206. * cannot copy to the tail struct page structs. The invalid value will be
  207. * checked in the free_tail_page_prepare(). In order to avoid the message
  208. * of "corrupted mapping in tail page". We need to reset at least 4 (one
  209. * head struct page struct and three tail struct page structs) struct page
  210. * structs.
  211. */
  212. #define NR_RESET_STRUCT_PAGE 4
  213. static inline void reset_struct_pages(struct page *start)
  214. {
  215. struct page *from = start + NR_RESET_STRUCT_PAGE;
  216. BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
  217. memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
  218. }
  219. static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
  220. struct vmemmap_remap_walk *walk)
  221. {
  222. pgprot_t pgprot = PAGE_KERNEL;
  223. struct page *page;
  224. void *to;
  225. BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
  226. page = list_first_entry(walk->vmemmap_pages, struct page, lru);
  227. list_del(&page->lru);
  228. to = page_to_virt(page);
  229. copy_page(to, (void *)walk->reuse_addr);
  230. reset_struct_pages(to);
  231. /*
  232. * Makes sure that preceding stores to the page contents become visible
  233. * before the set_pte_at() write.
  234. */
  235. smp_wmb();
  236. set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
  237. }
  238. /**
  239. * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end)
  240. * backing PMDs of the directmap into PTEs
  241. * @start: start address of the vmemmap virtual address range that we want
  242. * to remap.
  243. * @end: end address of the vmemmap virtual address range that we want to
  244. * remap.
  245. * @reuse: reuse address.
  246. *
  247. * Return: %0 on success, negative error code otherwise.
  248. */
  249. static int vmemmap_remap_split(unsigned long start, unsigned long end,
  250. unsigned long reuse)
  251. {
  252. struct vmemmap_remap_walk walk = {
  253. .remap_pte = NULL,
  254. .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH,
  255. };
  256. /* See the comment in the vmemmap_remap_free(). */
  257. BUG_ON(start - reuse != PAGE_SIZE);
  258. return vmemmap_remap_range(reuse, end, &walk);
  259. }
  260. /**
  261. * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
  262. * to the page which @reuse is mapped to, then free vmemmap
  263. * which the range are mapped to.
  264. * @start: start address of the vmemmap virtual address range that we want
  265. * to remap.
  266. * @end: end address of the vmemmap virtual address range that we want to
  267. * remap.
  268. * @reuse: reuse address.
  269. * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers
  270. * responsibility to free pages.
  271. * @flags: modifications to vmemmap_remap_walk flags
  272. *
  273. * Return: %0 on success, negative error code otherwise.
  274. */
  275. static int vmemmap_remap_free(unsigned long start, unsigned long end,
  276. unsigned long reuse,
  277. struct list_head *vmemmap_pages,
  278. unsigned long flags)
  279. {
  280. int ret;
  281. struct vmemmap_remap_walk walk = {
  282. .remap_pte = vmemmap_remap_pte,
  283. .reuse_addr = reuse,
  284. .vmemmap_pages = vmemmap_pages,
  285. .flags = flags,
  286. };
  287. int nid = page_to_nid((struct page *)reuse);
  288. gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
  289. /*
  290. * Allocate a new head vmemmap page to avoid breaking a contiguous
  291. * block of struct page memory when freeing it back to page allocator
  292. * in free_vmemmap_page_list(). This will allow the likely contiguous
  293. * struct page backing memory to be kept contiguous and allowing for
  294. * more allocations of hugepages. Fallback to the currently
  295. * mapped head page in case should it fail to allocate.
  296. */
  297. walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
  298. if (walk.reuse_page) {
  299. copy_page(page_to_virt(walk.reuse_page),
  300. (void *)walk.reuse_addr);
  301. list_add(&walk.reuse_page->lru, vmemmap_pages);
  302. memmap_pages_add(1);
  303. }
  304. /*
  305. * In order to make remapping routine most efficient for the huge pages,
  306. * the routine of vmemmap page table walking has the following rules
  307. * (see more details from the vmemmap_pte_range()):
  308. *
  309. * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
  310. * should be continuous.
  311. * - The @reuse address is part of the range [@reuse, @end) that we are
  312. * walking which is passed to vmemmap_remap_range().
  313. * - The @reuse address is the first in the complete range.
  314. *
  315. * So we need to make sure that @start and @reuse meet the above rules.
  316. */
  317. BUG_ON(start - reuse != PAGE_SIZE);
  318. ret = vmemmap_remap_range(reuse, end, &walk);
  319. if (ret && walk.nr_walked) {
  320. end = reuse + walk.nr_walked * PAGE_SIZE;
  321. /*
  322. * vmemmap_pages contains pages from the previous
  323. * vmemmap_remap_range call which failed. These
  324. * are pages which were removed from the vmemmap.
  325. * They will be restored in the following call.
  326. */
  327. walk = (struct vmemmap_remap_walk) {
  328. .remap_pte = vmemmap_restore_pte,
  329. .reuse_addr = reuse,
  330. .vmemmap_pages = vmemmap_pages,
  331. .flags = 0,
  332. };
  333. vmemmap_remap_range(reuse, end, &walk);
  334. }
  335. return ret;
  336. }
  337. static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
  338. struct list_head *list)
  339. {
  340. gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
  341. unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
  342. int nid = page_to_nid((struct page *)start);
  343. struct page *page, *next;
  344. int i;
  345. for (i = 0; i < nr_pages; i++) {
  346. page = alloc_pages_node(nid, gfp_mask, 0);
  347. if (!page)
  348. goto out;
  349. list_add(&page->lru, list);
  350. }
  351. memmap_pages_add(nr_pages);
  352. return 0;
  353. out:
  354. list_for_each_entry_safe(page, next, list, lru)
  355. __free_page(page);
  356. return -ENOMEM;
  357. }
  358. /**
  359. * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end)
  360. * to the page which is from the @vmemmap_pages
  361. * respectively.
  362. * @start: start address of the vmemmap virtual address range that we want
  363. * to remap.
  364. * @end: end address of the vmemmap virtual address range that we want to
  365. * remap.
  366. * @reuse: reuse address.
  367. * @flags: modifications to vmemmap_remap_walk flags
  368. *
  369. * Return: %0 on success, negative error code otherwise.
  370. */
  371. static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
  372. unsigned long reuse, unsigned long flags)
  373. {
  374. LIST_HEAD(vmemmap_pages);
  375. struct vmemmap_remap_walk walk = {
  376. .remap_pte = vmemmap_restore_pte,
  377. .reuse_addr = reuse,
  378. .vmemmap_pages = &vmemmap_pages,
  379. .flags = flags,
  380. };
  381. /* See the comment in the vmemmap_remap_free(). */
  382. BUG_ON(start - reuse != PAGE_SIZE);
  383. if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
  384. return -ENOMEM;
  385. return vmemmap_remap_range(reuse, end, &walk);
  386. }
  387. DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
  388. EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
  389. static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
  390. static int __init hugetlb_vmemmap_optimize_param(char *buf)
  391. {
  392. return kstrtobool(buf, &vmemmap_optimize_enabled);
  393. }
  394. early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param);
  395. static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
  396. struct folio *folio, unsigned long flags)
  397. {
  398. int ret;
  399. unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
  400. unsigned long vmemmap_reuse;
  401. VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
  402. VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
  403. if (!folio_test_hugetlb_vmemmap_optimized(folio))
  404. return 0;
  405. if (flags & VMEMMAP_SYNCHRONIZE_RCU)
  406. synchronize_rcu();
  407. vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
  408. vmemmap_reuse = vmemmap_start;
  409. vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
  410. /*
  411. * The pages which the vmemmap virtual address range [@vmemmap_start,
  412. * @vmemmap_end) are mapped to are freed to the buddy allocator, and
  413. * the range is mapped to the page which @vmemmap_reuse is mapped to.
  414. * When a HugeTLB page is freed to the buddy allocator, previously
  415. * discarded vmemmap pages must be allocated and remapping.
  416. */
  417. ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
  418. if (!ret) {
  419. folio_clear_hugetlb_vmemmap_optimized(folio);
  420. static_branch_dec(&hugetlb_optimize_vmemmap_key);
  421. }
  422. return ret;
  423. }
  424. /**
  425. * hugetlb_vmemmap_restore_folio - restore previously optimized (by
  426. * hugetlb_vmemmap_optimize_folio()) vmemmap pages which
  427. * will be reallocated and remapped.
  428. * @h: struct hstate.
  429. * @folio: the folio whose vmemmap pages will be restored.
  430. *
  431. * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
  432. * negative error code otherwise.
  433. */
  434. int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
  435. {
  436. return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
  437. }
  438. /**
  439. * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
  440. * @h: hstate.
  441. * @folio_list: list of folios.
  442. * @non_hvo_folios: Output list of folios for which vmemmap exists.
  443. *
  444. * Return: number of folios for which vmemmap was restored, or an error code
  445. * if an error was encountered restoring vmemmap for a folio.
  446. * Folios that have vmemmap are moved to the non_hvo_folios
  447. * list. Processing of entries stops when the first error is
  448. * encountered. The folio that experienced the error and all
  449. * non-processed folios will remain on folio_list.
  450. */
  451. long hugetlb_vmemmap_restore_folios(const struct hstate *h,
  452. struct list_head *folio_list,
  453. struct list_head *non_hvo_folios)
  454. {
  455. struct folio *folio, *t_folio;
  456. long restored = 0;
  457. long ret = 0;
  458. unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
  459. list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
  460. if (folio_test_hugetlb_vmemmap_optimized(folio)) {
  461. ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
  462. /* only need to synchronize_rcu() once for each batch */
  463. flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
  464. if (ret)
  465. break;
  466. restored++;
  467. }
  468. /* Add non-optimized folios to output list */
  469. list_move(&folio->lru, non_hvo_folios);
  470. }
  471. if (restored)
  472. flush_tlb_all();
  473. if (!ret)
  474. ret = restored;
  475. return ret;
  476. }
  477. /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */
  478. static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
  479. {
  480. if (folio_test_hugetlb_vmemmap_optimized(folio))
  481. return false;
  482. if (!READ_ONCE(vmemmap_optimize_enabled))
  483. return false;
  484. if (!hugetlb_vmemmap_optimizable(h))
  485. return false;
  486. return true;
  487. }
  488. static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
  489. struct folio *folio,
  490. struct list_head *vmemmap_pages,
  491. unsigned long flags)
  492. {
  493. int ret = 0;
  494. unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
  495. unsigned long vmemmap_reuse;
  496. VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
  497. VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
  498. if (!vmemmap_should_optimize_folio(h, folio))
  499. return ret;
  500. static_branch_inc(&hugetlb_optimize_vmemmap_key);
  501. if (flags & VMEMMAP_SYNCHRONIZE_RCU)
  502. synchronize_rcu();
  503. /*
  504. * Very Subtle
  505. * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
  506. * immediately after remapping. As a result, subsequent accesses
  507. * and modifications to struct pages associated with the hugetlb
  508. * page could be to the OLD struct pages. Set the vmemmap optimized
  509. * flag here so that it is copied to the new head page. This keeps
  510. * the old and new struct pages in sync.
  511. * If there is an error during optimization, we will immediately FLUSH
  512. * the TLB and clear the flag below.
  513. */
  514. folio_set_hugetlb_vmemmap_optimized(folio);
  515. vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
  516. vmemmap_reuse = vmemmap_start;
  517. vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
  518. /*
  519. * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
  520. * to the page which @vmemmap_reuse is mapped to. Add pages previously
  521. * mapping the range to vmemmap_pages list so that they can be freed by
  522. * the caller.
  523. */
  524. ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
  525. vmemmap_pages, flags);
  526. if (ret) {
  527. static_branch_dec(&hugetlb_optimize_vmemmap_key);
  528. folio_clear_hugetlb_vmemmap_optimized(folio);
  529. }
  530. return ret;
  531. }
  532. /**
  533. * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
  534. * @h: struct hstate.
  535. * @folio: the folio whose vmemmap pages will be optimized.
  536. *
  537. * This function only tries to optimize @folio's vmemmap pages and does not
  538. * guarantee that the optimization will succeed after it returns. The caller
  539. * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
  540. * vmemmap pages have been optimized.
  541. */
  542. void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
  543. {
  544. LIST_HEAD(vmemmap_pages);
  545. __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
  546. free_vmemmap_page_list(&vmemmap_pages);
  547. }
  548. static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
  549. {
  550. unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
  551. unsigned long vmemmap_reuse;
  552. if (!vmemmap_should_optimize_folio(h, folio))
  553. return 0;
  554. vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
  555. vmemmap_reuse = vmemmap_start;
  556. vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
  557. /*
  558. * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
  559. * @vmemmap_end]
  560. */
  561. return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
  562. }
  563. static void __hugetlb_vmemmap_optimize_folios(struct hstate *h,
  564. struct list_head *folio_list,
  565. bool boot)
  566. {
  567. struct folio *folio;
  568. int nr_to_optimize;
  569. LIST_HEAD(vmemmap_pages);
  570. unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
  571. nr_to_optimize = 0;
  572. list_for_each_entry(folio, folio_list, lru) {
  573. int ret;
  574. unsigned long spfn, epfn;
  575. if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) {
  576. /*
  577. * Already optimized by pre-HVO, just map the
  578. * mirrored tail page structs RO.
  579. */
  580. spfn = (unsigned long)&folio->page;
  581. epfn = spfn + pages_per_huge_page(h);
  582. vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio),
  583. HUGETLB_VMEMMAP_RESERVE_SIZE);
  584. register_page_bootmem_memmap(pfn_to_section_nr(spfn),
  585. &folio->page,
  586. HUGETLB_VMEMMAP_RESERVE_SIZE);
  587. static_branch_inc(&hugetlb_optimize_vmemmap_key);
  588. continue;
  589. }
  590. nr_to_optimize++;
  591. ret = hugetlb_vmemmap_split_folio(h, folio);
  592. /*
  593. * Splitting the PMD requires allocating a page, thus let's fail
  594. * early once we encounter the first OOM. No point in retrying
  595. * as it can be dynamically done on remap with the memory
  596. * we get back from the vmemmap deduplication.
  597. */
  598. if (ret == -ENOMEM)
  599. break;
  600. }
  601. if (!nr_to_optimize)
  602. /*
  603. * All pre-HVO folios, nothing left to do. It's ok if
  604. * there is a mix of pre-HVO and not yet HVO-ed folios
  605. * here, as __hugetlb_vmemmap_optimize_folio() will
  606. * skip any folios that already have the optimized flag
  607. * set, see vmemmap_should_optimize_folio().
  608. */
  609. goto out;
  610. flush_tlb_all();
  611. list_for_each_entry(folio, folio_list, lru) {
  612. int ret;
  613. ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
  614. /* only need to synchronize_rcu() once for each batch */
  615. flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
  616. /*
  617. * Pages to be freed may have been accumulated. If we
  618. * encounter an ENOMEM, free what we have and try again.
  619. * This can occur in the case that both splitting fails
  620. * halfway and head page allocation also failed. In this
  621. * case __hugetlb_vmemmap_optimize_folio() would free memory
  622. * allowing more vmemmap remaps to occur.
  623. */
  624. if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
  625. flush_tlb_all();
  626. free_vmemmap_page_list(&vmemmap_pages);
  627. INIT_LIST_HEAD(&vmemmap_pages);
  628. __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
  629. }
  630. }
  631. out:
  632. flush_tlb_all();
  633. free_vmemmap_page_list(&vmemmap_pages);
  634. }
  635. void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
  636. {
  637. __hugetlb_vmemmap_optimize_folios(h, folio_list, false);
  638. }
  639. void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list)
  640. {
  641. __hugetlb_vmemmap_optimize_folios(h, folio_list, true);
  642. }
  643. #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
  644. /* Return true of a bootmem allocated HugeTLB page should be pre-HVO-ed */
  645. static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m)
  646. {
  647. unsigned long section_size, psize, pmd_vmemmap_size;
  648. phys_addr_t paddr;
  649. if (!READ_ONCE(vmemmap_optimize_enabled))
  650. return false;
  651. if (!hugetlb_vmemmap_optimizable(m->hstate))
  652. return false;
  653. psize = huge_page_size(m->hstate);
  654. paddr = virt_to_phys(m);
  655. /*
  656. * Pre-HVO only works if the bootmem huge page
  657. * is aligned to the section size.
  658. */
  659. section_size = (1UL << PA_SECTION_SHIFT);
  660. if (!IS_ALIGNED(paddr, section_size) ||
  661. !IS_ALIGNED(psize, section_size))
  662. return false;
  663. /*
  664. * The pre-HVO code does not deal with splitting PMDS,
  665. * so the bootmem page must be aligned to the number
  666. * of base pages that can be mapped with one vmemmap PMD.
  667. */
  668. pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT;
  669. if (!IS_ALIGNED(paddr, pmd_vmemmap_size) ||
  670. !IS_ALIGNED(psize, pmd_vmemmap_size))
  671. return false;
  672. return true;
  673. }
  674. /*
  675. * Initialize memmap section for a gigantic page, HVO-style.
  676. */
  677. void __init hugetlb_vmemmap_init_early(int nid)
  678. {
  679. unsigned long psize, paddr, section_size;
  680. unsigned long ns, i, pnum, pfn, nr_pages;
  681. unsigned long start, end;
  682. struct huge_bootmem_page *m = NULL;
  683. void *map;
  684. if (!READ_ONCE(vmemmap_optimize_enabled))
  685. return;
  686. section_size = (1UL << PA_SECTION_SHIFT);
  687. list_for_each_entry(m, &huge_boot_pages[nid], list) {
  688. if (!vmemmap_should_optimize_bootmem_page(m))
  689. continue;
  690. nr_pages = pages_per_huge_page(m->hstate);
  691. psize = nr_pages << PAGE_SHIFT;
  692. paddr = virt_to_phys(m);
  693. pfn = PHYS_PFN(paddr);
  694. map = pfn_to_page(pfn);
  695. start = (unsigned long)map;
  696. end = start + nr_pages * sizeof(struct page);
  697. if (vmemmap_populate_hvo(start, end, nid,
  698. HUGETLB_VMEMMAP_RESERVE_SIZE) < 0)
  699. continue;
  700. memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE);
  701. pnum = pfn_to_section_nr(pfn);
  702. ns = psize / section_size;
  703. for (i = 0; i < ns; i++) {
  704. sparse_init_early_section(nid, map, pnum,
  705. SECTION_IS_VMEMMAP_PREINIT);
  706. map += section_map_size();
  707. pnum++;
  708. }
  709. m->flags |= HUGE_BOOTMEM_HVO;
  710. }
  711. }
  712. void __init hugetlb_vmemmap_init_late(int nid)
  713. {
  714. struct huge_bootmem_page *m, *tm;
  715. unsigned long phys, nr_pages, start, end;
  716. unsigned long pfn, nr_mmap;
  717. struct hstate *h;
  718. void *map;
  719. if (!READ_ONCE(vmemmap_optimize_enabled))
  720. return;
  721. list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) {
  722. if (!(m->flags & HUGE_BOOTMEM_HVO))
  723. continue;
  724. phys = virt_to_phys(m);
  725. h = m->hstate;
  726. pfn = PHYS_PFN(phys);
  727. nr_pages = pages_per_huge_page(h);
  728. if (!hugetlb_bootmem_page_zones_valid(nid, m)) {
  729. /*
  730. * Oops, the hugetlb page spans multiple zones.
  731. * Remove it from the list, and undo HVO.
  732. */
  733. list_del(&m->list);
  734. map = pfn_to_page(pfn);
  735. start = (unsigned long)map;
  736. end = start + nr_pages * sizeof(struct page);
  737. vmemmap_undo_hvo(start, end, nid,
  738. HUGETLB_VMEMMAP_RESERVE_SIZE);
  739. nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE;
  740. memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE));
  741. memblock_phys_free(phys, huge_page_size(h));
  742. continue;
  743. } else
  744. m->flags |= HUGE_BOOTMEM_ZONES_VALID;
  745. }
  746. }
  747. #endif
  748. static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
  749. {
  750. .procname = "hugetlb_optimize_vmemmap",
  751. .data = &vmemmap_optimize_enabled,
  752. .maxlen = sizeof(vmemmap_optimize_enabled),
  753. .mode = 0644,
  754. .proc_handler = proc_dobool,
  755. },
  756. };
  757. static int __init hugetlb_vmemmap_init(void)
  758. {
  759. const struct hstate *h;
  760. /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
  761. BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
  762. for_each_hstate(h) {
  763. if (hugetlb_vmemmap_optimizable(h)) {
  764. register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
  765. break;
  766. }
  767. }
  768. return 0;
  769. }
  770. late_initcall(hugetlb_vmemmap_init);