sparse-vmemmap.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Virtual Memory Map support
  4. *
  5. * (C) 2007 sgi. Christoph Lameter.
  6. *
  7. * Virtual memory maps allow VM primitives pfn_to_page, page_to_pfn,
  8. * virt_to_page, page_address() to be implemented as a base offset
  9. * calculation without memory access.
  10. *
  11. * However, virtual mappings need a page table and TLBs. Many Linux
  12. * architectures already map their physical space using 1-1 mappings
  13. * via TLBs. For those arches the virtual memory map is essentially
  14. * for free if we use the same page size as the 1-1 mappings. In that
  15. * case the overhead consists of a few additional pages that are
  16. * allocated to create a view of memory for vmemmap.
  17. *
  18. * The architecture is expected to provide a vmemmap_populate() function
  19. * to instantiate the mapping.
  20. */
  21. #include <linux/mm.h>
  22. #include <linux/mmzone.h>
  23. #include <linux/memblock.h>
  24. #include <linux/memremap.h>
  25. #include <linux/highmem.h>
  26. #include <linux/slab.h>
  27. #include <linux/spinlock.h>
  28. #include <linux/vmalloc.h>
  29. #include <linux/sched.h>
  30. #include <linux/pgalloc.h>
  31. #include <asm/dma.h>
  32. #include <asm/tlbflush.h>
  33. #include "hugetlb_vmemmap.h"
  34. /*
  35. * Flags for vmemmap_populate_range and friends.
  36. */
  37. /* Get a ref on the head page struct page, for ZONE_DEVICE compound pages */
  38. #define VMEMMAP_POPULATE_PAGEREF 0x0001
  39. #include "internal.h"
  40. /*
  41. * Allocate a block of memory to be used to back the virtual memory map
  42. * or to back the page tables that are used to create the mapping.
  43. * Uses the main allocators if they are available, else bootmem.
  44. */
  45. static void * __ref __earlyonly_bootmem_alloc(int node,
  46. unsigned long size,
  47. unsigned long align,
  48. unsigned long goal)
  49. {
  50. return memmap_alloc(size, align, goal, node, false);
  51. }
  52. void * __meminit vmemmap_alloc_block(unsigned long size, int node)
  53. {
  54. /* If the main allocator is up use that, fallback to bootmem. */
  55. if (slab_is_available()) {
  56. gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
  57. int order = get_order(size);
  58. static bool warned;
  59. struct page *page;
  60. page = alloc_pages_node(node, gfp_mask, order);
  61. if (page)
  62. return page_address(page);
  63. if (!warned) {
  64. warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL,
  65. "vmemmap alloc failure: order:%u", order);
  66. warned = true;
  67. }
  68. return NULL;
  69. } else
  70. return __earlyonly_bootmem_alloc(node, size, size,
  71. __pa(MAX_DMA_ADDRESS));
  72. }
  73. static void * __meminit altmap_alloc_block_buf(unsigned long size,
  74. struct vmem_altmap *altmap);
  75. /* need to make sure size is all the same during early stage */
  76. void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
  77. struct vmem_altmap *altmap)
  78. {
  79. void *ptr;
  80. if (altmap)
  81. return altmap_alloc_block_buf(size, altmap);
  82. ptr = sparse_buffer_alloc(size);
  83. if (!ptr)
  84. ptr = vmemmap_alloc_block(size, node);
  85. return ptr;
  86. }
  87. static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap)
  88. {
  89. return altmap->base_pfn + altmap->reserve + altmap->alloc
  90. + altmap->align;
  91. }
  92. static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
  93. {
  94. unsigned long allocated = altmap->alloc + altmap->align;
  95. if (altmap->free > allocated)
  96. return altmap->free - allocated;
  97. return 0;
  98. }
  99. static void * __meminit altmap_alloc_block_buf(unsigned long size,
  100. struct vmem_altmap *altmap)
  101. {
  102. unsigned long pfn, nr_pfns, nr_align;
  103. if (size & ~PAGE_MASK) {
  104. pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n",
  105. __func__, size);
  106. return NULL;
  107. }
  108. pfn = vmem_altmap_next_pfn(altmap);
  109. nr_pfns = size >> PAGE_SHIFT;
  110. nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG);
  111. nr_align = ALIGN(pfn, nr_align) - pfn;
  112. if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap))
  113. return NULL;
  114. altmap->alloc += nr_pfns;
  115. altmap->align += nr_align;
  116. pfn += nr_align;
  117. pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n",
  118. __func__, pfn, altmap->alloc, altmap->align, nr_pfns);
  119. return __va(__pfn_to_phys(pfn));
  120. }
  121. void __meminit vmemmap_verify(pte_t *pte, int node,
  122. unsigned long start, unsigned long end)
  123. {
  124. unsigned long pfn = pte_pfn(ptep_get(pte));
  125. int actual_node = early_pfn_to_nid(pfn);
  126. if (node_distance(actual_node, node) > LOCAL_DISTANCE)
  127. pr_warn_once("[%lx-%lx] potential offnode page_structs\n",
  128. start, end - 1);
  129. }
  130. pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
  131. struct vmem_altmap *altmap,
  132. unsigned long ptpfn, unsigned long flags)
  133. {
  134. pte_t *pte = pte_offset_kernel(pmd, addr);
  135. if (pte_none(ptep_get(pte))) {
  136. pte_t entry;
  137. void *p;
  138. if (ptpfn == (unsigned long)-1) {
  139. p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
  140. if (!p)
  141. return NULL;
  142. ptpfn = PHYS_PFN(__pa(p));
  143. } else {
  144. /*
  145. * When a PTE/PMD entry is freed from the init_mm
  146. * there's a free_pages() call to this page allocated
  147. * above. Thus this get_page() is paired with the
  148. * put_page_testzero() on the freeing path.
  149. * This can only called by certain ZONE_DEVICE path,
  150. * and through vmemmap_populate_compound_pages() when
  151. * slab is available.
  152. */
  153. if (flags & VMEMMAP_POPULATE_PAGEREF)
  154. get_page(pfn_to_page(ptpfn));
  155. }
  156. entry = pfn_pte(ptpfn, PAGE_KERNEL);
  157. set_pte_at(&init_mm, addr, pte, entry);
  158. }
  159. return pte;
  160. }
  161. static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node)
  162. {
  163. void *p = vmemmap_alloc_block(size, node);
  164. if (!p)
  165. return NULL;
  166. memset(p, 0, size);
  167. return p;
  168. }
  169. pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node)
  170. {
  171. pmd_t *pmd = pmd_offset(pud, addr);
  172. if (pmd_none(*pmd)) {
  173. void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
  174. if (!p)
  175. return NULL;
  176. kernel_pte_init(p);
  177. pmd_populate_kernel(&init_mm, pmd, p);
  178. }
  179. return pmd;
  180. }
  181. pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node)
  182. {
  183. pud_t *pud = pud_offset(p4d, addr);
  184. if (pud_none(*pud)) {
  185. void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
  186. if (!p)
  187. return NULL;
  188. pmd_init(p);
  189. pud_populate(&init_mm, pud, p);
  190. }
  191. return pud;
  192. }
  193. p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node)
  194. {
  195. p4d_t *p4d = p4d_offset(pgd, addr);
  196. if (p4d_none(*p4d)) {
  197. void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
  198. if (!p)
  199. return NULL;
  200. pud_init(p);
  201. p4d_populate_kernel(addr, p4d, p);
  202. }
  203. return p4d;
  204. }
  205. pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
  206. {
  207. pgd_t *pgd = pgd_offset_k(addr);
  208. if (pgd_none(*pgd)) {
  209. void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
  210. if (!p)
  211. return NULL;
  212. pgd_populate_kernel(addr, pgd, p);
  213. }
  214. return pgd;
  215. }
  216. static pte_t * __meminit vmemmap_populate_address(unsigned long addr, int node,
  217. struct vmem_altmap *altmap,
  218. unsigned long ptpfn,
  219. unsigned long flags)
  220. {
  221. pgd_t *pgd;
  222. p4d_t *p4d;
  223. pud_t *pud;
  224. pmd_t *pmd;
  225. pte_t *pte;
  226. pgd = vmemmap_pgd_populate(addr, node);
  227. if (!pgd)
  228. return NULL;
  229. p4d = vmemmap_p4d_populate(pgd, addr, node);
  230. if (!p4d)
  231. return NULL;
  232. pud = vmemmap_pud_populate(p4d, addr, node);
  233. if (!pud)
  234. return NULL;
  235. pmd = vmemmap_pmd_populate(pud, addr, node);
  236. if (!pmd)
  237. return NULL;
  238. pte = vmemmap_pte_populate(pmd, addr, node, altmap, ptpfn, flags);
  239. if (!pte)
  240. return NULL;
  241. vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
  242. return pte;
  243. }
  244. static int __meminit vmemmap_populate_range(unsigned long start,
  245. unsigned long end, int node,
  246. struct vmem_altmap *altmap,
  247. unsigned long ptpfn,
  248. unsigned long flags)
  249. {
  250. unsigned long addr = start;
  251. pte_t *pte;
  252. for (; addr < end; addr += PAGE_SIZE) {
  253. pte = vmemmap_populate_address(addr, node, altmap,
  254. ptpfn, flags);
  255. if (!pte)
  256. return -ENOMEM;
  257. }
  258. return 0;
  259. }
  260. int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
  261. int node, struct vmem_altmap *altmap)
  262. {
  263. return vmemmap_populate_range(start, end, node, altmap, -1, 0);
  264. }
  265. /*
  266. * Undo populate_hvo, and replace it with a normal base page mapping.
  267. * Used in memory init in case a HVO mapping needs to be undone.
  268. *
  269. * This can happen when it is discovered that a memblock allocated
  270. * hugetlb page spans multiple zones, which can only be verified
  271. * after zones have been initialized.
  272. *
  273. * We know that:
  274. * 1) The first @headsize / PAGE_SIZE vmemmap pages were individually
  275. * allocated through memblock, and mapped.
  276. *
  277. * 2) The rest of the vmemmap pages are mirrors of the last head page.
  278. */
  279. int __meminit vmemmap_undo_hvo(unsigned long addr, unsigned long end,
  280. int node, unsigned long headsize)
  281. {
  282. unsigned long maddr, pfn;
  283. pte_t *pte;
  284. int headpages;
  285. /*
  286. * Should only be called early in boot, so nothing will
  287. * be accessing these page structures.
  288. */
  289. WARN_ON(!early_boot_irqs_disabled);
  290. headpages = headsize >> PAGE_SHIFT;
  291. /*
  292. * Clear mirrored mappings for tail page structs.
  293. */
  294. for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
  295. pte = virt_to_kpte(maddr);
  296. pte_clear(&init_mm, maddr, pte);
  297. }
  298. /*
  299. * Clear and free mappings for head page and first tail page
  300. * structs.
  301. */
  302. for (maddr = addr; headpages-- > 0; maddr += PAGE_SIZE) {
  303. pte = virt_to_kpte(maddr);
  304. pfn = pte_pfn(ptep_get(pte));
  305. pte_clear(&init_mm, maddr, pte);
  306. memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE);
  307. }
  308. flush_tlb_kernel_range(addr, end);
  309. return vmemmap_populate(addr, end, node, NULL);
  310. }
  311. /*
  312. * Write protect the mirrored tail page structs for HVO. This will be
  313. * called from the hugetlb code when gathering and initializing the
  314. * memblock allocated gigantic pages. The write protect can't be
  315. * done earlier, since it can't be guaranteed that the reserved
  316. * page structures will not be written to during initialization,
  317. * even if CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled.
  318. *
  319. * The PTEs are known to exist, and nothing else should be touching
  320. * these pages. The caller is responsible for any TLB flushing.
  321. */
  322. void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
  323. int node, unsigned long headsize)
  324. {
  325. unsigned long maddr;
  326. pte_t *pte;
  327. for (maddr = addr + headsize; maddr < end; maddr += PAGE_SIZE) {
  328. pte = virt_to_kpte(maddr);
  329. ptep_set_wrprotect(&init_mm, maddr, pte);
  330. }
  331. }
  332. /*
  333. * Populate vmemmap pages HVO-style. The first page contains the head
  334. * page and needed tail pages, the other ones are mirrors of the first
  335. * page.
  336. */
  337. int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
  338. int node, unsigned long headsize)
  339. {
  340. pte_t *pte;
  341. unsigned long maddr;
  342. for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
  343. pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
  344. if (!pte)
  345. return -ENOMEM;
  346. }
  347. /*
  348. * Reuse the last page struct page mapped above for the rest.
  349. */
  350. return vmemmap_populate_range(maddr, end, node, NULL,
  351. pte_pfn(ptep_get(pte)), 0);
  352. }
  353. void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
  354. unsigned long addr, unsigned long next)
  355. {
  356. }
  357. int __weak __meminit vmemmap_check_pmd(pmd_t *pmd, int node,
  358. unsigned long addr, unsigned long next)
  359. {
  360. return 0;
  361. }
  362. int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long end,
  363. int node, struct vmem_altmap *altmap)
  364. {
  365. unsigned long addr;
  366. unsigned long next;
  367. pgd_t *pgd;
  368. p4d_t *p4d;
  369. pud_t *pud;
  370. pmd_t *pmd;
  371. for (addr = start; addr < end; addr = next) {
  372. next = pmd_addr_end(addr, end);
  373. pgd = vmemmap_pgd_populate(addr, node);
  374. if (!pgd)
  375. return -ENOMEM;
  376. p4d = vmemmap_p4d_populate(pgd, addr, node);
  377. if (!p4d)
  378. return -ENOMEM;
  379. pud = vmemmap_pud_populate(p4d, addr, node);
  380. if (!pud)
  381. return -ENOMEM;
  382. pmd = pmd_offset(pud, addr);
  383. if (pmd_none(pmdp_get(pmd))) {
  384. void *p;
  385. p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
  386. if (p) {
  387. vmemmap_set_pmd(pmd, p, node, addr, next);
  388. continue;
  389. } else if (altmap) {
  390. /*
  391. * No fallback: In any case we care about, the
  392. * altmap should be reasonably sized and aligned
  393. * such that vmemmap_alloc_block_buf() will always
  394. * succeed. For consistency with the PTE case,
  395. * return an error here as failure could indicate
  396. * a configuration issue with the size of the altmap.
  397. */
  398. return -ENOMEM;
  399. }
  400. } else if (vmemmap_check_pmd(pmd, node, addr, next))
  401. continue;
  402. if (vmemmap_populate_basepages(addr, next, node, altmap))
  403. return -ENOMEM;
  404. }
  405. return 0;
  406. }
  407. #ifndef vmemmap_populate_compound_pages
  408. /*
  409. * For compound pages bigger than section size (e.g. x86 1G compound
  410. * pages with 2M subsection size) fill the rest of sections as tail
  411. * pages.
  412. *
  413. * Note that memremap_pages() resets @nr_range value and will increment
  414. * it after each range successful onlining. Thus the value or @nr_range
  415. * at section memmap populate corresponds to the in-progress range
  416. * being onlined here.
  417. */
  418. static bool __meminit reuse_compound_section(unsigned long start_pfn,
  419. struct dev_pagemap *pgmap)
  420. {
  421. unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
  422. unsigned long offset = start_pfn -
  423. PHYS_PFN(pgmap->ranges[pgmap->nr_range].start);
  424. return !IS_ALIGNED(offset, nr_pages) && nr_pages > PAGES_PER_SUBSECTION;
  425. }
  426. static pte_t * __meminit compound_section_tail_page(unsigned long addr)
  427. {
  428. pte_t *pte;
  429. addr -= PAGE_SIZE;
  430. /*
  431. * Assuming sections are populated sequentially, the previous section's
  432. * page data can be reused.
  433. */
  434. pte = pte_offset_kernel(pmd_off_k(addr), addr);
  435. if (!pte)
  436. return NULL;
  437. return pte;
  438. }
  439. static int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
  440. unsigned long start,
  441. unsigned long end, int node,
  442. struct dev_pagemap *pgmap)
  443. {
  444. unsigned long size, addr;
  445. pte_t *pte;
  446. int rc;
  447. if (reuse_compound_section(start_pfn, pgmap)) {
  448. pte = compound_section_tail_page(start);
  449. if (!pte)
  450. return -ENOMEM;
  451. /*
  452. * Reuse the page that was populated in the prior iteration
  453. * with just tail struct pages.
  454. */
  455. return vmemmap_populate_range(start, end, node, NULL,
  456. pte_pfn(ptep_get(pte)),
  457. VMEMMAP_POPULATE_PAGEREF);
  458. }
  459. size = min(end - start, pgmap_vmemmap_nr(pgmap) * sizeof(struct page));
  460. for (addr = start; addr < end; addr += size) {
  461. unsigned long next, last = addr + size;
  462. /* Populate the head page vmemmap page */
  463. pte = vmemmap_populate_address(addr, node, NULL, -1, 0);
  464. if (!pte)
  465. return -ENOMEM;
  466. /* Populate the tail pages vmemmap page */
  467. next = addr + PAGE_SIZE;
  468. pte = vmemmap_populate_address(next, node, NULL, -1, 0);
  469. if (!pte)
  470. return -ENOMEM;
  471. /*
  472. * Reuse the previous page for the rest of tail pages
  473. * See layout diagram in Documentation/mm/vmemmap_dedup.rst
  474. */
  475. next += PAGE_SIZE;
  476. rc = vmemmap_populate_range(next, last, node, NULL,
  477. pte_pfn(ptep_get(pte)),
  478. VMEMMAP_POPULATE_PAGEREF);
  479. if (rc)
  480. return -ENOMEM;
  481. }
  482. return 0;
  483. }
  484. #endif
  485. struct page * __meminit __populate_section_memmap(unsigned long pfn,
  486. unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
  487. struct dev_pagemap *pgmap)
  488. {
  489. unsigned long start = (unsigned long) pfn_to_page(pfn);
  490. unsigned long end = start + nr_pages * sizeof(struct page);
  491. int r;
  492. if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
  493. !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
  494. return NULL;
  495. if (vmemmap_can_optimize(altmap, pgmap))
  496. r = vmemmap_populate_compound_pages(pfn, start, end, nid, pgmap);
  497. else
  498. r = vmemmap_populate(start, end, nid, altmap);
  499. if (r < 0)
  500. return NULL;
  501. return pfn_to_page(pfn);
  502. }
  503. #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
  504. /*
  505. * This is called just before initializing sections for a NUMA node.
  506. * Any special initialization that needs to be done before the
  507. * generic initialization can be done from here. Sections that
  508. * are initialized in hooks called from here will be skipped by
  509. * the generic initialization.
  510. */
  511. void __init sparse_vmemmap_init_nid_early(int nid)
  512. {
  513. hugetlb_vmemmap_init_early(nid);
  514. }
  515. /*
  516. * This is called just before the initialization of page structures
  517. * through memmap_init. Zones are now initialized, so any work that
  518. * needs to be done that needs zone information can be done from
  519. * here.
  520. */
  521. void __init sparse_vmemmap_init_nid_late(int nid)
  522. {
  523. hugetlb_vmemmap_init_late(nid);
  524. }
  525. #endif