page_ext.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580
  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/mm.h>
  3. #include <linux/mmzone.h>
  4. #include <linux/memblock.h>
  5. #include <linux/page_ext.h>
  6. #include <linux/memory.h>
  7. #include <linux/vmalloc.h>
  8. #include <linux/kmemleak.h>
  9. #include <linux/page_owner.h>
  10. #include <linux/page_idle.h>
  11. #include <linux/page_table_check.h>
  12. #include <linux/rcupdate.h>
  13. #include <linux/pgalloc_tag.h>
  14. #include <linux/iommu-debug-pagealloc.h>
/*
 * struct page extension
 *
 * This is the feature to manage memory for extended data per page.
 *
 * Until now, we had to modify struct page itself to store extra data per page.
 * This requires rebuilding the kernel, which is a really time-consuming process.
 * And, sometimes, a rebuild is impossible due to third-party module dependencies.
 * Lastly, enlarging struct page could cause unwanted changes in system behaviour.
 *
 * This feature is intended to overcome the above-mentioned problems. It
 * allocates memory for extended data per page in a separate place rather than
 * in struct page itself. This memory can be accessed by the accessor
 * functions provided by this code. During the boot process, it checks whether
 * allocation of a huge chunk of memory is needed or not. If not, it avoids
 * allocating memory at all. With this advantage, we can include this feature
 * in the kernel by default, avoid rebuilds, and solve the related problems.
 *
 * To help these things work well, there are two callbacks for clients. One
 * is the need callback, which is mandatory if the user wants to avoid useless
 * memory allocation at boot time. The other is the optional init callback,
 * which is used to do proper initialization after memory is allocated.
 *
 * The need callback is used to decide whether extended memory allocation is
 * needed or not. Sometimes users want to deactivate some features in this
 * boot, and extra memory would be unnecessary. In this case, to avoid
 * allocating a huge chunk of memory, each client represents its need for
 * extra memory through the need callback. If one of the need callbacks
 * returns true, it means that someone needs extra memory, so the
 * page extension core should allocate memory for page extension. If
 * none of the need callbacks returns true, memory isn't needed at all in this
 * boot and the page extension core can skip allocating memory. As a result,
 * no memory is wasted.
 *
 * When a need callback returns true, page_ext checks if there is a request for
 * extra memory through size in struct page_ext_operations. If it is non-zero,
 * extra space is allocated for each page_ext entry and the offset is returned
 * to the user through offset in struct page_ext_operations.
 *
 * The init callback is used to do proper initialization after page extension
 * is completely initialized. In a sparse memory system, extra memory is
 * allocated some time later than the memmap is allocated. In other words, the
 * lifetime of memory for page extension isn't the same as that of the memmap
 * for struct page. Therefore, clients can't store extra data until page
 * extension is initialized, even if pages are allocated and used freely. This
 * could leave the extra per-page data in an inadequate state, so, to prevent
 * it, a client can utilize this callback to initialize its state correctly.
 */
  63. #ifdef CONFIG_SPARSEMEM
  64. #define PAGE_EXT_INVALID (0x1)
  65. #endif
  66. #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
  67. static bool need_page_idle(void)
  68. {
  69. return true;
  70. }
  71. static struct page_ext_operations page_idle_ops __initdata = {
  72. .need = need_page_idle,
  73. .need_shared_flags = true,
  74. };
  75. #endif
  76. static struct page_ext_operations *page_ext_ops[] __initdata = {
  77. #ifdef CONFIG_PAGE_OWNER
  78. &page_owner_ops,
  79. #endif
  80. #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
  81. &page_idle_ops,
  82. #endif
  83. #ifdef CONFIG_MEM_ALLOC_PROFILING
  84. &page_alloc_tagging_ops,
  85. #endif
  86. #ifdef CONFIG_PAGE_TABLE_CHECK
  87. &page_table_check_ops,
  88. #endif
  89. #ifdef CONFIG_IOMMU_DEBUG_PAGEALLOC
  90. &page_iommu_debug_ops,
  91. #endif
  92. };
  93. unsigned long page_ext_size;
  94. static unsigned long total_usage;
  95. #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG
  96. /*
  97. * To ensure correct allocation tagging for pages, page_ext should be available
  98. * before the first page allocation. Otherwise early task stacks will be
  99. * allocated before page_ext initialization and missing tags will be flagged.
  100. */
  101. bool early_page_ext __meminitdata = true;
  102. #else
  103. bool early_page_ext __meminitdata;
  104. #endif
  105. static int __init setup_early_page_ext(char *str)
  106. {
  107. early_page_ext = true;
  108. return 0;
  109. }
  110. early_param("early_page_ext", setup_early_page_ext);
  111. static bool __init invoke_need_callbacks(void)
  112. {
  113. int i;
  114. int entries = ARRAY_SIZE(page_ext_ops);
  115. bool need = false;
  116. for (i = 0; i < entries; i++) {
  117. if (page_ext_ops[i]->need()) {
  118. if (page_ext_ops[i]->need_shared_flags) {
  119. page_ext_size = sizeof(struct page_ext);
  120. break;
  121. }
  122. }
  123. }
  124. for (i = 0; i < entries; i++) {
  125. if (page_ext_ops[i]->need()) {
  126. page_ext_ops[i]->offset = page_ext_size;
  127. page_ext_size += page_ext_ops[i]->size;
  128. need = true;
  129. }
  130. }
  131. return need;
  132. }
  133. static void __init invoke_init_callbacks(void)
  134. {
  135. int i;
  136. int entries = ARRAY_SIZE(page_ext_ops);
  137. for (i = 0; i < entries; i++) {
  138. if (page_ext_ops[i]->init)
  139. page_ext_ops[i]->init();
  140. }
  141. }
  142. static inline struct page_ext *get_entry(void *base, unsigned long index)
  143. {
  144. return base + page_ext_size * index;
  145. }
  146. #ifndef CONFIG_SPARSEMEM
  147. void __init page_ext_init_flatmem_late(void)
  148. {
  149. invoke_init_callbacks();
  150. }
  151. void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
  152. {
  153. pgdat->node_page_ext = NULL;
  154. }
  155. static struct page_ext *lookup_page_ext(const struct page *page)
  156. {
  157. unsigned long pfn = page_to_pfn(page);
  158. unsigned long index;
  159. struct page_ext *base;
  160. WARN_ON_ONCE(!rcu_read_lock_held());
  161. base = NODE_DATA(page_to_nid(page))->node_page_ext;
  162. /*
  163. * The sanity checks the page allocator does upon freeing a
  164. * page can reach here before the page_ext arrays are
  165. * allocated when feeding a range of pages to the allocator
  166. * for the first time during bootup or memory hotplug.
  167. */
  168. if (unlikely(!base))
  169. return NULL;
  170. index = pfn - round_down(node_start_pfn(page_to_nid(page)),
  171. MAX_ORDER_NR_PAGES);
  172. return get_entry(base, index);
  173. }
  174. static int __init alloc_node_page_ext(int nid)
  175. {
  176. struct page_ext *base;
  177. unsigned long table_size;
  178. unsigned long nr_pages;
  179. nr_pages = NODE_DATA(nid)->node_spanned_pages;
  180. if (!nr_pages)
  181. return 0;
  182. /*
  183. * Need extra space if node range is not aligned with
  184. * MAX_ORDER_NR_PAGES. When page allocator's buddy algorithm
  185. * checks buddy's status, range could be out of exact node range.
  186. */
  187. if (!IS_ALIGNED(node_start_pfn(nid), MAX_ORDER_NR_PAGES) ||
  188. !IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
  189. nr_pages += MAX_ORDER_NR_PAGES;
  190. table_size = page_ext_size * nr_pages;
  191. base = memblock_alloc_try_nid(
  192. table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
  193. MEMBLOCK_ALLOC_ACCESSIBLE, nid);
  194. if (!base)
  195. return -ENOMEM;
  196. NODE_DATA(nid)->node_page_ext = base;
  197. total_usage += table_size;
  198. memmap_boot_pages_add(DIV_ROUND_UP(table_size, PAGE_SIZE));
  199. return 0;
  200. }
  201. void __init page_ext_init_flatmem(void)
  202. {
  203. int nid, fail;
  204. if (!invoke_need_callbacks())
  205. return;
  206. for_each_online_node(nid) {
  207. fail = alloc_node_page_ext(nid);
  208. if (fail)
  209. goto fail;
  210. }
  211. pr_info("allocated %ld bytes of page_ext\n", total_usage);
  212. return;
  213. fail:
  214. pr_crit("allocation of page_ext failed.\n");
  215. panic("Out of memory");
  216. }
  217. #else /* CONFIG_SPARSEMEM */
  218. static bool page_ext_invalid(struct page_ext *page_ext)
  219. {
  220. return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
  221. }
  222. static struct page_ext *lookup_page_ext(const struct page *page)
  223. {
  224. unsigned long pfn = page_to_pfn(page);
  225. struct mem_section *section = __pfn_to_section(pfn);
  226. struct page_ext *page_ext = READ_ONCE(section->page_ext);
  227. WARN_ON_ONCE(!rcu_read_lock_held());
  228. /*
  229. * The sanity checks the page allocator does upon freeing a
  230. * page can reach here before the page_ext arrays are
  231. * allocated when feeding a range of pages to the allocator
  232. * for the first time during bootup or memory hotplug.
  233. */
  234. if (page_ext_invalid(page_ext))
  235. return NULL;
  236. return get_entry(page_ext, pfn);
  237. }
  238. static void *__meminit alloc_page_ext(size_t size, int nid)
  239. {
  240. gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
  241. void *addr = NULL;
  242. addr = alloc_pages_exact_nid(nid, size, flags);
  243. if (addr)
  244. kmemleak_alloc(addr, size, 1, flags);
  245. else
  246. addr = vzalloc_node(size, nid);
  247. if (addr)
  248. memmap_pages_add(DIV_ROUND_UP(size, PAGE_SIZE));
  249. return addr;
  250. }
  251. static int __meminit init_section_page_ext(unsigned long pfn, int nid)
  252. {
  253. struct mem_section *section;
  254. struct page_ext *base;
  255. unsigned long table_size;
  256. section = __pfn_to_section(pfn);
  257. if (section->page_ext)
  258. return 0;
  259. table_size = page_ext_size * PAGES_PER_SECTION;
  260. base = alloc_page_ext(table_size, nid);
  261. /*
  262. * The value stored in section->page_ext is (base - pfn)
  263. * and it does not point to the memory block allocated above,
  264. * causing kmemleak false positives.
  265. */
  266. kmemleak_not_leak(base);
  267. if (!base) {
  268. pr_err("page ext allocation failure\n");
  269. return -ENOMEM;
  270. }
  271. /*
  272. * The passed "pfn" may not be aligned to SECTION. For the calculation
  273. * we need to apply a mask.
  274. */
  275. pfn &= PAGE_SECTION_MASK;
  276. section->page_ext = (void *)base - page_ext_size * pfn;
  277. total_usage += table_size;
  278. return 0;
  279. }
  280. static void free_page_ext(void *addr)
  281. {
  282. size_t table_size;
  283. struct page *page;
  284. table_size = page_ext_size * PAGES_PER_SECTION;
  285. memmap_pages_add(-1L * (DIV_ROUND_UP(table_size, PAGE_SIZE)));
  286. if (is_vmalloc_addr(addr)) {
  287. vfree(addr);
  288. } else {
  289. page = virt_to_page(addr);
  290. BUG_ON(PageReserved(page));
  291. kmemleak_free(addr);
  292. free_pages_exact(addr, table_size);
  293. }
  294. }
  295. static void __free_page_ext(unsigned long pfn)
  296. {
  297. struct mem_section *ms;
  298. struct page_ext *base;
  299. ms = __pfn_to_section(pfn);
  300. if (!ms || !ms->page_ext)
  301. return;
  302. base = READ_ONCE(ms->page_ext);
  303. /*
  304. * page_ext here can be valid while doing the roll back
  305. * operation in online_page_ext().
  306. */
  307. if (page_ext_invalid(base))
  308. base = (void *)base - PAGE_EXT_INVALID;
  309. WRITE_ONCE(ms->page_ext, NULL);
  310. base = get_entry(base, pfn);
  311. free_page_ext(base);
  312. }
  313. static void __invalidate_page_ext(unsigned long pfn)
  314. {
  315. struct mem_section *ms;
  316. void *val;
  317. ms = __pfn_to_section(pfn);
  318. if (!ms || !ms->page_ext)
  319. return;
  320. val = (void *)ms->page_ext + PAGE_EXT_INVALID;
  321. WRITE_ONCE(ms->page_ext, val);
  322. }
  323. static int __meminit online_page_ext(unsigned long start_pfn,
  324. unsigned long nr_pages)
  325. {
  326. int nid = pfn_to_nid(start_pfn);
  327. unsigned long start, end, pfn;
  328. int fail = 0;
  329. start = SECTION_ALIGN_DOWN(start_pfn);
  330. end = SECTION_ALIGN_UP(start_pfn + nr_pages);
  331. for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION)
  332. fail = init_section_page_ext(pfn, nid);
  333. if (!fail)
  334. return 0;
  335. /* rollback */
  336. end = pfn - PAGES_PER_SECTION;
  337. for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
  338. __free_page_ext(pfn);
  339. return -ENOMEM;
  340. }
  341. static void __meminit offline_page_ext(unsigned long start_pfn,
  342. unsigned long nr_pages)
  343. {
  344. unsigned long start, end, pfn;
  345. start = SECTION_ALIGN_DOWN(start_pfn);
  346. end = SECTION_ALIGN_UP(start_pfn + nr_pages);
  347. /*
  348. * Freeing of page_ext is done in 3 steps to avoid
  349. * use-after-free of it:
  350. * 1) Traverse all the sections and mark their page_ext
  351. * as invalid.
  352. * 2) Wait for all the existing users of page_ext who
  353. * started before invalidation to finish.
  354. * 3) Free the page_ext.
  355. */
  356. for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
  357. __invalidate_page_ext(pfn);
  358. synchronize_rcu();
  359. for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
  360. __free_page_ext(pfn);
  361. }
  362. static int __meminit page_ext_callback(struct notifier_block *self,
  363. unsigned long action, void *arg)
  364. {
  365. struct memory_notify *mn = arg;
  366. int ret = 0;
  367. switch (action) {
  368. case MEM_GOING_ONLINE:
  369. ret = online_page_ext(mn->start_pfn, mn->nr_pages);
  370. break;
  371. case MEM_OFFLINE:
  372. offline_page_ext(mn->start_pfn,
  373. mn->nr_pages);
  374. break;
  375. case MEM_CANCEL_ONLINE:
  376. offline_page_ext(mn->start_pfn,
  377. mn->nr_pages);
  378. break;
  379. case MEM_GOING_OFFLINE:
  380. break;
  381. case MEM_ONLINE:
  382. case MEM_CANCEL_OFFLINE:
  383. break;
  384. }
  385. return notifier_from_errno(ret);
  386. }
  387. void __init page_ext_init(void)
  388. {
  389. unsigned long pfn;
  390. int nid;
  391. if (!invoke_need_callbacks())
  392. return;
  393. for_each_node_state(nid, N_MEMORY) {
  394. unsigned long start_pfn, end_pfn;
  395. start_pfn = node_start_pfn(nid);
  396. end_pfn = node_end_pfn(nid);
  397. /*
  398. * start_pfn and end_pfn may not be aligned to SECTION and the
  399. * page->flags of out of node pages are not initialized. So we
  400. * scan [start_pfn, the biggest section's pfn < end_pfn) here.
  401. */
  402. for (pfn = start_pfn; pfn < end_pfn;
  403. pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
  404. if (!pfn_valid(pfn))
  405. continue;
  406. /*
  407. * Nodes's pfns can be overlapping.
  408. * We know some arch can have a nodes layout such as
  409. * -------------pfn-------------->
  410. * N0 | N1 | N2 | N0 | N1 | N2|....
  411. */
  412. if (pfn_to_nid(pfn) != nid)
  413. continue;
  414. if (init_section_page_ext(pfn, nid))
  415. goto oom;
  416. cond_resched();
  417. }
  418. }
  419. hotplug_memory_notifier(page_ext_callback, DEFAULT_CALLBACK_PRI);
  420. pr_info("allocated %ld bytes of page_ext\n", total_usage);
  421. invoke_init_callbacks();
  422. return;
  423. oom:
  424. panic("Out of memory");
  425. }
  426. void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
  427. {
  428. }
  429. #endif
  430. /**
  431. * page_ext_lookup() - Lookup a page extension for a PFN.
  432. * @pfn: PFN of the page we're interested in.
  433. *
  434. * Must be called with RCU read lock taken and @pfn must be valid.
  435. *
  436. * Return: NULL if no page_ext exists for this page.
  437. */
  438. struct page_ext *page_ext_lookup(unsigned long pfn)
  439. {
  440. return lookup_page_ext(pfn_to_page(pfn));
  441. }
  442. /**
  443. * page_ext_get() - Get the extended information for a page.
  444. * @page: The page we're interested in.
  445. *
  446. * Ensures that the page_ext will remain valid until page_ext_put()
  447. * is called.
  448. *
  449. * Return: NULL if no page_ext exists for this page.
  450. * Context: Any context. Caller may not sleep until they have called
  451. * page_ext_put().
  452. */
  453. struct page_ext *page_ext_get(const struct page *page)
  454. {
  455. struct page_ext *page_ext;
  456. rcu_read_lock();
  457. page_ext = lookup_page_ext(page);
  458. if (!page_ext) {
  459. rcu_read_unlock();
  460. return NULL;
  461. }
  462. return page_ext;
  463. }
  464. /**
  465. * page_ext_from_phys() - Get the page_ext structure for a physical address.
  466. * @phys: The physical address to query.
  467. *
  468. * This function safely gets the `struct page_ext` associated with a given
  469. * physical address. It performs validation to ensure the address corresponds
  470. * to a valid, online struct page before attempting to access it.
  471. * It returns NULL for MMIO, ZONE_DEVICE, holes and offline memory.
  472. *
  473. * Return: NULL if no page_ext exists for this physical address.
  474. * Context: Any context. Caller may not sleep until they have called
  475. * page_ext_put().
  476. */
  477. struct page_ext *page_ext_from_phys(phys_addr_t phys)
  478. {
  479. struct page *page = pfn_to_online_page(__phys_to_pfn(phys));
  480. if (!page)
  481. return NULL;
  482. return page_ext_get(page);
  483. }
  484. /**
  485. * page_ext_put() - Working with page extended information is done.
  486. * @page_ext: Page extended information received from page_ext_get().
  487. *
  488. * The page extended information of the page may not be valid after this
  489. * function is called.
  490. *
  491. * Return: None.
  492. * Context: Any context with corresponding page_ext_get() is called.
  493. */
  494. void page_ext_put(struct page_ext *page_ext)
  495. {
  496. if (unlikely(!page_ext))
  497. return;
  498. rcu_read_unlock();
  499. }