sparse.c 27 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * sparse memory mappings.
  4. */
  5. #include <linux/mm.h>
  6. #include <linux/slab.h>
  7. #include <linux/mmzone.h>
  8. #include <linux/memblock.h>
  9. #include <linux/compiler.h>
  10. #include <linux/highmem.h>
  11. #include <linux/export.h>
  12. #include <linux/spinlock.h>
  13. #include <linux/vmalloc.h>
  14. #include <linux/swap.h>
  15. #include <linux/swapops.h>
  16. #include <linux/bootmem_info.h>
  17. #include <linux/vmstat.h>
  18. #include "internal.h"
  19. #include <asm/dma.h>
  20. /*
  21. * Permanent SPARSEMEM data:
  22. *
  23. * 1) mem_section - memory sections, mem_map's for valid memory
  24. */
  25. #ifdef CONFIG_SPARSEMEM_EXTREME
  26. struct mem_section **mem_section;
  27. #else
  28. struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]
  29. ____cacheline_internodealigned_in_smp;
  30. #endif
  31. EXPORT_SYMBOL(mem_section);
  32. #ifdef NODE_NOT_IN_PAGE_FLAGS
  33. /*
  34. * If we did not store the node number in the page then we have to
  35. * do a lookup in the section_to_node_table in order to find which
  36. * node the page belongs to.
  37. */
  38. #if MAX_NUMNODES <= 256
  39. static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
  40. #else
  41. static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned;
  42. #endif
  43. int memdesc_nid(memdesc_flags_t mdf)
  44. {
  45. return section_to_node_table[memdesc_section(mdf)];
  46. }
  47. EXPORT_SYMBOL(memdesc_nid);
  48. static void set_section_nid(unsigned long section_nr, int nid)
  49. {
  50. section_to_node_table[section_nr] = nid;
  51. }
  52. #else /* !NODE_NOT_IN_PAGE_FLAGS */
  53. static inline void set_section_nid(unsigned long section_nr, int nid)
  54. {
  55. }
  56. #endif
  57. #ifdef CONFIG_SPARSEMEM_EXTREME
  58. static noinline struct mem_section __ref *sparse_index_alloc(int nid)
  59. {
  60. struct mem_section *section = NULL;
  61. unsigned long array_size = SECTIONS_PER_ROOT *
  62. sizeof(struct mem_section);
  63. if (slab_is_available()) {
  64. section = kzalloc_node(array_size, GFP_KERNEL, nid);
  65. } else {
  66. section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
  67. nid);
  68. if (!section)
  69. panic("%s: Failed to allocate %lu bytes nid=%d\n",
  70. __func__, array_size, nid);
  71. }
  72. return section;
  73. }
  74. static int __meminit sparse_index_init(unsigned long section_nr, int nid)
  75. {
  76. unsigned long root = SECTION_NR_TO_ROOT(section_nr);
  77. struct mem_section *section;
  78. /*
  79. * An existing section is possible in the sub-section hotplug
  80. * case. First hot-add instantiates, follow-on hot-add reuses
  81. * the existing section.
  82. *
  83. * The mem_hotplug_lock resolves the apparent race below.
  84. */
  85. if (mem_section[root])
  86. return 0;
  87. section = sparse_index_alloc(nid);
  88. if (!section)
  89. return -ENOMEM;
  90. mem_section[root] = section;
  91. return 0;
  92. }
  93. #else /* !SPARSEMEM_EXTREME */
  94. static inline int sparse_index_init(unsigned long section_nr, int nid)
  95. {
  96. return 0;
  97. }
  98. #endif
  99. /*
  100. * During early boot, before section_mem_map is used for an actual
  101. * mem_map, we use section_mem_map to store the section's NUMA
  102. * node. This keeps us from having to use another data structure. The
  103. * node information is cleared just before we store the real mem_map.
  104. */
  105. static inline unsigned long sparse_encode_early_nid(int nid)
  106. {
  107. return ((unsigned long)nid << SECTION_NID_SHIFT);
  108. }
  109. static inline int sparse_early_nid(struct mem_section *section)
  110. {
  111. return (section->section_mem_map >> SECTION_NID_SHIFT);
  112. }
  113. /* Validate the physical addressing limitations of the model */
  114. static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
  115. unsigned long *end_pfn)
  116. {
  117. unsigned long max_sparsemem_pfn = (DIRECT_MAP_PHYSMEM_END + 1) >> PAGE_SHIFT;
  118. /*
  119. * Sanity checks - do not allow an architecture to pass
  120. * in larger pfns than the maximum scope of sparsemem:
  121. */
  122. if (*start_pfn > max_sparsemem_pfn) {
  123. mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
  124. "Start of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
  125. *start_pfn, *end_pfn, max_sparsemem_pfn);
  126. WARN_ON_ONCE(1);
  127. *start_pfn = max_sparsemem_pfn;
  128. *end_pfn = max_sparsemem_pfn;
  129. } else if (*end_pfn > max_sparsemem_pfn) {
  130. mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
  131. "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
  132. *start_pfn, *end_pfn, max_sparsemem_pfn);
  133. WARN_ON_ONCE(1);
  134. *end_pfn = max_sparsemem_pfn;
  135. }
  136. }
  137. /*
  138. * There are a number of times that we loop over NR_MEM_SECTIONS,
  139. * looking for section_present() on each. But, when we have very
  140. * large physical address spaces, NR_MEM_SECTIONS can also be
  141. * very large which makes the loops quite long.
  142. *
  143. * Keeping track of this gives us an easy way to break out of
  144. * those loops early.
  145. */
  146. unsigned long __highest_present_section_nr;
  147. static void __section_mark_present(struct mem_section *ms,
  148. unsigned long section_nr)
  149. {
  150. if (section_nr > __highest_present_section_nr)
  151. __highest_present_section_nr = section_nr;
  152. ms->section_mem_map |= SECTION_MARKED_PRESENT;
  153. }
  154. static inline unsigned long first_present_section_nr(void)
  155. {
  156. return next_present_section_nr(-1);
  157. }
  158. #ifdef CONFIG_SPARSEMEM_VMEMMAP
  159. static void subsection_mask_set(unsigned long *map, unsigned long pfn,
  160. unsigned long nr_pages)
  161. {
  162. int idx = subsection_map_index(pfn);
  163. int end = subsection_map_index(pfn + nr_pages - 1);
  164. bitmap_set(map, idx, end - idx + 1);
  165. }
  166. void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
  167. {
  168. int end_sec_nr = pfn_to_section_nr(pfn + nr_pages - 1);
  169. unsigned long nr, start_sec_nr = pfn_to_section_nr(pfn);
  170. for (nr = start_sec_nr; nr <= end_sec_nr; nr++) {
  171. struct mem_section *ms;
  172. unsigned long pfns;
  173. pfns = min(nr_pages, PAGES_PER_SECTION
  174. - (pfn & ~PAGE_SECTION_MASK));
  175. ms = __nr_to_section(nr);
  176. subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
  177. pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
  178. pfns, subsection_map_index(pfn),
  179. subsection_map_index(pfn + pfns - 1));
  180. pfn += pfns;
  181. nr_pages -= pfns;
  182. }
  183. }
  184. #else
  185. void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
  186. {
  187. }
  188. #endif
  189. /* Record a memory area against a node. */
  190. static void __init memory_present(int nid, unsigned long start, unsigned long end)
  191. {
  192. unsigned long pfn;
  193. start &= PAGE_SECTION_MASK;
  194. mminit_validate_memmodel_limits(&start, &end);
  195. for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) {
  196. unsigned long section_nr = pfn_to_section_nr(pfn);
  197. struct mem_section *ms;
  198. sparse_index_init(section_nr, nid);
  199. set_section_nid(section_nr, nid);
  200. ms = __nr_to_section(section_nr);
  201. if (!ms->section_mem_map) {
  202. ms->section_mem_map = sparse_encode_early_nid(nid) |
  203. SECTION_IS_ONLINE;
  204. __section_mark_present(ms, section_nr);
  205. }
  206. }
  207. }
  208. /*
  209. * Mark all memblocks as present using memory_present().
  210. * This is a convenience function that is useful to mark all of the systems
  211. * memory as present during initialization.
  212. */
  213. static void __init memblocks_present(void)
  214. {
  215. unsigned long start, end;
  216. int i, nid;
  217. #ifdef CONFIG_SPARSEMEM_EXTREME
  218. if (unlikely(!mem_section)) {
  219. unsigned long size, align;
  220. size = sizeof(struct mem_section *) * NR_SECTION_ROOTS;
  221. align = 1 << (INTERNODE_CACHE_SHIFT);
  222. mem_section = memblock_alloc_or_panic(size, align);
  223. }
  224. #endif
  225. for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid)
  226. memory_present(nid, start, end);
  227. }
  228. /*
  229. * Subtle, we encode the real pfn into the mem_map such that
  230. * the identity pfn - section_mem_map will return the actual
  231. * physical page frame number.
  232. */
  233. static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum)
  234. {
  235. unsigned long coded_mem_map =
  236. (unsigned long)(mem_map - (section_nr_to_pfn(pnum)));
  237. BUILD_BUG_ON(SECTION_MAP_LAST_BIT > PFN_SECTION_SHIFT);
  238. BUG_ON(coded_mem_map & ~SECTION_MAP_MASK);
  239. return coded_mem_map;
  240. }
  241. #ifdef CONFIG_MEMORY_HOTPLUG
  242. /*
  243. * Decode mem_map from the coded memmap
  244. */
  245. struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
  246. {
  247. /* mask off the extra low bits of information */
  248. coded_mem_map &= SECTION_MAP_MASK;
  249. return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
  250. }
  251. #endif /* CONFIG_MEMORY_HOTPLUG */
  252. static void __meminit sparse_init_one_section(struct mem_section *ms,
  253. unsigned long pnum, struct page *mem_map,
  254. struct mem_section_usage *usage, unsigned long flags)
  255. {
  256. ms->section_mem_map &= ~SECTION_MAP_MASK;
  257. ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
  258. | SECTION_HAS_MEM_MAP | flags;
  259. ms->usage = usage;
  260. }
  261. static unsigned long usemap_size(void)
  262. {
  263. return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
  264. }
  265. size_t mem_section_usage_size(void)
  266. {
  267. return sizeof(struct mem_section_usage) + usemap_size();
  268. }
  269. #ifdef CONFIG_MEMORY_HOTREMOVE
  270. static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat)
  271. {
  272. #ifndef CONFIG_NUMA
  273. VM_BUG_ON(pgdat != &contig_page_data);
  274. return __pa_symbol(&contig_page_data);
  275. #else
  276. return __pa(pgdat);
  277. #endif
  278. }
  279. static struct mem_section_usage * __init
  280. sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
  281. unsigned long size)
  282. {
  283. struct mem_section_usage *usage;
  284. unsigned long goal, limit;
  285. int nid;
  286. /*
  287. * A page may contain usemaps for other sections preventing the
  288. * page being freed and making a section unremovable while
  289. * other sections referencing the usemap remain active. Similarly,
  290. * a pgdat can prevent a section being removed. If section A
  291. * contains a pgdat and section B contains the usemap, both
  292. * sections become inter-dependent. This allocates usemaps
  293. * from the same section as the pgdat where possible to avoid
  294. * this problem.
  295. */
  296. goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT);
  297. limit = goal + (1UL << PA_SECTION_SHIFT);
  298. nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
  299. again:
  300. usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
  301. if (!usage && limit) {
  302. limit = MEMBLOCK_ALLOC_ACCESSIBLE;
  303. goto again;
  304. }
  305. return usage;
  306. }
  307. static void __init check_usemap_section_nr(int nid,
  308. struct mem_section_usage *usage)
  309. {
  310. unsigned long usemap_snr, pgdat_snr;
  311. static unsigned long old_usemap_snr;
  312. static unsigned long old_pgdat_snr;
  313. struct pglist_data *pgdat = NODE_DATA(nid);
  314. int usemap_nid;
  315. /* First call */
  316. if (!old_usemap_snr) {
  317. old_usemap_snr = NR_MEM_SECTIONS;
  318. old_pgdat_snr = NR_MEM_SECTIONS;
  319. }
  320. usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
  321. pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT);
  322. if (usemap_snr == pgdat_snr)
  323. return;
  324. if (old_usemap_snr == usemap_snr && old_pgdat_snr == pgdat_snr)
  325. /* skip redundant message */
  326. return;
  327. old_usemap_snr = usemap_snr;
  328. old_pgdat_snr = pgdat_snr;
  329. usemap_nid = sparse_early_nid(__nr_to_section(usemap_snr));
  330. if (usemap_nid != nid) {
  331. pr_info("node %d must be removed before remove section %ld\n",
  332. nid, usemap_snr);
  333. return;
  334. }
  335. /*
  336. * There is a circular dependency.
  337. * Some platforms allow un-removable section because they will just
  338. * gather other removable sections for dynamic partitioning.
  339. * Just notify un-removable section's number here.
  340. */
  341. pr_info("Section %ld and %ld (node %d) have a circular dependency on usemap and pgdat allocations\n",
  342. usemap_snr, pgdat_snr, nid);
  343. }
  344. #else
  345. static struct mem_section_usage * __init
  346. sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
  347. unsigned long size)
  348. {
  349. return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
  350. }
  351. static void __init check_usemap_section_nr(int nid,
  352. struct mem_section_usage *usage)
  353. {
  354. }
  355. #endif /* CONFIG_MEMORY_HOTREMOVE */
  356. #ifdef CONFIG_SPARSEMEM_VMEMMAP
  357. unsigned long __init section_map_size(void)
  358. {
  359. return ALIGN(sizeof(struct page) * PAGES_PER_SECTION, PMD_SIZE);
  360. }
  361. #else
  362. unsigned long __init section_map_size(void)
  363. {
  364. return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
  365. }
  366. struct page __init *__populate_section_memmap(unsigned long pfn,
  367. unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
  368. struct dev_pagemap *pgmap)
  369. {
  370. unsigned long size = section_map_size();
  371. struct page *map = sparse_buffer_alloc(size);
  372. phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
  373. if (map)
  374. return map;
  375. map = memmap_alloc(size, size, addr, nid, false);
  376. if (!map)
  377. panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
  378. __func__, size, PAGE_SIZE, nid, &addr);
  379. return map;
  380. }
  381. #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
  382. static void *sparsemap_buf __meminitdata;
  383. static void *sparsemap_buf_end __meminitdata;
  384. static inline void __meminit sparse_buffer_free(unsigned long size)
  385. {
  386. WARN_ON(!sparsemap_buf || size == 0);
  387. memblock_free(sparsemap_buf, size);
  388. }
  389. static void __init sparse_buffer_init(unsigned long size, int nid)
  390. {
  391. phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
  392. WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
  393. /*
  394. * Pre-allocated buffer is mainly used by __populate_section_memmap
  395. * and we want it to be properly aligned to the section size - this is
  396. * especially the case for VMEMMAP which maps memmap to PMDs
  397. */
  398. sparsemap_buf = memmap_alloc(size, section_map_size(), addr, nid, true);
  399. sparsemap_buf_end = sparsemap_buf + size;
  400. }
  401. static void __init sparse_buffer_fini(void)
  402. {
  403. unsigned long size = sparsemap_buf_end - sparsemap_buf;
  404. if (sparsemap_buf && size > 0)
  405. sparse_buffer_free(size);
  406. sparsemap_buf = NULL;
  407. }
  408. void * __meminit sparse_buffer_alloc(unsigned long size)
  409. {
  410. void *ptr = NULL;
  411. if (sparsemap_buf) {
  412. ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
  413. if (ptr + size > sparsemap_buf_end)
  414. ptr = NULL;
  415. else {
  416. /* Free redundant aligned space */
  417. if ((unsigned long)(ptr - sparsemap_buf) > 0)
  418. sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
  419. sparsemap_buf = ptr + size;
  420. }
  421. }
  422. return ptr;
  423. }
  424. void __weak __meminit vmemmap_populate_print_last(void)
  425. {
  426. }
  427. static void *sparse_usagebuf __meminitdata;
  428. static void *sparse_usagebuf_end __meminitdata;
  429. /*
  430. * Helper function that is used for generic section initialization, and
  431. * can also be used by any hooks added above.
  432. */
  433. void __init sparse_init_early_section(int nid, struct page *map,
  434. unsigned long pnum, unsigned long flags)
  435. {
  436. BUG_ON(!sparse_usagebuf || sparse_usagebuf >= sparse_usagebuf_end);
  437. check_usemap_section_nr(nid, sparse_usagebuf);
  438. sparse_init_one_section(__nr_to_section(pnum), pnum, map,
  439. sparse_usagebuf, SECTION_IS_EARLY | flags);
  440. sparse_usagebuf = (void *)sparse_usagebuf + mem_section_usage_size();
  441. }
  442. static int __init sparse_usage_init(int nid, unsigned long map_count)
  443. {
  444. unsigned long size;
  445. size = mem_section_usage_size() * map_count;
  446. sparse_usagebuf = sparse_early_usemaps_alloc_pgdat_section(
  447. NODE_DATA(nid), size);
  448. if (!sparse_usagebuf) {
  449. sparse_usagebuf_end = NULL;
  450. return -ENOMEM;
  451. }
  452. sparse_usagebuf_end = sparse_usagebuf + size;
  453. return 0;
  454. }
  455. static void __init sparse_usage_fini(void)
  456. {
  457. sparse_usagebuf = sparse_usagebuf_end = NULL;
  458. }
  459. /*
  460. * Initialize sparse on a specific node. The node spans [pnum_begin, pnum_end)
  461. * And number of present sections in this node is map_count.
  462. */
  463. static void __init sparse_init_nid(int nid, unsigned long pnum_begin,
  464. unsigned long pnum_end,
  465. unsigned long map_count)
  466. {
  467. unsigned long pnum;
  468. struct page *map;
  469. struct mem_section *ms;
  470. if (sparse_usage_init(nid, map_count)) {
  471. pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
  472. goto failed;
  473. }
  474. sparse_buffer_init(map_count * section_map_size(), nid);
  475. sparse_vmemmap_init_nid_early(nid);
  476. for_each_present_section_nr(pnum_begin, pnum) {
  477. unsigned long pfn = section_nr_to_pfn(pnum);
  478. if (pnum >= pnum_end)
  479. break;
  480. ms = __nr_to_section(pnum);
  481. if (!preinited_vmemmap_section(ms)) {
  482. map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
  483. nid, NULL, NULL);
  484. if (!map) {
  485. pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
  486. __func__, nid);
  487. pnum_begin = pnum;
  488. sparse_usage_fini();
  489. sparse_buffer_fini();
  490. goto failed;
  491. }
  492. memmap_boot_pages_add(DIV_ROUND_UP(PAGES_PER_SECTION * sizeof(struct page),
  493. PAGE_SIZE));
  494. sparse_init_early_section(nid, map, pnum, 0);
  495. }
  496. }
  497. sparse_usage_fini();
  498. sparse_buffer_fini();
  499. return;
  500. failed:
  501. /*
  502. * We failed to allocate, mark all the following pnums as not present,
  503. * except the ones already initialized earlier.
  504. */
  505. for_each_present_section_nr(pnum_begin, pnum) {
  506. if (pnum >= pnum_end)
  507. break;
  508. ms = __nr_to_section(pnum);
  509. if (!preinited_vmemmap_section(ms))
  510. ms->section_mem_map = 0;
  511. ms->section_mem_map = 0;
  512. }
  513. }
  514. /*
  515. * Allocate the accumulated non-linear sections, allocate a mem_map
  516. * for each and record the physical to section mapping.
  517. */
  518. void __init sparse_init(void)
  519. {
  520. unsigned long pnum_end, pnum_begin, map_count = 1;
  521. int nid_begin;
  522. /* see include/linux/mmzone.h 'struct mem_section' definition */
  523. BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section)));
  524. memblocks_present();
  525. pnum_begin = first_present_section_nr();
  526. nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
  527. /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
  528. set_pageblock_order();
  529. for_each_present_section_nr(pnum_begin + 1, pnum_end) {
  530. int nid = sparse_early_nid(__nr_to_section(pnum_end));
  531. if (nid == nid_begin) {
  532. map_count++;
  533. continue;
  534. }
  535. /* Init node with sections in range [pnum_begin, pnum_end) */
  536. sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
  537. nid_begin = nid;
  538. pnum_begin = pnum_end;
  539. map_count = 1;
  540. }
  541. /* cover the last node */
  542. sparse_init_nid(nid_begin, pnum_begin, pnum_end, map_count);
  543. vmemmap_populate_print_last();
  544. }
  545. #ifdef CONFIG_MEMORY_HOTPLUG
  546. /* Mark all memory sections within the pfn range as online */
  547. void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
  548. {
  549. unsigned long pfn;
  550. for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
  551. unsigned long section_nr = pfn_to_section_nr(pfn);
  552. struct mem_section *ms;
  553. /* onlining code should never touch invalid ranges */
  554. if (WARN_ON(!valid_section_nr(section_nr)))
  555. continue;
  556. ms = __nr_to_section(section_nr);
  557. ms->section_mem_map |= SECTION_IS_ONLINE;
  558. }
  559. }
  560. /* Mark all memory sections within the pfn range as offline */
  561. void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
  562. {
  563. unsigned long pfn;
  564. for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
  565. unsigned long section_nr = pfn_to_section_nr(pfn);
  566. struct mem_section *ms;
  567. /*
  568. * TODO this needs some double checking. Offlining code makes
  569. * sure to check pfn_valid but those checks might be just bogus
  570. */
  571. if (WARN_ON(!valid_section_nr(section_nr)))
  572. continue;
  573. ms = __nr_to_section(section_nr);
  574. ms->section_mem_map &= ~SECTION_IS_ONLINE;
  575. }
  576. }
  577. #ifdef CONFIG_SPARSEMEM_VMEMMAP
  578. static struct page * __meminit populate_section_memmap(unsigned long pfn,
  579. unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
  580. struct dev_pagemap *pgmap)
  581. {
  582. return __populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
  583. }
  584. static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
  585. struct vmem_altmap *altmap)
  586. {
  587. unsigned long start = (unsigned long) pfn_to_page(pfn);
  588. unsigned long end = start + nr_pages * sizeof(struct page);
  589. vmemmap_free(start, end, altmap);
  590. }
  591. static void free_map_bootmem(struct page *memmap)
  592. {
  593. unsigned long start = (unsigned long)memmap;
  594. unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
  595. vmemmap_free(start, end, NULL);
  596. }
  597. static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
  598. {
  599. DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
  600. DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
  601. struct mem_section *ms = __pfn_to_section(pfn);
  602. unsigned long *subsection_map = ms->usage
  603. ? &ms->usage->subsection_map[0] : NULL;
  604. subsection_mask_set(map, pfn, nr_pages);
  605. if (subsection_map)
  606. bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
  607. if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
  608. "section already deactivated (%#lx + %ld)\n",
  609. pfn, nr_pages))
  610. return -EINVAL;
  611. bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
  612. return 0;
  613. }
  614. static bool is_subsection_map_empty(struct mem_section *ms)
  615. {
  616. return bitmap_empty(&ms->usage->subsection_map[0],
  617. SUBSECTIONS_PER_SECTION);
  618. }
  619. static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
  620. {
  621. struct mem_section *ms = __pfn_to_section(pfn);
  622. DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
  623. unsigned long *subsection_map;
  624. int rc = 0;
  625. subsection_mask_set(map, pfn, nr_pages);
  626. subsection_map = &ms->usage->subsection_map[0];
  627. if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
  628. rc = -EINVAL;
  629. else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
  630. rc = -EEXIST;
  631. else
  632. bitmap_or(subsection_map, map, subsection_map,
  633. SUBSECTIONS_PER_SECTION);
  634. return rc;
  635. }
  636. #else
  637. static struct page * __meminit populate_section_memmap(unsigned long pfn,
  638. unsigned long nr_pages, int nid, struct vmem_altmap *altmap,
  639. struct dev_pagemap *pgmap)
  640. {
  641. return kvmalloc_node(array_size(sizeof(struct page),
  642. PAGES_PER_SECTION), GFP_KERNEL, nid);
  643. }
  644. static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
  645. struct vmem_altmap *altmap)
  646. {
  647. kvfree(pfn_to_page(pfn));
  648. }
  649. static void free_map_bootmem(struct page *memmap)
  650. {
  651. unsigned long maps_section_nr, removing_section_nr, i;
  652. unsigned long type, nr_pages;
  653. struct page *page = virt_to_page(memmap);
  654. nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
  655. >> PAGE_SHIFT;
  656. for (i = 0; i < nr_pages; i++, page++) {
  657. type = bootmem_type(page);
  658. BUG_ON(type == NODE_INFO);
  659. maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
  660. removing_section_nr = bootmem_info(page);
  661. /*
  662. * When this function is called, the removing section is
  663. * logical offlined state. This means all pages are isolated
  664. * from page allocator. If removing section's memmap is placed
  665. * on the same section, it must not be freed.
  666. * If it is freed, page allocator may allocate it which will
  667. * be removed physically soon.
  668. */
  669. if (maps_section_nr != removing_section_nr)
  670. put_page_bootmem(page);
  671. }
  672. }
  673. static int clear_subsection_map(unsigned long pfn, unsigned long nr_pages)
  674. {
  675. return 0;
  676. }
  677. static bool is_subsection_map_empty(struct mem_section *ms)
  678. {
  679. return true;
  680. }
  681. static int fill_subsection_map(unsigned long pfn, unsigned long nr_pages)
  682. {
  683. return 0;
  684. }
  685. #endif /* CONFIG_SPARSEMEM_VMEMMAP */
  686. /*
  687. * To deactivate a memory region, there are 3 cases to handle across
  688. * two configurations (SPARSEMEM_VMEMMAP={y,n}):
  689. *
  690. * 1. deactivation of a partial hot-added section (only possible in
  691. * the SPARSEMEM_VMEMMAP=y case).
  692. * a) section was present at memory init.
  693. * b) section was hot-added post memory init.
  694. * 2. deactivation of a complete hot-added section.
  695. * 3. deactivation of a complete section from memory init.
  696. *
  697. * For 1, when subsection_map does not empty we will not be freeing the
  698. * usage map, but still need to free the vmemmap range.
  699. *
  700. * For 2 and 3, the SPARSEMEM_VMEMMAP={y,n} cases are unified
  701. */
  702. static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
  703. struct vmem_altmap *altmap)
  704. {
  705. struct mem_section *ms = __pfn_to_section(pfn);
  706. bool section_is_early = early_section(ms);
  707. struct page *memmap = NULL;
  708. bool empty;
  709. if (clear_subsection_map(pfn, nr_pages))
  710. return;
  711. empty = is_subsection_map_empty(ms);
  712. if (empty) {
  713. unsigned long section_nr = pfn_to_section_nr(pfn);
  714. /*
  715. * Mark the section invalid so that valid_section()
  716. * return false. This prevents code from dereferencing
  717. * ms->usage array.
  718. */
  719. ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
  720. /*
  721. * When removing an early section, the usage map is kept (as the
  722. * usage maps of other sections fall into the same page). It
  723. * will be re-used when re-adding the section - which is then no
  724. * longer an early section. If the usage map is PageReserved, it
  725. * was allocated during boot.
  726. */
  727. if (!PageReserved(virt_to_page(ms->usage))) {
  728. kfree_rcu(ms->usage, rcu);
  729. WRITE_ONCE(ms->usage, NULL);
  730. }
  731. memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
  732. }
  733. /*
  734. * The memmap of early sections is always fully populated. See
  735. * section_activate() and pfn_valid() .
  736. */
  737. if (!section_is_early) {
  738. memmap_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE)));
  739. depopulate_section_memmap(pfn, nr_pages, altmap);
  740. } else if (memmap) {
  741. memmap_boot_pages_add(-1L * (DIV_ROUND_UP(nr_pages * sizeof(struct page),
  742. PAGE_SIZE)));
  743. free_map_bootmem(memmap);
  744. }
  745. if (empty)
  746. ms->section_mem_map = (unsigned long)NULL;
  747. }
  748. static struct page * __meminit section_activate(int nid, unsigned long pfn,
  749. unsigned long nr_pages, struct vmem_altmap *altmap,
  750. struct dev_pagemap *pgmap)
  751. {
  752. struct mem_section *ms = __pfn_to_section(pfn);
  753. struct mem_section_usage *usage = NULL;
  754. struct page *memmap;
  755. int rc;
  756. if (!ms->usage) {
  757. usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
  758. if (!usage)
  759. return ERR_PTR(-ENOMEM);
  760. ms->usage = usage;
  761. }
  762. rc = fill_subsection_map(pfn, nr_pages);
  763. if (rc) {
  764. if (usage)
  765. ms->usage = NULL;
  766. kfree(usage);
  767. return ERR_PTR(rc);
  768. }
  769. /*
  770. * The early init code does not consider partially populated
  771. * initial sections, it simply assumes that memory will never be
  772. * referenced. If we hot-add memory into such a section then we
  773. * do not need to populate the memmap and can simply reuse what
  774. * is already there.
  775. */
  776. if (nr_pages < PAGES_PER_SECTION && early_section(ms))
  777. return pfn_to_page(pfn);
  778. memmap = populate_section_memmap(pfn, nr_pages, nid, altmap, pgmap);
  779. if (!memmap) {
  780. section_deactivate(pfn, nr_pages, altmap);
  781. return ERR_PTR(-ENOMEM);
  782. }
  783. memmap_pages_add(DIV_ROUND_UP(nr_pages * sizeof(struct page), PAGE_SIZE));
  784. return memmap;
  785. }
  786. /**
  787. * sparse_add_section - add a memory section, or populate an existing one
  788. * @nid: The node to add section on
  789. * @start_pfn: start pfn of the memory range
  790. * @nr_pages: number of pfns to add in the section
  791. * @altmap: alternate pfns to allocate the memmap backing store
  792. * @pgmap: alternate compound page geometry for devmap mappings
  793. *
  794. * This is only intended for hotplug.
  795. *
  796. * Note that only VMEMMAP supports sub-section aligned hotplug,
  797. * the proper alignment and size are gated by check_pfn_span().
  798. *
  799. *
  800. * Return:
  801. * * 0 - On success.
  802. * * -EEXIST - Section has been present.
  803. * * -ENOMEM - Out of memory.
  804. */
  805. int __meminit sparse_add_section(int nid, unsigned long start_pfn,
  806. unsigned long nr_pages, struct vmem_altmap *altmap,
  807. struct dev_pagemap *pgmap)
  808. {
  809. unsigned long section_nr = pfn_to_section_nr(start_pfn);
  810. struct mem_section *ms;
  811. struct page *memmap;
  812. int ret;
  813. ret = sparse_index_init(section_nr, nid);
  814. if (ret < 0)
  815. return ret;
  816. memmap = section_activate(nid, start_pfn, nr_pages, altmap, pgmap);
  817. if (IS_ERR(memmap))
  818. return PTR_ERR(memmap);
  819. /*
  820. * Poison uninitialized struct pages in order to catch invalid flags
  821. * combinations.
  822. */
  823. page_init_poison(memmap, sizeof(struct page) * nr_pages);
  824. ms = __nr_to_section(section_nr);
  825. set_section_nid(section_nr, nid);
  826. __section_mark_present(ms, section_nr);
  827. /* Align memmap to section boundary in the subsection case */
  828. if (section_nr_to_pfn(section_nr) != start_pfn)
  829. memmap = pfn_to_page(section_nr_to_pfn(section_nr));
  830. sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
  831. return 0;
  832. }
  833. void sparse_remove_section(unsigned long pfn, unsigned long nr_pages,
  834. struct vmem_altmap *altmap)
  835. {
  836. struct mem_section *ms = __pfn_to_section(pfn);
  837. if (WARN_ON_ONCE(!valid_section(ms)))
  838. return;
  839. section_deactivate(pfn, nr_pages, altmap);
  840. }
  841. #endif /* CONFIG_MEMORY_HOTPLUG */