numa_emulation.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * NUMA emulation
  4. */
  5. #include <linux/kernel.h>
  6. #include <linux/errno.h>
  7. #include <linux/topology.h>
  8. #include <linux/memblock.h>
  9. #include <linux/numa_memblks.h>
  10. #include <asm/numa.h>
  11. #include <acpi/acpi_numa.h>
/* Granule for fake node sizes: 32 MiB. Sizes below are rounded to this. */
#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
/* Mask that clears the bits below FAKE_NODE_MIN_SIZE (rounds a size down). */
#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
/*
 * Emulated node id -> physical node id. Filled in by numa_emulation() and
 * read by numa_add_cpu()/numa_remove_cpu().
 */
int emu_nid_to_phys[MAX_NUMNODES];
/* Argument string saved by numa_emu_cmdline(); consumed by numa_emulation(). */
static char *emu_cmdline __initdata;
  16. int __init numa_emu_cmdline(char *str)
  17. {
  18. emu_cmdline = str;
  19. return 0;
  20. }
  21. static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
  22. {
  23. int i;
  24. for (i = 0; i < mi->nr_blks; i++)
  25. if (mi->blk[i].nid == nid)
  26. return i;
  27. return -ENOENT;
  28. }
  29. static u64 __init mem_hole_size(u64 start, u64 end)
  30. {
  31. unsigned long start_pfn = PFN_UP(start);
  32. unsigned long end_pfn = PFN_DOWN(end);
  33. if (start_pfn < end_pfn)
  34. return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
  35. return 0;
  36. }
  37. /*
  38. * Sets up nid to range from @start to @end. The return value is -errno if
  39. * something went wrong, 0 otherwise.
  40. */
  41. static int __init emu_setup_memblk(struct numa_meminfo *ei,
  42. struct numa_meminfo *pi,
  43. int nid, int phys_blk, u64 size)
  44. {
  45. struct numa_memblk *eb = &ei->blk[ei->nr_blks];
  46. struct numa_memblk *pb = &pi->blk[phys_blk];
  47. if (ei->nr_blks >= NR_NODE_MEMBLKS) {
  48. pr_err("NUMA: Too many emulated memblks, failing emulation\n");
  49. return -EINVAL;
  50. }
  51. ei->nr_blks++;
  52. eb->start = pb->start;
  53. eb->end = pb->start + size;
  54. eb->nid = nid;
  55. if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
  56. emu_nid_to_phys[nid] = pb->nid;
  57. pb->start += size;
  58. if (pb->start >= pb->end) {
  59. WARN_ON_ONCE(pb->start > pb->end);
  60. numa_remove_memblk_from(phys_blk, pi);
  61. }
  62. printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
  63. nid, eb->start, eb->end - 1, (eb->end - eb->start) / SZ_1M);
  64. return 0;
  65. }
  66. /*
  67. * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
  68. * to max_addr.
  69. *
  70. * Returns zero on success or negative on error.
  71. */
  72. static int __init split_nodes_interleave(struct numa_meminfo *ei,
  73. struct numa_meminfo *pi,
  74. u64 addr, u64 max_addr, int nr_nodes)
  75. {
  76. nodemask_t physnode_mask = numa_nodes_parsed;
  77. u64 size;
  78. int big;
  79. int nid = 0;
  80. int i, ret;
  81. if (nr_nodes <= 0)
  82. return -1;
  83. if (nr_nodes > MAX_NUMNODES) {
  84. pr_info("numa=fake=%d too large, reducing to %d\n",
  85. nr_nodes, MAX_NUMNODES);
  86. nr_nodes = MAX_NUMNODES;
  87. }
  88. /*
  89. * Calculate target node size. x86_32 freaks on __udivdi3() so do
  90. * the division in ulong number of pages and convert back.
  91. */
  92. size = max_addr - addr - mem_hole_size(addr, max_addr);
  93. size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
  94. /*
  95. * Calculate the number of big nodes that can be allocated as a result
  96. * of consolidating the remainder.
  97. */
  98. big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
  99. FAKE_NODE_MIN_SIZE;
  100. size &= FAKE_NODE_MIN_HASH_MASK;
  101. if (!size) {
  102. pr_err("Not enough memory for each node. "
  103. "NUMA emulation disabled.\n");
  104. return -1;
  105. }
  106. /*
  107. * Continue to fill physical nodes with fake nodes until there is no
  108. * memory left on any of them.
  109. */
  110. while (!nodes_empty(physnode_mask)) {
  111. for_each_node_mask(i, physnode_mask) {
  112. u64 dma32_end = numa_emu_dma_end();
  113. u64 start, limit, end;
  114. int phys_blk;
  115. phys_blk = emu_find_memblk_by_nid(i, pi);
  116. if (phys_blk < 0) {
  117. node_clear(i, physnode_mask);
  118. continue;
  119. }
  120. start = pi->blk[phys_blk].start;
  121. limit = pi->blk[phys_blk].end;
  122. end = start + size;
  123. if (nid < big)
  124. end += FAKE_NODE_MIN_SIZE;
  125. /*
  126. * Continue to add memory to this fake node if its
  127. * non-reserved memory is less than the per-node size.
  128. */
  129. while (end - start - mem_hole_size(start, end) < size) {
  130. end += FAKE_NODE_MIN_SIZE;
  131. if (end > limit) {
  132. end = limit;
  133. break;
  134. }
  135. }
  136. /*
  137. * If there won't be at least FAKE_NODE_MIN_SIZE of
  138. * non-reserved memory in ZONE_DMA32 for the next node,
  139. * this one must extend to the boundary.
  140. */
  141. if (end < dma32_end && dma32_end - end -
  142. mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
  143. end = dma32_end;
  144. /*
  145. * If there won't be enough non-reserved memory for the
  146. * next node, this one must extend to the end of the
  147. * physical node.
  148. */
  149. if (limit - end - mem_hole_size(end, limit) < size)
  150. end = limit;
  151. ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
  152. phys_blk,
  153. min(end, limit) - start);
  154. if (ret < 0)
  155. return ret;
  156. }
  157. }
  158. return 0;
  159. }
  160. /*
  161. * Returns the end address of a node so that there is at least `size' amount of
  162. * non-reserved memory or `max_addr' is reached.
  163. */
  164. static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
  165. {
  166. u64 end = start + size;
  167. while (end - start - mem_hole_size(start, end) < size) {
  168. end += FAKE_NODE_MIN_SIZE;
  169. if (end > max_addr) {
  170. end = max_addr;
  171. break;
  172. }
  173. }
  174. return end;
  175. }
  176. static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
  177. {
  178. unsigned long max_pfn = PHYS_PFN(max_addr);
  179. unsigned long base_pfn = PHYS_PFN(base);
  180. unsigned long hole_pfns = PHYS_PFN(hole);
  181. return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
  182. }
  183. /*
  184. * Sets up fake nodes of `size' interleaved over physical nodes ranging from
  185. * `addr' to `max_addr'.
  186. *
  187. * Returns zero on success or negative on error.
  188. */
  189. static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
  190. struct numa_meminfo *pi,
  191. u64 addr, u64 max_addr, u64 size,
  192. int nr_nodes, struct numa_memblk *pblk,
  193. int nid)
  194. {
  195. nodemask_t physnode_mask = numa_nodes_parsed;
  196. int i, ret, uniform = 0;
  197. u64 min_size;
  198. if ((!size && !nr_nodes) || (nr_nodes && !pblk))
  199. return -1;
  200. /*
  201. * In the 'uniform' case split the passed in physical node by
  202. * nr_nodes, in the non-uniform case, ignore the passed in
  203. * physical block and try to create nodes of at least size
  204. * @size.
  205. *
  206. * In the uniform case, split the nodes strictly by physical
  207. * capacity, i.e. ignore holes. In the non-uniform case account
  208. * for holes and treat @size as a minimum floor.
  209. */
  210. if (!nr_nodes)
  211. nr_nodes = MAX_NUMNODES;
  212. else {
  213. nodes_clear(physnode_mask);
  214. node_set(pblk->nid, physnode_mask);
  215. uniform = 1;
  216. }
  217. if (uniform) {
  218. min_size = uniform_size(max_addr, addr, 0, nr_nodes);
  219. size = min_size;
  220. } else {
  221. /*
  222. * The limit on emulated nodes is MAX_NUMNODES, so the
  223. * size per node is increased accordingly if the
  224. * requested size is too small. This creates a uniform
  225. * distribution of node sizes across the entire machine
  226. * (but not necessarily over physical nodes).
  227. */
  228. min_size = uniform_size(max_addr, addr,
  229. mem_hole_size(addr, max_addr), nr_nodes);
  230. }
  231. min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
  232. if (size < min_size) {
  233. pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
  234. size / SZ_1M, min_size / SZ_1M);
  235. size = min_size;
  236. }
  237. size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
  238. /*
  239. * Fill physical nodes with fake nodes of size until there is no memory
  240. * left on any of them.
  241. */
  242. while (!nodes_empty(physnode_mask)) {
  243. for_each_node_mask(i, physnode_mask) {
  244. u64 dma32_end = numa_emu_dma_end();
  245. u64 start, limit, end;
  246. int phys_blk;
  247. phys_blk = emu_find_memblk_by_nid(i, pi);
  248. if (phys_blk < 0) {
  249. node_clear(i, physnode_mask);
  250. continue;
  251. }
  252. start = pi->blk[phys_blk].start;
  253. limit = pi->blk[phys_blk].end;
  254. if (uniform)
  255. end = start + size;
  256. else
  257. end = find_end_of_node(start, limit, size);
  258. /*
  259. * If there won't be at least FAKE_NODE_MIN_SIZE of
  260. * non-reserved memory in ZONE_DMA32 for the next node,
  261. * this one must extend to the boundary.
  262. */
  263. if (end < dma32_end && dma32_end - end -
  264. mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
  265. end = dma32_end;
  266. /*
  267. * If there won't be enough non-reserved memory for the
  268. * next node, this one must extend to the end of the
  269. * physical node.
  270. */
  271. if ((limit - end - mem_hole_size(end, limit) < size)
  272. && !uniform)
  273. end = limit;
  274. ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
  275. phys_blk,
  276. min(end, limit) - start);
  277. if (ret < 0)
  278. return ret;
  279. }
  280. }
  281. return nid;
  282. }
  283. static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
  284. struct numa_meminfo *pi,
  285. u64 addr, u64 max_addr, u64 size)
  286. {
  287. return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
  288. 0, NULL, 0);
  289. }
  290. static int __init setup_emu2phys_nid(int *dfl_phys_nid)
  291. {
  292. int i, max_emu_nid = 0;
  293. *dfl_phys_nid = NUMA_NO_NODE;
  294. for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
  295. if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
  296. max_emu_nid = i;
  297. if (*dfl_phys_nid == NUMA_NO_NODE)
  298. *dfl_phys_nid = emu_nid_to_phys[i];
  299. }
  300. }
  301. return max_emu_nid;
  302. }
  303. /**
  304. * numa_emulation - Emulate NUMA nodes
  305. * @numa_meminfo: NUMA configuration to massage
  306. * @numa_dist_cnt: The size of the physical NUMA distance table
  307. *
  308. * Emulate NUMA nodes according to the numa=fake kernel parameter.
  309. * @numa_meminfo contains the physical memory configuration and is modified
  310. * to reflect the emulated configuration on success. @numa_dist_cnt is
  311. * used to determine the size of the physical distance table.
  312. *
  313. * On success, the following modifications are made.
  314. *
  315. * - @numa_meminfo is updated to reflect the emulated nodes.
  316. *
  317. * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
  318. * emulated nodes.
  319. *
  320. * - NUMA distance table is rebuilt to represent distances between emulated
  321. * nodes. The distances are determined considering how emulated nodes
  322. * are mapped to physical nodes and match the actual distances.
  323. *
  324. * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
  325. * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
  326. *
  327. * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
  328. * identity mapping and no other modification is made.
  329. */
  330. void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
  331. {
  332. static struct numa_meminfo ei __initdata;
  333. static struct numa_meminfo pi __initdata;
  334. const u64 max_addr = PFN_PHYS(max_pfn);
  335. u8 *phys_dist = NULL;
  336. size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
  337. int max_emu_nid, dfl_phys_nid;
  338. int i, j, ret;
  339. nodemask_t physnode_mask = numa_nodes_parsed;
  340. if (!emu_cmdline)
  341. goto no_emu;
  342. memset(&ei, 0, sizeof(ei));
  343. pi = *numa_meminfo;
  344. for (i = 0; i < MAX_NUMNODES; i++)
  345. emu_nid_to_phys[i] = NUMA_NO_NODE;
  346. /*
  347. * If the numa=fake command-line contains a 'M' or 'G', it represents
  348. * the fixed node size. Otherwise, if it is just a single number N,
  349. * split the system RAM into N fake nodes.
  350. */
  351. if (strchr(emu_cmdline, 'U')) {
  352. unsigned long n;
  353. int nid = 0;
  354. n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
  355. ret = -1;
  356. for_each_node_mask(i, physnode_mask) {
  357. /*
  358. * The reason we pass in blk[0] is due to
  359. * numa_remove_memblk_from() called by
  360. * emu_setup_memblk() will delete entry 0
  361. * and then move everything else up in the pi.blk
  362. * array. Therefore we should always be looking
  363. * at blk[0].
  364. */
  365. ret = split_nodes_size_interleave_uniform(&ei, &pi,
  366. pi.blk[0].start, pi.blk[0].end, 0,
  367. n, &pi.blk[0], nid);
  368. if (ret < 0)
  369. break;
  370. if (ret < n) {
  371. pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
  372. __func__, i, ret, n);
  373. ret = -1;
  374. break;
  375. }
  376. nid = ret;
  377. }
  378. } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
  379. u64 size;
  380. size = memparse(emu_cmdline, &emu_cmdline);
  381. ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
  382. } else {
  383. unsigned long n;
  384. n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
  385. ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
  386. }
  387. if (*emu_cmdline == ':')
  388. emu_cmdline++;
  389. if (ret < 0)
  390. goto no_emu;
  391. if (numa_cleanup_meminfo(&ei) < 0) {
  392. pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
  393. goto no_emu;
  394. }
  395. /* copy the physical distance table */
  396. if (numa_dist_cnt) {
  397. phys_dist = memblock_alloc(phys_size, PAGE_SIZE);
  398. if (!phys_dist) {
  399. pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
  400. goto no_emu;
  401. }
  402. for (i = 0; i < numa_dist_cnt; i++)
  403. for (j = 0; j < numa_dist_cnt; j++)
  404. phys_dist[i * numa_dist_cnt + j] =
  405. node_distance(i, j);
  406. }
  407. /*
  408. * Determine the max emulated nid and the default phys nid to use
  409. * for unmapped nodes.
  410. */
  411. max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
  412. /* Make sure numa_nodes_parsed only contains emulated nodes */
  413. nodes_clear(numa_nodes_parsed);
  414. for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
  415. if (ei.blk[i].start != ei.blk[i].end &&
  416. ei.blk[i].nid != NUMA_NO_NODE)
  417. node_set(ei.blk[i].nid, numa_nodes_parsed);
  418. /* fix pxm_to_node_map[] and node_to_pxm_map[] to avoid collision
  419. * with faked numa nodes, particularly during later memory hotplug
  420. * handling, and also update numa_nodes_parsed accordingly.
  421. */
  422. ret = fix_pxm_node_maps(max_emu_nid);
  423. if (ret < 0)
  424. goto no_emu;
  425. /* commit */
  426. *numa_meminfo = ei;
  427. numa_emu_update_cpu_to_node(emu_nid_to_phys, max_emu_nid + 1);
  428. /* make sure all emulated nodes are mapped to a physical node */
  429. for (i = 0; i < max_emu_nid + 1; i++)
  430. if (emu_nid_to_phys[i] == NUMA_NO_NODE)
  431. emu_nid_to_phys[i] = dfl_phys_nid;
  432. /* transform distance table */
  433. numa_reset_distance();
  434. for (i = 0; i < max_emu_nid + 1; i++) {
  435. for (j = 0; j < max_emu_nid + 1; j++) {
  436. int physi = emu_nid_to_phys[i];
  437. int physj = emu_nid_to_phys[j];
  438. int dist;
  439. if (get_option(&emu_cmdline, &dist) == 2)
  440. ;
  441. else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
  442. dist = physi == physj ?
  443. LOCAL_DISTANCE : REMOTE_DISTANCE;
  444. else
  445. dist = phys_dist[physi * numa_dist_cnt + physj];
  446. numa_set_distance(i, j, dist);
  447. }
  448. }
  449. for (i = 0; i < numa_distance_cnt; i++) {
  450. for (j = 0; j < numa_distance_cnt; j++) {
  451. int physi, physj;
  452. u8 dist;
  453. /* distance between fake nodes is already ok */
  454. if (emu_nid_to_phys[i] != NUMA_NO_NODE &&
  455. emu_nid_to_phys[j] != NUMA_NO_NODE)
  456. continue;
  457. if (emu_nid_to_phys[i] != NUMA_NO_NODE)
  458. physi = emu_nid_to_phys[i];
  459. else
  460. physi = i - max_emu_nid;
  461. if (emu_nid_to_phys[j] != NUMA_NO_NODE)
  462. physj = emu_nid_to_phys[j];
  463. else
  464. physj = j - max_emu_nid;
  465. dist = phys_dist[physi * numa_dist_cnt + physj];
  466. numa_set_distance(i, j, dist);
  467. }
  468. }
  469. /* free the copied physical distance table */
  470. memblock_free(phys_dist, phys_size);
  471. return;
  472. no_emu:
  473. numa_nodes_parsed = physnode_mask;
  474. /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
  475. for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
  476. emu_nid_to_phys[i] = i;
  477. }
  478. #ifndef CONFIG_DEBUG_PER_CPU_MAPS
  479. void numa_add_cpu(unsigned int cpu)
  480. {
  481. int physnid, nid;
  482. nid = early_cpu_to_node(cpu);
  483. BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
  484. physnid = emu_nid_to_phys[nid];
  485. /*
  486. * Map the cpu to each emulated node that is allocated on the physical
  487. * node of the cpu's apic id.
  488. */
  489. for_each_online_node(nid)
  490. if (emu_nid_to_phys[nid] == physnid)
  491. cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
  492. }
  493. void numa_remove_cpu(unsigned int cpu)
  494. {
  495. int i;
  496. for_each_online_node(i)
  497. cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
  498. }
  499. #else /* !CONFIG_DEBUG_PER_CPU_MAPS */
  500. static void numa_set_cpumask(unsigned int cpu, bool enable)
  501. {
  502. int nid, physnid;
  503. nid = early_cpu_to_node(cpu);
  504. if (nid == NUMA_NO_NODE) {
  505. /* early_cpu_to_node() already emits a warning and trace */
  506. return;
  507. }
  508. physnid = emu_nid_to_phys[nid];
  509. for_each_online_node(nid) {
  510. if (emu_nid_to_phys[nid] != physnid)
  511. continue;
  512. debug_cpumask_set_cpu(cpu, nid, enable);
  513. }
  514. }
  515. void numa_add_cpu(unsigned int cpu)
  516. {
  517. numa_set_cpumask(cpu, true);
  518. }
  519. void numa_remove_cpu(unsigned int cpu)
  520. {
  521. numa_set_cpumask(cpu, false);
  522. }
  523. #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */