numa.c 36 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * pSeries NUMA support
  4. *
  5. * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
  6. */
  7. #define pr_fmt(fmt) "numa: " fmt
  8. #include <linux/threads.h>
  9. #include <linux/memblock.h>
  10. #include <linux/init.h>
  11. #include <linux/mm.h>
  12. #include <linux/mmzone.h>
  13. #include <linux/export.h>
  14. #include <linux/nodemask.h>
  15. #include <linux/cpu.h>
  16. #include <linux/notifier.h>
  17. #include <linux/of.h>
  18. #include <linux/of_address.h>
  19. #include <linux/pfn.h>
  20. #include <linux/cpuset.h>
  21. #include <linux/node.h>
  22. #include <linux/stop_machine.h>
  23. #include <linux/proc_fs.h>
  24. #include <linux/seq_file.h>
  25. #include <linux/uaccess.h>
  26. #include <linux/slab.h>
  27. #include <asm/cputhreads.h>
  28. #include <asm/sparsemem.h>
  29. #include <asm/smp.h>
  30. #include <asm/topology.h>
  31. #include <asm/firmware.h>
  32. #include <asm/paca.h>
  33. #include <asm/hvcall.h>
  34. #include <asm/setup.h>
  35. #include <asm/vdso.h>
  36. #include <asm/vphn.h>
  37. #include <asm/drmem.h>
  38. static int numa_enabled = 1;
  39. static char *cmdline __initdata;
  40. int numa_cpu_lookup_table[NR_CPUS];
  41. cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
  42. EXPORT_SYMBOL(numa_cpu_lookup_table);
  43. EXPORT_SYMBOL(node_to_cpumask_map);
  44. static int primary_domain_index;
  45. static int n_mem_addr_cells, n_mem_size_cells;
  46. #define FORM0_AFFINITY 0
  47. #define FORM1_AFFINITY 1
  48. #define FORM2_AFFINITY 2
  49. static int affinity_form;
  50. #define MAX_DISTANCE_REF_POINTS 4
  51. static int distance_ref_points_depth;
  52. static const __be32 *distance_ref_points;
  53. static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
  54. static int numa_distance_table[MAX_NUMNODES][MAX_NUMNODES] = {
  55. [0 ... MAX_NUMNODES - 1] = { [0 ... MAX_NUMNODES - 1] = -1 }
  56. };
  57. static int numa_id_index_table[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE };
  58. /*
  59. * Allocate node_to_cpumask_map based on number of available nodes
  60. * Requires node_possible_map to be valid.
  61. *
  62. * Note: cpumask_of_node() is not valid until after this is done.
  63. */
  64. static void __init setup_node_to_cpumask_map(void)
  65. {
  66. unsigned int node;
  67. /* setup nr_node_ids if not done yet */
  68. if (nr_node_ids == MAX_NUMNODES)
  69. setup_nr_node_ids();
  70. /* allocate the map */
  71. for_each_node(node)
  72. alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
  73. /* cpumask_of_node() will now work */
  74. pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
  75. }
  76. static int __init fake_numa_create_new_node(unsigned long end_pfn,
  77. unsigned int *nid)
  78. {
  79. unsigned long long mem;
  80. char *p = cmdline;
  81. static unsigned int fake_nid;
  82. static unsigned long long curr_boundary;
  83. /*
  84. * Modify node id, iff we started creating NUMA nodes
  85. * We want to continue from where we left of the last time
  86. */
  87. if (fake_nid)
  88. *nid = fake_nid;
  89. /*
  90. * In case there are no more arguments to parse, the
  91. * node_id should be the same as the last fake node id
  92. * (we've handled this above).
  93. */
  94. if (!p)
  95. return 0;
  96. mem = memparse(p, &p);
  97. if (!mem)
  98. return 0;
  99. if (mem < curr_boundary)
  100. return 0;
  101. curr_boundary = mem;
  102. if ((end_pfn << PAGE_SHIFT) > mem) {
  103. /*
  104. * Skip commas and spaces
  105. */
  106. while (*p == ',' || *p == ' ' || *p == '\t')
  107. p++;
  108. cmdline = p;
  109. fake_nid++;
  110. *nid = fake_nid;
  111. pr_debug("created new fake_node with id %d\n", fake_nid);
  112. return 1;
  113. }
  114. return 0;
  115. }
  116. static void __init reset_numa_cpu_lookup_table(void)
  117. {
  118. unsigned int cpu;
  119. for_each_possible_cpu(cpu)
  120. numa_cpu_lookup_table[cpu] = -1;
  121. }
  122. void map_cpu_to_node(int cpu, int node)
  123. {
  124. update_numa_cpu_lookup_table(cpu, node);
  125. if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node]))) {
  126. pr_debug("adding cpu %d to node %d\n", cpu, node);
  127. cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
  128. }
  129. }
  130. #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
  131. void unmap_cpu_from_node(unsigned long cpu)
  132. {
  133. int node = numa_cpu_lookup_table[cpu];
  134. if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
  135. cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
  136. pr_debug("removing cpu %lu from node %d\n", cpu, node);
  137. } else {
  138. pr_warn("Warning: cpu %lu not found in node %d\n", cpu, node);
  139. }
  140. }
  141. #endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */
  142. static int __associativity_to_nid(const __be32 *associativity,
  143. int max_array_sz)
  144. {
  145. int nid;
  146. /*
  147. * primary_domain_index is 1 based array index.
  148. */
  149. int index = primary_domain_index - 1;
  150. if (!numa_enabled || index >= max_array_sz)
  151. return NUMA_NO_NODE;
  152. nid = of_read_number(&associativity[index], 1);
  153. /* POWER4 LPAR uses 0xffff as invalid node */
  154. if (nid == 0xffff || nid >= nr_node_ids)
  155. nid = NUMA_NO_NODE;
  156. return nid;
  157. }
  158. /*
  159. * Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
  160. * info is found.
  161. */
  162. static int associativity_to_nid(const __be32 *associativity)
  163. {
  164. int array_sz = of_read_number(associativity, 1);
  165. /* Skip the first element in the associativity array */
  166. return __associativity_to_nid((associativity + 1), array_sz);
  167. }
  168. static int __cpu_form2_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
  169. {
  170. int dist;
  171. int node1, node2;
  172. node1 = associativity_to_nid(cpu1_assoc);
  173. node2 = associativity_to_nid(cpu2_assoc);
  174. dist = numa_distance_table[node1][node2];
  175. if (dist <= LOCAL_DISTANCE)
  176. return 0;
  177. else if (dist <= REMOTE_DISTANCE)
  178. return 1;
  179. else
  180. return 2;
  181. }
  182. static int __cpu_form1_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
  183. {
  184. int dist = 0;
  185. int i, index;
  186. for (i = 0; i < distance_ref_points_depth; i++) {
  187. index = be32_to_cpu(distance_ref_points[i]);
  188. if (cpu1_assoc[index] == cpu2_assoc[index])
  189. break;
  190. dist++;
  191. }
  192. return dist;
  193. }
  194. int cpu_relative_distance(__be32 *cpu1_assoc, __be32 *cpu2_assoc)
  195. {
  196. /* We should not get called with FORM0 */
  197. VM_WARN_ON(affinity_form == FORM0_AFFINITY);
  198. if (affinity_form == FORM1_AFFINITY)
  199. return __cpu_form1_relative_distance(cpu1_assoc, cpu2_assoc);
  200. return __cpu_form2_relative_distance(cpu1_assoc, cpu2_assoc);
  201. }
  202. /* must hold reference to node during call */
  203. static const __be32 *of_get_associativity(struct device_node *dev)
  204. {
  205. return of_get_property(dev, "ibm,associativity", NULL);
  206. }
  207. int __node_distance(int a, int b)
  208. {
  209. int i;
  210. int distance = LOCAL_DISTANCE;
  211. if (affinity_form == FORM2_AFFINITY)
  212. return numa_distance_table[a][b];
  213. else if (affinity_form == FORM0_AFFINITY)
  214. return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
  215. for (i = 0; i < distance_ref_points_depth; i++) {
  216. if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
  217. break;
  218. /* Double the distance for each NUMA level */
  219. distance *= 2;
  220. }
  221. return distance;
  222. }
  223. EXPORT_SYMBOL(__node_distance);
  224. /* Returns the nid associated with the given device tree node,
  225. * or -1 if not found.
  226. */
  227. static int of_node_to_nid_single(struct device_node *device)
  228. {
  229. int nid = NUMA_NO_NODE;
  230. const __be32 *tmp;
  231. tmp = of_get_associativity(device);
  232. if (tmp)
  233. nid = associativity_to_nid(tmp);
  234. return nid;
  235. }
  236. /* Walk the device tree upwards, looking for an associativity id */
  237. int of_node_to_nid(struct device_node *device)
  238. {
  239. int nid = NUMA_NO_NODE;
  240. of_node_get(device);
  241. while (device) {
  242. nid = of_node_to_nid_single(device);
  243. if (nid != -1)
  244. break;
  245. device = of_get_next_parent(device);
  246. }
  247. of_node_put(device);
  248. return nid;
  249. }
  250. EXPORT_SYMBOL(of_node_to_nid);
  251. static void __initialize_form1_numa_distance(const __be32 *associativity,
  252. int max_array_sz)
  253. {
  254. int i, nid;
  255. if (affinity_form != FORM1_AFFINITY)
  256. return;
  257. nid = __associativity_to_nid(associativity, max_array_sz);
  258. if (nid != NUMA_NO_NODE) {
  259. for (i = 0; i < distance_ref_points_depth; i++) {
  260. const __be32 *entry;
  261. int index = be32_to_cpu(distance_ref_points[i]) - 1;
  262. /*
  263. * broken hierarchy, return with broken distance table
  264. */
  265. if (WARN(index >= max_array_sz, "Broken ibm,associativity property"))
  266. return;
  267. entry = &associativity[index];
  268. distance_lookup_table[nid][i] = of_read_number(entry, 1);
  269. }
  270. }
  271. }
  272. static void initialize_form1_numa_distance(const __be32 *associativity)
  273. {
  274. int array_sz;
  275. array_sz = of_read_number(associativity, 1);
  276. /* Skip the first element in the associativity array */
  277. __initialize_form1_numa_distance(associativity + 1, array_sz);
  278. }
  279. /*
  280. * Used to update distance information w.r.t newly added node.
  281. */
  282. void update_numa_distance(struct device_node *node)
  283. {
  284. int nid;
  285. if (affinity_form == FORM0_AFFINITY)
  286. return;
  287. else if (affinity_form == FORM1_AFFINITY) {
  288. const __be32 *associativity;
  289. associativity = of_get_associativity(node);
  290. if (!associativity)
  291. return;
  292. initialize_form1_numa_distance(associativity);
  293. return;
  294. }
  295. /* FORM2 affinity */
  296. nid = of_node_to_nid_single(node);
  297. if (nid == NUMA_NO_NODE)
  298. return;
  299. /*
  300. * With FORM2 we expect NUMA distance of all possible NUMA
  301. * nodes to be provided during boot.
  302. */
  303. WARN(numa_distance_table[nid][nid] == -1,
  304. "NUMA distance details for node %d not provided\n", nid);
  305. }
  306. EXPORT_SYMBOL_GPL(update_numa_distance);
  307. /*
  308. * ibm,numa-lookup-index-table= {N, domainid1, domainid2, ..... domainidN}
  309. * ibm,numa-distance-table = { N, 1, 2, 4, 5, 1, 6, .... N elements}
  310. */
  311. static void __init initialize_form2_numa_distance_lookup_table(void)
  312. {
  313. int i, j;
  314. struct device_node *root;
  315. const __u8 *form2_distances;
  316. const __be32 *numa_lookup_index;
  317. int form2_distances_length;
  318. int max_numa_index, distance_index;
  319. if (firmware_has_feature(FW_FEATURE_OPAL))
  320. root = of_find_node_by_path("/ibm,opal");
  321. else
  322. root = of_find_node_by_path("/rtas");
  323. if (!root)
  324. root = of_find_node_by_path("/");
  325. numa_lookup_index = of_get_property(root, "ibm,numa-lookup-index-table", NULL);
  326. max_numa_index = of_read_number(&numa_lookup_index[0], 1);
  327. /* first element of the array is the size and is encode-int */
  328. form2_distances = of_get_property(root, "ibm,numa-distance-table", NULL);
  329. form2_distances_length = of_read_number((const __be32 *)&form2_distances[0], 1);
  330. /* Skip the size which is encoded int */
  331. form2_distances += sizeof(__be32);
  332. pr_debug("form2_distances_len = %d, numa_dist_indexes_len = %d\n",
  333. form2_distances_length, max_numa_index);
  334. for (i = 0; i < max_numa_index; i++)
  335. /* +1 skip the max_numa_index in the property */
  336. numa_id_index_table[i] = of_read_number(&numa_lookup_index[i + 1], 1);
  337. if (form2_distances_length != max_numa_index * max_numa_index) {
  338. WARN(1, "Wrong NUMA distance information\n");
  339. form2_distances = NULL; // don't use it
  340. }
  341. distance_index = 0;
  342. for (i = 0; i < max_numa_index; i++) {
  343. for (j = 0; j < max_numa_index; j++) {
  344. int nodeA = numa_id_index_table[i];
  345. int nodeB = numa_id_index_table[j];
  346. int dist;
  347. if (form2_distances)
  348. dist = form2_distances[distance_index++];
  349. else if (nodeA == nodeB)
  350. dist = LOCAL_DISTANCE;
  351. else
  352. dist = REMOTE_DISTANCE;
  353. numa_distance_table[nodeA][nodeB] = dist;
  354. pr_debug("dist[%d][%d]=%d ", nodeA, nodeB, dist);
  355. }
  356. }
  357. of_node_put(root);
  358. }
  359. static int __init find_primary_domain_index(void)
  360. {
  361. int index;
  362. struct device_node *root;
  363. /*
  364. * Check for which form of affinity.
  365. */
  366. if (firmware_has_feature(FW_FEATURE_OPAL)) {
  367. affinity_form = FORM1_AFFINITY;
  368. } else if (firmware_has_feature(FW_FEATURE_FORM2_AFFINITY)) {
  369. pr_debug("Using form 2 affinity\n");
  370. affinity_form = FORM2_AFFINITY;
  371. } else if (firmware_has_feature(FW_FEATURE_FORM1_AFFINITY)) {
  372. pr_debug("Using form 1 affinity\n");
  373. affinity_form = FORM1_AFFINITY;
  374. } else
  375. affinity_form = FORM0_AFFINITY;
  376. if (firmware_has_feature(FW_FEATURE_OPAL))
  377. root = of_find_node_by_path("/ibm,opal");
  378. else
  379. root = of_find_node_by_path("/rtas");
  380. if (!root)
  381. root = of_find_node_by_path("/");
  382. /*
  383. * This property is a set of 32-bit integers, each representing
  384. * an index into the ibm,associativity nodes.
  385. *
  386. * With form 0 affinity the first integer is for an SMP configuration
  387. * (should be all 0's) and the second is for a normal NUMA
  388. * configuration. We have only one level of NUMA.
  389. *
  390. * With form 1 affinity the first integer is the most significant
  391. * NUMA boundary and the following are progressively less significant
  392. * boundaries. There can be more than one level of NUMA.
  393. */
  394. distance_ref_points = of_get_property(root,
  395. "ibm,associativity-reference-points",
  396. &distance_ref_points_depth);
  397. if (!distance_ref_points) {
  398. pr_debug("ibm,associativity-reference-points not found.\n");
  399. goto err;
  400. }
  401. distance_ref_points_depth /= sizeof(int);
  402. if (affinity_form == FORM0_AFFINITY) {
  403. if (distance_ref_points_depth < 2) {
  404. pr_warn("short ibm,associativity-reference-points\n");
  405. goto err;
  406. }
  407. index = of_read_number(&distance_ref_points[1], 1);
  408. } else {
  409. /*
  410. * Both FORM1 and FORM2 affinity find the primary domain details
  411. * at the same offset.
  412. */
  413. index = of_read_number(distance_ref_points, 1);
  414. }
  415. /*
  416. * Warn and cap if the hardware supports more than
  417. * MAX_DISTANCE_REF_POINTS domains.
  418. */
  419. if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
  420. pr_warn("distance array capped at %d entries\n",
  421. MAX_DISTANCE_REF_POINTS);
  422. distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
  423. }
  424. of_node_put(root);
  425. return index;
  426. err:
  427. of_node_put(root);
  428. return -1;
  429. }
  430. static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
  431. {
  432. struct device_node *memory = NULL;
  433. memory = of_find_node_by_type(memory, "memory");
  434. if (!memory)
  435. panic("numa.c: No memory nodes found!");
  436. *n_addr_cells = of_n_addr_cells(memory);
  437. *n_size_cells = of_n_size_cells(memory);
  438. of_node_put(memory);
  439. }
  440. static unsigned long read_n_cells(int n, const __be32 **buf)
  441. {
  442. unsigned long result = 0;
  443. while (n--) {
  444. result = (result << 32) | of_read_number(*buf, 1);
  445. (*buf)++;
  446. }
  447. return result;
  448. }
  449. struct assoc_arrays {
  450. u32 n_arrays;
  451. u32 array_sz;
  452. const __be32 *arrays;
  453. };
  454. /*
  455. * Retrieve and validate the list of associativity arrays for drconf
  456. * memory from the ibm,associativity-lookup-arrays property of the
  457. * device tree..
  458. *
  459. * The layout of the ibm,associativity-lookup-arrays property is a number N
  460. * indicating the number of associativity arrays, followed by a number M
  461. * indicating the size of each associativity array, followed by a list
  462. * of N associativity arrays.
  463. */
  464. static int of_get_assoc_arrays(struct assoc_arrays *aa)
  465. {
  466. struct device_node *memory;
  467. const __be32 *prop;
  468. u32 len;
  469. memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
  470. if (!memory)
  471. return -1;
  472. prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
  473. if (!prop || len < 2 * sizeof(unsigned int)) {
  474. of_node_put(memory);
  475. return -1;
  476. }
  477. aa->n_arrays = of_read_number(prop++, 1);
  478. aa->array_sz = of_read_number(prop++, 1);
  479. of_node_put(memory);
  480. /* Now that we know the number of arrays and size of each array,
  481. * revalidate the size of the property read in.
  482. */
  483. if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
  484. return -1;
  485. aa->arrays = prop;
  486. return 0;
  487. }
  488. static int __init get_nid_and_numa_distance(struct drmem_lmb *lmb)
  489. {
  490. struct assoc_arrays aa = { .arrays = NULL };
  491. int default_nid = NUMA_NO_NODE;
  492. int nid = default_nid;
  493. int rc, index;
  494. if ((primary_domain_index < 0) || !numa_enabled)
  495. return default_nid;
  496. rc = of_get_assoc_arrays(&aa);
  497. if (rc)
  498. return default_nid;
  499. if (primary_domain_index <= aa.array_sz &&
  500. !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
  501. const __be32 *associativity;
  502. index = lmb->aa_index * aa.array_sz;
  503. associativity = &aa.arrays[index];
  504. nid = __associativity_to_nid(associativity, aa.array_sz);
  505. if (nid > 0 && affinity_form == FORM1_AFFINITY) {
  506. /*
  507. * lookup array associativity entries have
  508. * no length of the array as the first element.
  509. */
  510. __initialize_form1_numa_distance(associativity, aa.array_sz);
  511. }
  512. }
  513. return nid;
  514. }
  515. /*
  516. * This is like of_node_to_nid_single() for memory represented in the
  517. * ibm,dynamic-reconfiguration-memory node.
  518. */
  519. int of_drconf_to_nid_single(struct drmem_lmb *lmb)
  520. {
  521. struct assoc_arrays aa = { .arrays = NULL };
  522. int default_nid = NUMA_NO_NODE;
  523. int nid = default_nid;
  524. int rc, index;
  525. if ((primary_domain_index < 0) || !numa_enabled)
  526. return default_nid;
  527. rc = of_get_assoc_arrays(&aa);
  528. if (rc)
  529. return default_nid;
  530. if (primary_domain_index <= aa.array_sz &&
  531. !(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
  532. const __be32 *associativity;
  533. index = lmb->aa_index * aa.array_sz;
  534. associativity = &aa.arrays[index];
  535. nid = __associativity_to_nid(associativity, aa.array_sz);
  536. }
  537. return nid;
  538. }
  539. #ifdef CONFIG_PPC_SPLPAR
  540. static int __vphn_get_associativity(long lcpu, __be32 *associativity)
  541. {
  542. long rc, hwid;
  543. /*
  544. * On a shared lpar, device tree will not have node associativity.
  545. * At this time lppaca, or its __old_status field may not be
  546. * updated. Hence kernel cannot detect if its on a shared lpar. So
  547. * request an explicit associativity irrespective of whether the
  548. * lpar is shared or dedicated. Use the device tree property as a
  549. * fallback. cpu_to_phys_id is only valid between
  550. * smp_setup_cpu_maps() and smp_setup_pacas().
  551. */
  552. if (firmware_has_feature(FW_FEATURE_VPHN)) {
  553. if (cpu_to_phys_id)
  554. hwid = cpu_to_phys_id[lcpu];
  555. else
  556. hwid = get_hard_smp_processor_id(lcpu);
  557. rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
  558. if (rc == H_SUCCESS)
  559. return 0;
  560. }
  561. return -1;
  562. }
  563. static int vphn_get_nid(long lcpu)
  564. {
  565. __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
  566. if (!__vphn_get_associativity(lcpu, associativity))
  567. return associativity_to_nid(associativity);
  568. return NUMA_NO_NODE;
  569. }
  570. #else
  571. static int __vphn_get_associativity(long lcpu, __be32 *associativity)
  572. {
  573. return -1;
  574. }
  575. static int vphn_get_nid(long unused)
  576. {
  577. return NUMA_NO_NODE;
  578. }
  579. #endif /* CONFIG_PPC_SPLPAR */
  580. /*
  581. * Figure out to which domain a cpu belongs and stick it there.
  582. * Return the id of the domain used.
  583. */
  584. static int numa_setup_cpu(unsigned long lcpu)
  585. {
  586. struct device_node *cpu;
  587. int fcpu = cpu_first_thread_sibling(lcpu);
  588. int nid = NUMA_NO_NODE;
  589. if (!cpu_present(lcpu)) {
  590. set_cpu_numa_node(lcpu, first_online_node);
  591. return first_online_node;
  592. }
  593. /*
  594. * If a valid cpu-to-node mapping is already available, use it
  595. * directly instead of querying the firmware, since it represents
  596. * the most recent mapping notified to us by the platform (eg: VPHN).
  597. * Since cpu_to_node binding remains the same for all threads in the
  598. * core. If a valid cpu-to-node mapping is already available, for
  599. * the first thread in the core, use it.
  600. */
  601. nid = numa_cpu_lookup_table[fcpu];
  602. if (nid >= 0) {
  603. map_cpu_to_node(lcpu, nid);
  604. return nid;
  605. }
  606. nid = vphn_get_nid(lcpu);
  607. if (nid != NUMA_NO_NODE)
  608. goto out_present;
  609. cpu = of_get_cpu_node(lcpu, NULL);
  610. if (!cpu) {
  611. WARN_ON(1);
  612. if (cpu_present(lcpu))
  613. goto out_present;
  614. else
  615. goto out;
  616. }
  617. nid = of_node_to_nid_single(cpu);
  618. of_node_put(cpu);
  619. out_present:
  620. if (nid < 0 || !node_possible(nid))
  621. nid = first_online_node;
  622. /*
  623. * Update for the first thread of the core. All threads of a core
  624. * have to be part of the same node. This not only avoids querying
  625. * for every other thread in the core, but always avoids a case
  626. * where virtual node associativity change causes subsequent threads
  627. * of a core to be associated with different nid. However if first
  628. * thread is already online, expect it to have a valid mapping.
  629. */
  630. if (fcpu != lcpu) {
  631. WARN_ON(cpu_online(fcpu));
  632. map_cpu_to_node(fcpu, nid);
  633. }
  634. map_cpu_to_node(lcpu, nid);
  635. out:
  636. return nid;
  637. }
  638. static void verify_cpu_node_mapping(int cpu, int node)
  639. {
  640. int base, sibling, i;
  641. /* Verify that all the threads in the core belong to the same node */
  642. base = cpu_first_thread_sibling(cpu);
  643. for (i = 0; i < threads_per_core; i++) {
  644. sibling = base + i;
  645. if (sibling == cpu || cpu_is_offline(sibling))
  646. continue;
  647. if (cpu_to_node(sibling) != node) {
  648. WARN(1, "CPU thread siblings %d and %d don't belong"
  649. " to the same node!\n", cpu, sibling);
  650. break;
  651. }
  652. }
  653. }
  654. /* Must run before sched domains notifier. */
  655. static int ppc_numa_cpu_prepare(unsigned int cpu)
  656. {
  657. int nid;
  658. nid = numa_setup_cpu(cpu);
  659. verify_cpu_node_mapping(cpu, nid);
  660. return 0;
  661. }
  662. static int ppc_numa_cpu_dead(unsigned int cpu)
  663. {
  664. return 0;
  665. }
  666. /*
  667. * Check and possibly modify a memory region to enforce the memory limit.
  668. *
  669. * Returns the size the region should have to enforce the memory limit.
  670. * This will either be the original value of size, a truncated value,
  671. * or zero. If the returned value of size is 0 the region should be
  672. * discarded as it lies wholly above the memory limit.
  673. */
  674. static unsigned long __init numa_enforce_memory_limit(unsigned long start,
  675. unsigned long size)
  676. {
  677. /*
  678. * We use memblock_end_of_DRAM() in here instead of memory_limit because
  679. * we've already adjusted it for the limit and it takes care of
  680. * having memory holes below the limit. Also, in the case of
  681. * iommu_is_off, memory_limit is not set but is implicitly enforced.
  682. */
  683. if (start + size <= memblock_end_of_DRAM())
  684. return size;
  685. if (start >= memblock_end_of_DRAM())
  686. return 0;
  687. return memblock_end_of_DRAM() - start;
  688. }
  689. /*
  690. * Reads the counter for a given entry in
  691. * linux,drconf-usable-memory property
  692. */
  693. static inline int __init read_usm_ranges(const __be32 **usm)
  694. {
  695. /*
  696. * For each lmb in ibm,dynamic-memory a corresponding
  697. * entry in linux,drconf-usable-memory property contains
  698. * a counter followed by that many (base, size) duple.
  699. * read the counter from linux,drconf-usable-memory
  700. */
  701. return read_n_cells(n_mem_size_cells, usm);
  702. }
  703. /*
  704. * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
  705. * node. This assumes n_mem_{addr,size}_cells have been set.
  706. */
  707. static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
  708. const __be32 **usm,
  709. void *data)
  710. {
  711. unsigned int ranges, is_kexec_kdump = 0;
  712. unsigned long base, size, sz;
  713. int nid;
  714. /*
  715. * Skip this block if the reserved bit is set in flags (0x80)
  716. * or if the block is not assigned to this partition (0x8)
  717. */
  718. if ((lmb->flags & DRCONF_MEM_RESERVED)
  719. || !(lmb->flags & DRCONF_MEM_ASSIGNED))
  720. return 0;
  721. if (*usm)
  722. is_kexec_kdump = 1;
  723. base = lmb->base_addr;
  724. size = drmem_lmb_size();
  725. ranges = 1;
  726. if (is_kexec_kdump) {
  727. ranges = read_usm_ranges(usm);
  728. if (!ranges) /* there are no (base, size) duple */
  729. return 0;
  730. }
  731. do {
  732. if (is_kexec_kdump) {
  733. base = read_n_cells(n_mem_addr_cells, usm);
  734. size = read_n_cells(n_mem_size_cells, usm);
  735. }
  736. nid = get_nid_and_numa_distance(lmb);
  737. fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
  738. &nid);
  739. node_set_online(nid);
  740. sz = numa_enforce_memory_limit(base, size);
  741. if (sz)
  742. memblock_set_node(base, sz, &memblock.memory, nid);
  743. } while (--ranges);
  744. return 0;
  745. }
  746. static int __init parse_numa_properties(void)
  747. {
  748. struct device_node *memory, *pci;
  749. int default_nid = 0;
  750. unsigned long i;
  751. const __be32 *associativity;
  752. if (numa_enabled == 0) {
  753. pr_warn("disabled by user\n");
  754. return -1;
  755. }
  756. primary_domain_index = find_primary_domain_index();
  757. if (primary_domain_index < 0) {
  758. /*
  759. * if we fail to parse primary_domain_index from device tree
  760. * mark the numa disabled, boot with numa disabled.
  761. */
  762. numa_enabled = false;
  763. return primary_domain_index;
  764. }
  765. pr_debug("associativity depth for CPU/Memory: %d\n", primary_domain_index);
  766. /*
  767. * If it is FORM2 initialize the distance table here.
  768. */
  769. if (affinity_form == FORM2_AFFINITY)
  770. initialize_form2_numa_distance_lookup_table();
  771. /*
  772. * Even though we connect cpus to numa domains later in SMP
  773. * init, we need to know the node ids now. This is because
  774. * each node to be onlined must have NODE_DATA etc backing it.
  775. */
  776. for_each_present_cpu(i) {
  777. __be32 vphn_assoc[VPHN_ASSOC_BUFSIZE];
  778. struct device_node *cpu;
  779. int nid = NUMA_NO_NODE;
  780. memset(vphn_assoc, 0, VPHN_ASSOC_BUFSIZE * sizeof(__be32));
  781. if (__vphn_get_associativity(i, vphn_assoc) == 0) {
  782. nid = associativity_to_nid(vphn_assoc);
  783. initialize_form1_numa_distance(vphn_assoc);
  784. } else {
  785. /*
  786. * Don't fall back to default_nid yet -- we will plug
  787. * cpus into nodes once the memory scan has discovered
  788. * the topology.
  789. */
  790. cpu = of_get_cpu_node(i, NULL);
  791. BUG_ON(!cpu);
  792. associativity = of_get_associativity(cpu);
  793. if (associativity) {
  794. nid = associativity_to_nid(associativity);
  795. initialize_form1_numa_distance(associativity);
  796. }
  797. of_node_put(cpu);
  798. }
  799. /* node_set_online() is an UB if 'nid' is negative */
  800. if (likely(nid >= 0))
  801. node_set_online(nid);
  802. }
  803. get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
  804. for_each_node_by_type(memory, "memory") {
  805. unsigned long start;
  806. unsigned long size;
  807. int nid;
  808. int ranges;
  809. const __be32 *memcell_buf;
  810. unsigned int len;
  811. memcell_buf = of_get_property(memory,
  812. "linux,usable-memory", &len);
  813. if (!memcell_buf || len <= 0)
  814. memcell_buf = of_get_property(memory, "reg", &len);
  815. if (!memcell_buf || len <= 0)
  816. continue;
  817. /* ranges in cell */
  818. ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
  819. new_range:
  820. /* these are order-sensitive, and modify the buffer pointer */
  821. start = read_n_cells(n_mem_addr_cells, &memcell_buf);
  822. size = read_n_cells(n_mem_size_cells, &memcell_buf);
  823. /*
  824. * Assumption: either all memory nodes or none will
  825. * have associativity properties. If none, then
  826. * everything goes to default_nid.
  827. */
  828. associativity = of_get_associativity(memory);
  829. if (associativity) {
  830. nid = associativity_to_nid(associativity);
  831. initialize_form1_numa_distance(associativity);
  832. } else
  833. nid = default_nid;
  834. fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
  835. node_set_online(nid);
  836. size = numa_enforce_memory_limit(start, size);
  837. if (size)
  838. memblock_set_node(start, size, &memblock.memory, nid);
  839. if (--ranges)
  840. goto new_range;
  841. }
  842. for_each_node_by_name(pci, "pci") {
  843. int nid = NUMA_NO_NODE;
  844. associativity = of_get_associativity(pci);
  845. if (associativity) {
  846. nid = associativity_to_nid(associativity);
  847. initialize_form1_numa_distance(associativity);
  848. }
  849. if (likely(nid >= 0) && !node_online(nid))
  850. node_set_online(nid);
  851. }
  852. /*
  853. * Now do the same thing for each MEMBLOCK listed in the
  854. * ibm,dynamic-memory property in the
  855. * ibm,dynamic-reconfiguration-memory node.
  856. */
  857. memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
  858. if (memory) {
  859. walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
  860. of_node_put(memory);
  861. }
  862. return 0;
  863. }
  864. static void __init setup_nonnuma(void)
  865. {
  866. unsigned long top_of_ram = memblock_end_of_DRAM();
  867. unsigned long total_ram = memblock_phys_mem_size();
  868. unsigned long start_pfn, end_pfn;
  869. unsigned int nid = 0;
  870. int i;
  871. pr_debug("Top of RAM: 0x%lx, Total RAM: 0x%lx\n", top_of_ram, total_ram);
  872. pr_debug("Memory hole size: %ldMB\n", (top_of_ram - total_ram) >> 20);
  873. for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
  874. fake_numa_create_new_node(end_pfn, &nid);
  875. memblock_set_node(PFN_PHYS(start_pfn),
  876. PFN_PHYS(end_pfn - start_pfn),
  877. &memblock.memory, nid);
  878. node_set_online(nid);
  879. }
  880. }
  881. void __init dump_numa_cpu_topology(void)
  882. {
  883. unsigned int node;
  884. unsigned int cpu, count;
  885. if (!numa_enabled)
  886. return;
  887. for_each_online_node(node) {
  888. pr_info("Node %d CPUs:", node);
  889. count = 0;
  890. /*
  891. * If we used a CPU iterator here we would miss printing
  892. * the holes in the cpumap.
  893. */
  894. for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
  895. if (cpumask_test_cpu(cpu,
  896. node_to_cpumask_map[node])) {
  897. if (count == 0)
  898. pr_cont(" %u", cpu);
  899. ++count;
  900. } else {
  901. if (count > 1)
  902. pr_cont("-%u", cpu - 1);
  903. count = 0;
  904. }
  905. }
  906. if (count > 1)
  907. pr_cont("-%u", nr_cpu_ids - 1);
  908. pr_cont("\n");
  909. }
  910. }
  911. /* Initialize NODE_DATA for a node on the local memory */
  912. static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
  913. {
  914. u64 spanned_pages = end_pfn - start_pfn;
  915. alloc_node_data(nid);
  916. NODE_DATA(nid)->node_id = nid;
  917. NODE_DATA(nid)->node_start_pfn = start_pfn;
  918. NODE_DATA(nid)->node_spanned_pages = spanned_pages;
  919. }
  920. static void __init find_possible_nodes(void)
  921. {
  922. struct device_node *rtas, *root;
  923. const __be32 *domains = NULL;
  924. int prop_length, max_nodes;
  925. u32 i;
  926. if (!numa_enabled)
  927. return;
  928. rtas = of_find_node_by_path("/rtas");
  929. if (!rtas)
  930. return;
  931. /*
  932. * ibm,current-associativity-domains is a fairly recent property. If
  933. * it doesn't exist, then fallback on ibm,max-associativity-domains.
  934. * Current denotes what the platform can support compared to max
  935. * which denotes what the Hypervisor can support.
  936. *
  937. * If the LPAR is migratable, new nodes might be activated after a LPM,
  938. * so we should consider the max number in that case.
  939. */
  940. root = of_find_node_by_path("/");
  941. if (!of_get_property(root, "ibm,migratable-partition", NULL))
  942. domains = of_get_property(rtas,
  943. "ibm,current-associativity-domains",
  944. &prop_length);
  945. of_node_put(root);
  946. if (!domains) {
  947. domains = of_get_property(rtas, "ibm,max-associativity-domains",
  948. &prop_length);
  949. if (!domains)
  950. goto out;
  951. }
  952. max_nodes = of_read_number(&domains[primary_domain_index], 1);
  953. pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
  954. for (i = 0; i < max_nodes; i++) {
  955. if (!node_possible(i))
  956. node_set(i, node_possible_map);
  957. }
  958. prop_length /= sizeof(int);
  959. if (prop_length > primary_domain_index + 2)
  960. coregroup_enabled = 1;
  961. out:
  962. of_node_put(rtas);
  963. }
  964. void __init mem_topology_setup(void)
  965. {
  966. int cpu;
  967. max_low_pfn = max_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
  968. min_low_pfn = MEMORY_START >> PAGE_SHIFT;
  969. /*
  970. * Linux/mm assumes node 0 to be online at boot. However this is not
  971. * true on PowerPC, where node 0 is similar to any other node, it
  972. * could be cpuless, memoryless node. So force node 0 to be offline
  973. * for now. This will prevent cpuless, memoryless node 0 showing up
  974. * unnecessarily as online. If a node has cpus or memory that need
  975. * to be online, then node will anyway be marked online.
  976. */
  977. node_set_offline(0);
  978. if (parse_numa_properties())
  979. setup_nonnuma();
  980. /*
  981. * Modify the set of possible NUMA nodes to reflect information
  982. * available about the set of online nodes, and the set of nodes
  983. * that we expect to make use of for this platform's affinity
  984. * calculations.
  985. */
  986. nodes_and(node_possible_map, node_possible_map, node_online_map);
  987. find_possible_nodes();
  988. setup_node_to_cpumask_map();
  989. reset_numa_cpu_lookup_table();
  990. for_each_possible_cpu(cpu) {
  991. /*
  992. * Powerpc with CONFIG_NUMA always used to have a node 0,
  993. * even if it was memoryless or cpuless. For all cpus that
  994. * are possible but not present, cpu_to_node() would point
  995. * to node 0. To remove a cpuless, memoryless dummy node,
  996. * powerpc need to make sure all possible but not present
  997. * cpu_to_node are set to a proper node.
  998. */
  999. numa_setup_cpu(cpu);
  1000. }
  1001. }
  1002. void __init initmem_init(void)
  1003. {
  1004. int nid;
  1005. memblock_dump_all();
  1006. for_each_online_node(nid) {
  1007. unsigned long start_pfn, end_pfn;
  1008. get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
  1009. setup_node_data(nid, start_pfn, end_pfn);
  1010. }
  1011. /*
  1012. * We need the numa_cpu_lookup_table to be accurate for all CPUs,
  1013. * even before we online them, so that we can use cpu_to_{node,mem}
  1014. * early in boot, cf. smp_prepare_cpus().
  1015. * _nocalls() + manual invocation is used because cpuhp is not yet
  1016. * initialized for the boot CPU.
  1017. */
  1018. cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
  1019. ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
  1020. }
  1021. static int __init early_numa(char *p)
  1022. {
  1023. if (!p)
  1024. return 0;
  1025. if (strstr(p, "off"))
  1026. numa_enabled = 0;
  1027. p = strstr(p, "fake=");
  1028. if (p)
  1029. cmdline = p + strlen("fake=");
  1030. return 0;
  1031. }
  1032. early_param("numa", early_numa);
  1033. #ifdef CONFIG_MEMORY_HOTPLUG
  1034. /*
  1035. * Find the node associated with a hot added memory section for
  1036. * memory represented in the device tree by the property
  1037. * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
  1038. */
  1039. static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
  1040. {
  1041. struct drmem_lmb *lmb;
  1042. unsigned long lmb_size;
  1043. int nid = NUMA_NO_NODE;
  1044. lmb_size = drmem_lmb_size();
  1045. for_each_drmem_lmb(lmb) {
  1046. /* skip this block if it is reserved or not assigned to
  1047. * this partition */
  1048. if ((lmb->flags & DRCONF_MEM_RESERVED)
  1049. || !(lmb->flags & DRCONF_MEM_ASSIGNED))
  1050. continue;
  1051. if ((scn_addr < lmb->base_addr)
  1052. || (scn_addr >= (lmb->base_addr + lmb_size)))
  1053. continue;
  1054. nid = of_drconf_to_nid_single(lmb);
  1055. break;
  1056. }
  1057. return nid;
  1058. }
  1059. /*
  1060. * Find the node associated with a hot added memory section for memory
  1061. * represented in the device tree as a node (i.e. memory@XXXX) for
  1062. * each memblock.
  1063. */
  1064. static int hot_add_node_scn_to_nid(unsigned long scn_addr)
  1065. {
  1066. struct device_node *memory;
  1067. int nid = NUMA_NO_NODE;
  1068. for_each_node_by_type(memory, "memory") {
  1069. int i = 0;
  1070. while (1) {
  1071. struct resource res;
  1072. if (of_address_to_resource(memory, i++, &res))
  1073. break;
  1074. if ((scn_addr < res.start) || (scn_addr > res.end))
  1075. continue;
  1076. nid = of_node_to_nid_single(memory);
  1077. break;
  1078. }
  1079. if (nid >= 0)
  1080. break;
  1081. }
  1082. of_node_put(memory);
  1083. return nid;
  1084. }
  1085. /*
  1086. * Find the node associated with a hot added memory section. Section
  1087. * corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that
  1088. * sections are fully contained within a single MEMBLOCK.
  1089. */
  1090. int hot_add_scn_to_nid(unsigned long scn_addr)
  1091. {
  1092. struct device_node *memory = NULL;
  1093. int nid;
  1094. if (!numa_enabled)
  1095. return first_online_node;
  1096. memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
  1097. if (memory) {
  1098. nid = hot_add_drconf_scn_to_nid(scn_addr);
  1099. of_node_put(memory);
  1100. } else {
  1101. nid = hot_add_node_scn_to_nid(scn_addr);
  1102. }
  1103. if (nid < 0 || !node_possible(nid))
  1104. nid = first_online_node;
  1105. return nid;
  1106. }
  1107. u64 hot_add_drconf_memory_max(void)
  1108. {
  1109. struct device_node *memory = NULL;
  1110. struct device_node *dn = NULL;
  1111. const __be64 *lrdr = NULL;
  1112. dn = of_find_node_by_path("/rtas");
  1113. if (dn) {
  1114. lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
  1115. of_node_put(dn);
  1116. if (lrdr)
  1117. return be64_to_cpup(lrdr);
  1118. }
  1119. memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
  1120. if (memory) {
  1121. of_node_put(memory);
  1122. return drmem_lmb_memory_max();
  1123. }
  1124. return 0;
  1125. }
  1126. /*
  1127. * memory_hotplug_max - return max address of memory that may be added
  1128. *
  1129. * This is currently only used on systems that support drconfig memory
  1130. * hotplug.
  1131. */
  1132. u64 memory_hotplug_max(void)
  1133. {
  1134. return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
  1135. }
  1136. #endif /* CONFIG_MEMORY_HOTPLUG */
  1137. /* Virtual Processor Home Node (VPHN) support */
  1138. #ifdef CONFIG_PPC_SPLPAR
  1139. static int topology_inited;
  1140. /*
  1141. * Retrieve the new associativity information for a virtual processor's
  1142. * home node.
  1143. */
  1144. static long vphn_get_associativity(unsigned long cpu,
  1145. __be32 *associativity)
  1146. {
  1147. long rc;
  1148. rc = hcall_vphn(get_hard_smp_processor_id(cpu),
  1149. VPHN_FLAG_VCPU, associativity);
  1150. switch (rc) {
  1151. case H_SUCCESS:
  1152. pr_debug("VPHN hcall succeeded. Reset polling...\n");
  1153. goto out;
  1154. case H_FUNCTION:
  1155. pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
  1156. break;
  1157. case H_HARDWARE:
  1158. pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
  1159. "preventing VPHN. Disabling polling...\n");
  1160. break;
  1161. case H_PARAMETER:
  1162. pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
  1163. "Disabling polling...\n");
  1164. break;
  1165. default:
  1166. pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n"
  1167. , rc);
  1168. break;
  1169. }
  1170. out:
  1171. return rc;
  1172. }
  1173. void find_and_update_cpu_nid(int cpu)
  1174. {
  1175. __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
  1176. int new_nid;
  1177. /* Use associativity from first thread for all siblings */
  1178. if (vphn_get_associativity(cpu, associativity))
  1179. return;
  1180. /* Do not have previous associativity, so find it now. */
  1181. new_nid = associativity_to_nid(associativity);
  1182. if (new_nid < 0 || !node_possible(new_nid))
  1183. new_nid = first_online_node;
  1184. else
  1185. // Associate node <-> cpu, so cpu_up() calls
  1186. // try_online_node() on the right node.
  1187. set_cpu_numa_node(cpu, new_nid);
  1188. pr_debug("%s:%d cpu %d nid %d\n", __func__, __LINE__, cpu, new_nid);
  1189. }
  1190. int cpu_to_coregroup_id(int cpu)
  1191. {
  1192. __be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
  1193. int index;
  1194. if (cpu < 0 || cpu > nr_cpu_ids)
  1195. return -1;
  1196. if (!coregroup_enabled)
  1197. goto out;
  1198. if (!firmware_has_feature(FW_FEATURE_VPHN))
  1199. goto out;
  1200. if (vphn_get_associativity(cpu, associativity))
  1201. goto out;
  1202. index = of_read_number(associativity, 1);
  1203. if (index > primary_domain_index + 1)
  1204. return of_read_number(&associativity[index - 1], 1);
  1205. out:
  1206. return cpu_to_core_id(cpu);
  1207. }
  1208. static int topology_update_init(void)
  1209. {
  1210. topology_inited = 1;
  1211. return 0;
  1212. }
  1213. device_initcall(topology_update_init);
  1214. #endif /* CONFIG_PPC_SPLPAR */