group_cpus.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (C) 2016 Thomas Gleixner.
  4. * Copyright (C) 2016-2017 Christoph Hellwig.
  5. */
  6. #include <linux/kernel.h>
  7. #include <linux/slab.h>
  8. #include <linux/cpu.h>
  9. #include <linux/sort.h>
  10. #include <linux/group_cpus.h>
  11. #ifdef CONFIG_SMP
  12. static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
  13. unsigned int cpus_per_grp)
  14. {
  15. const struct cpumask *siblmsk;
  16. int cpu, sibl;
  17. for ( ; cpus_per_grp > 0; ) {
  18. cpu = cpumask_first(nmsk);
  19. /* Should not happen, but I'm too lazy to think about it */
  20. if (cpu >= nr_cpu_ids)
  21. return;
  22. cpumask_clear_cpu(cpu, nmsk);
  23. cpumask_set_cpu(cpu, irqmsk);
  24. cpus_per_grp--;
  25. /* If the cpu has siblings, use them first */
  26. siblmsk = topology_sibling_cpumask(cpu);
  27. for (sibl = -1; cpus_per_grp > 0; ) {
  28. sibl = cpumask_next(sibl, siblmsk);
  29. if (sibl >= nr_cpu_ids)
  30. break;
  31. if (!cpumask_test_and_clear_cpu(sibl, nmsk))
  32. continue;
  33. cpumask_set_cpu(sibl, irqmsk);
  34. cpus_per_grp--;
  35. }
  36. }
  37. }
  38. static cpumask_var_t *alloc_node_to_cpumask(void)
  39. {
  40. cpumask_var_t *masks;
  41. int node;
  42. masks = kzalloc_objs(cpumask_var_t, nr_node_ids);
  43. if (!masks)
  44. return NULL;
  45. for (node = 0; node < nr_node_ids; node++) {
  46. if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
  47. goto out_unwind;
  48. }
  49. return masks;
  50. out_unwind:
  51. while (--node >= 0)
  52. free_cpumask_var(masks[node]);
  53. kfree(masks);
  54. return NULL;
  55. }
  56. static void free_node_to_cpumask(cpumask_var_t *masks)
  57. {
  58. int node;
  59. for (node = 0; node < nr_node_ids; node++)
  60. free_cpumask_var(masks[node]);
  61. kfree(masks);
  62. }
  63. static void build_node_to_cpumask(cpumask_var_t *masks)
  64. {
  65. int cpu;
  66. for_each_possible_cpu(cpu)
  67. cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
  68. }
  69. static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
  70. const struct cpumask *mask, nodemask_t *nodemsk)
  71. {
  72. int n, nodes = 0;
  73. /* Calculate the number of nodes in the supplied affinity mask */
  74. for_each_node(n) {
  75. if (cpumask_intersects(mask, node_to_cpumask[n])) {
  76. node_set(n, *nodemsk);
  77. nodes++;
  78. }
  79. }
  80. return nodes;
  81. }
  82. struct node_groups {
  83. unsigned id;
  84. union {
  85. unsigned ngroups;
  86. unsigned ncpus;
  87. };
  88. };
  89. static int ncpus_cmp_func(const void *l, const void *r)
  90. {
  91. const struct node_groups *ln = l;
  92. const struct node_groups *rn = r;
  93. return ln->ncpus - rn->ncpus;
  94. }
  95. static void alloc_groups_to_nodes(unsigned int numgrps,
  96. unsigned int numcpus,
  97. struct node_groups *node_groups,
  98. unsigned int num_nodes)
  99. {
  100. unsigned int n, remaining_ncpus = numcpus;
  101. unsigned int ngroups, ncpus;
  102. sort(node_groups, num_nodes, sizeof(node_groups[0]),
  103. ncpus_cmp_func, NULL);
  104. /*
  105. * Allocate groups for each node according to the ratio of this
  106. * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is
  107. * bigger than number of active numa nodes. Always start the
  108. * allocation from the node with minimized nr_cpus.
  109. *
  110. * This way guarantees that each active node gets allocated at
  111. * least one group, and the theory is simple: over-allocation
  112. * is only done when this node is assigned by one group, so
  113. * other nodes will be allocated >= 1 groups, since 'numgrps' is
  114. * bigger than number of numa nodes.
  115. *
  116. * One perfect invariant is that number of allocated groups for
  117. * each node is <= CPU count of this node:
  118. *
  119. * 1) suppose there are two nodes: A and B
  120. * ncpu(X) is CPU count of node X
  121. * grps(X) is the group count allocated to node X via this
  122. * algorithm
  123. *
  124. * ncpu(A) <= ncpu(B)
  125. * ncpu(A) + ncpu(B) = N
  126. * grps(A) + grps(B) = G
  127. *
  128. * grps(A) = max(1, round_down(G * ncpu(A) / N))
  129. * grps(B) = G - grps(A)
  130. *
  131. * both N and G are integer, and 2 <= G <= N, suppose
  132. * G = N - delta, and 0 <= delta <= N - 2
  133. *
  134. * 2) obviously grps(A) <= ncpu(A) because:
  135. *
  136. * if grps(A) is 1, then grps(A) <= ncpu(A) given
  137. * ncpu(A) >= 1
  138. *
  139. * otherwise,
  140. * grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
  141. *
  142. * 3) prove how grps(B) <= ncpu(B):
  143. *
  144. * if round_down(G * ncpu(A) / N) == 0, vecs(B) won't be
  145. * over-allocated, so grps(B) <= ncpu(B),
  146. *
  147. * otherwise:
  148. *
  149. * grps(A) =
  150. * round_down(G * ncpu(A) / N) =
  151. * round_down((N - delta) * ncpu(A) / N) =
  152. * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
  153. * round_down((N * ncpu(A) - delta * N) / N) =
  154. * cpu(A) - delta
  155. *
  156. * then:
  157. *
  158. * grps(A) - G >= ncpu(A) - delta - G
  159. * =>
  160. * G - grps(A) <= G + delta - ncpu(A)
  161. * =>
  162. * grps(B) <= N - ncpu(A)
  163. * =>
  164. * grps(B) <= cpu(B)
  165. *
  166. * For nodes >= 3, it can be thought as one node and another big
  167. * node given that is exactly what this algorithm is implemented,
  168. * and we always re-calculate 'remaining_ncpus' & 'numgrps', and
  169. * finally for each node X: grps(X) <= ncpu(X).
  170. *
  171. */
  172. for (n = 0; n < num_nodes; n++) {
  173. if (node_groups[n].ncpus == UINT_MAX)
  174. continue;
  175. WARN_ON_ONCE(numgrps == 0);
  176. ncpus = node_groups[n].ncpus;
  177. ngroups = max_t(unsigned, 1,
  178. numgrps * ncpus / remaining_ncpus);
  179. WARN_ON_ONCE(ngroups > ncpus);
  180. node_groups[n].ngroups = ngroups;
  181. remaining_ncpus -= ncpus;
  182. numgrps -= ngroups;
  183. }
  184. }
  185. /*
  186. * Allocate group number for each node, so that for each node:
  187. *
  188. * 1) the allocated number is >= 1
  189. *
  190. * 2) the allocated number is <= active CPU number of this node
  191. *
  192. * The actual allocated total groups may be less than @numgrps when
  193. * active total CPU number is less than @numgrps.
  194. *
  195. * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
  196. * for each node.
  197. */
  198. static void alloc_nodes_groups(unsigned int numgrps,
  199. cpumask_var_t *node_to_cpumask,
  200. const struct cpumask *cpu_mask,
  201. const nodemask_t nodemsk,
  202. struct cpumask *nmsk,
  203. struct node_groups *node_groups)
  204. {
  205. unsigned int n, numcpus = 0;
  206. for (n = 0; n < nr_node_ids; n++) {
  207. node_groups[n].id = n;
  208. node_groups[n].ncpus = UINT_MAX;
  209. }
  210. for_each_node_mask(n, nodemsk) {
  211. unsigned int ncpus;
  212. cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
  213. ncpus = cpumask_weight(nmsk);
  214. if (!ncpus)
  215. continue;
  216. numcpus += ncpus;
  217. node_groups[n].ncpus = ncpus;
  218. }
  219. numgrps = min_t(unsigned int, numcpus, numgrps);
  220. alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids);
  221. }
  222. static void assign_cpus_to_groups(unsigned int ncpus,
  223. struct cpumask *nmsk,
  224. struct node_groups *nv,
  225. struct cpumask *masks,
  226. unsigned int *curgrp,
  227. unsigned int last_grp)
  228. {
  229. unsigned int v, cpus_per_grp, extra_grps;
  230. /* Account for rounding errors */
  231. extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
  232. /* Spread allocated groups on CPUs of the current node */
  233. for (v = 0; v < nv->ngroups; v++, *curgrp += 1) {
  234. cpus_per_grp = ncpus / nv->ngroups;
  235. /* Account for extra groups to compensate rounding errors */
  236. if (extra_grps) {
  237. cpus_per_grp++;
  238. --extra_grps;
  239. }
  240. /*
  241. * wrapping has to be considered given 'startgrp'
  242. * may start anywhere
  243. */
  244. if (*curgrp >= last_grp)
  245. *curgrp = 0;
  246. grp_spread_init_one(&masks[*curgrp], nmsk, cpus_per_grp);
  247. }
  248. }
  249. static int alloc_cluster_groups(unsigned int ncpus,
  250. unsigned int ngroups,
  251. struct cpumask *node_cpumask,
  252. cpumask_var_t msk,
  253. const struct cpumask ***clusters_ptr,
  254. struct node_groups **cluster_groups_ptr)
  255. {
  256. unsigned int ncluster = 0;
  257. unsigned int cpu, nc, n;
  258. const struct cpumask *cluster_mask;
  259. const struct cpumask **clusters;
  260. struct node_groups *cluster_groups;
  261. cpumask_copy(msk, node_cpumask);
  262. /* Probe how many clusters in this node. */
  263. while (1) {
  264. cpu = cpumask_first(msk);
  265. if (cpu >= nr_cpu_ids)
  266. break;
  267. cluster_mask = topology_cluster_cpumask(cpu);
  268. if (!cpumask_weight(cluster_mask))
  269. goto no_cluster;
  270. /* Clean out CPUs on the same cluster. */
  271. cpumask_andnot(msk, msk, cluster_mask);
  272. ncluster++;
  273. }
  274. /* If ngroups < ncluster, cross cluster is inevitable, skip. */
  275. if (ncluster == 0 || ncluster > ngroups)
  276. goto no_cluster;
  277. /* Allocate memory based on cluster number. */
  278. clusters = kzalloc_objs(*clusters, ncluster);
  279. if (!clusters)
  280. goto no_cluster;
  281. cluster_groups = kzalloc_objs(struct node_groups, ncluster);
  282. if (!cluster_groups)
  283. goto fail_cluster_groups;
  284. /* Filling cluster info for later process. */
  285. cpumask_copy(msk, node_cpumask);
  286. for (n = 0; n < ncluster; n++) {
  287. cpu = cpumask_first(msk);
  288. cluster_mask = topology_cluster_cpumask(cpu);
  289. nc = cpumask_weight_and(cluster_mask, node_cpumask);
  290. clusters[n] = cluster_mask;
  291. cluster_groups[n].id = n;
  292. cluster_groups[n].ncpus = nc;
  293. cpumask_andnot(msk, msk, cluster_mask);
  294. }
  295. alloc_groups_to_nodes(ngroups, ncpus, cluster_groups, ncluster);
  296. *clusters_ptr = clusters;
  297. *cluster_groups_ptr = cluster_groups;
  298. return ncluster;
  299. fail_cluster_groups:
  300. kfree(clusters);
  301. no_cluster:
  302. return 0;
  303. }
  304. /*
  305. * Try group CPUs evenly for cluster locality within a NUMA node.
  306. *
  307. * Return: true if success, false otherwise.
  308. */
  309. static bool __try_group_cluster_cpus(unsigned int ncpus,
  310. unsigned int ngroups,
  311. struct cpumask *node_cpumask,
  312. struct cpumask *masks,
  313. unsigned int *curgrp,
  314. unsigned int last_grp)
  315. {
  316. struct node_groups *cluster_groups;
  317. const struct cpumask **clusters;
  318. unsigned int ncluster;
  319. bool ret = false;
  320. cpumask_var_t nmsk;
  321. unsigned int i, nc;
  322. if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
  323. goto fail_nmsk_alloc;
  324. ncluster = alloc_cluster_groups(ncpus, ngroups, node_cpumask, nmsk,
  325. &clusters, &cluster_groups);
  326. if (ncluster == 0)
  327. goto fail_no_clusters;
  328. for (i = 0; i < ncluster; i++) {
  329. struct node_groups *nv = &cluster_groups[i];
  330. /* Get the cpus on this cluster. */
  331. cpumask_and(nmsk, node_cpumask, clusters[nv->id]);
  332. nc = cpumask_weight(nmsk);
  333. if (!nc)
  334. continue;
  335. WARN_ON_ONCE(nv->ngroups > nc);
  336. assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
  337. }
  338. ret = true;
  339. kfree(cluster_groups);
  340. kfree(clusters);
  341. fail_no_clusters:
  342. free_cpumask_var(nmsk);
  343. fail_nmsk_alloc:
  344. return ret;
  345. }
  346. static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
  347. cpumask_var_t *node_to_cpumask,
  348. const struct cpumask *cpu_mask,
  349. struct cpumask *nmsk, struct cpumask *masks)
  350. {
  351. unsigned int i, n, nodes, done = 0;
  352. unsigned int last_grp = numgrps;
  353. unsigned int curgrp = startgrp;
  354. nodemask_t nodemsk = NODE_MASK_NONE;
  355. struct node_groups *node_groups;
  356. if (cpumask_empty(cpu_mask))
  357. return 0;
  358. nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);
  359. /*
  360. * If the number of nodes in the mask is greater than or equal the
  361. * number of groups we just spread the groups across the nodes.
  362. */
  363. if (numgrps <= nodes) {
  364. for_each_node_mask(n, nodemsk) {
  365. /* Ensure that only CPUs which are in both masks are set */
  366. cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
  367. cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
  368. if (++curgrp == last_grp)
  369. curgrp = 0;
  370. }
  371. return numgrps;
  372. }
  373. node_groups = kzalloc_objs(struct node_groups, nr_node_ids);
  374. if (!node_groups)
  375. return -ENOMEM;
  376. /* allocate group number for each node */
  377. alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
  378. nodemsk, nmsk, node_groups);
  379. for (i = 0; i < nr_node_ids; i++) {
  380. unsigned int ncpus;
  381. struct node_groups *nv = &node_groups[i];
  382. if (nv->ngroups == UINT_MAX)
  383. continue;
  384. /* Get the cpus on this node which are in the mask */
  385. cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
  386. ncpus = cpumask_weight(nmsk);
  387. if (!ncpus)
  388. continue;
  389. WARN_ON_ONCE(nv->ngroups > ncpus);
  390. if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk,
  391. masks, &curgrp, last_grp)) {
  392. done += nv->ngroups;
  393. continue;
  394. }
  395. assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp,
  396. last_grp);
  397. done += nv->ngroups;
  398. }
  399. kfree(node_groups);
  400. return done;
  401. }
  402. /**
  403. * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
  404. * @numgrps: number of groups
  405. * @nummasks: number of initialized cpumasks
  406. *
  407. * Return: cpumask array if successful, NULL otherwise. And each element
  408. * includes CPUs assigned to this group. nummasks contains the number
  409. * of initialized masks which can be less than numgrps.
  410. *
  411. * Try to put close CPUs from viewpoint of CPU and NUMA locality into
  412. * same group, and run two-stage grouping:
  413. * 1) allocate present CPUs on these groups evenly first
  414. * 2) allocate other possible CPUs on these groups evenly
  415. *
  416. * We guarantee in the resulted grouping that all CPUs are covered, and
  417. * no same CPU is assigned to multiple groups
  418. */
  419. struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
  420. {
  421. unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
  422. cpumask_var_t *node_to_cpumask;
  423. cpumask_var_t nmsk, npresmsk;
  424. int ret = -ENOMEM;
  425. struct cpumask *masks = NULL;
  426. if (numgrps == 0)
  427. return NULL;
  428. if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
  429. return NULL;
  430. if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
  431. goto fail_nmsk;
  432. node_to_cpumask = alloc_node_to_cpumask();
  433. if (!node_to_cpumask)
  434. goto fail_npresmsk;
  435. masks = kzalloc_objs(*masks, numgrps);
  436. if (!masks)
  437. goto fail_node_to_cpumask;
  438. build_node_to_cpumask(node_to_cpumask);
  439. /*
  440. * Make a local cache of 'cpu_present_mask', so the two stages
  441. * spread can observe consistent 'cpu_present_mask' without holding
  442. * cpu hotplug lock, then we can reduce deadlock risk with cpu
  443. * hotplug code.
  444. *
  445. * Here CPU hotplug may happen when reading `cpu_present_mask`, and
  446. * we can live with the case because it only affects that hotplug
  447. * CPU is handled in the 1st or 2nd stage, and either way is correct
  448. * from API user viewpoint since 2-stage spread is sort of
  449. * optimization.
  450. */
  451. cpumask_copy(npresmsk, data_race(cpu_present_mask));
  452. /* grouping present CPUs first */
  453. ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
  454. npresmsk, nmsk, masks);
  455. if (ret < 0)
  456. goto fail_node_to_cpumask;
  457. nr_present = ret;
  458. /*
  459. * Allocate non present CPUs starting from the next group to be
  460. * handled. If the grouping of present CPUs already exhausted the
  461. * group space, assign the non present CPUs to the already
  462. * allocated out groups.
  463. */
  464. if (nr_present >= numgrps)
  465. curgrp = 0;
  466. else
  467. curgrp = nr_present;
  468. cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
  469. ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
  470. npresmsk, nmsk, masks);
  471. if (ret >= 0)
  472. nr_others = ret;
  473. fail_node_to_cpumask:
  474. free_node_to_cpumask(node_to_cpumask);
  475. fail_npresmsk:
  476. free_cpumask_var(npresmsk);
  477. fail_nmsk:
  478. free_cpumask_var(nmsk);
  479. if (ret < 0) {
  480. kfree(masks);
  481. return NULL;
  482. }
  483. *nummasks = min(nr_present + nr_others, numgrps);
  484. return masks;
  485. }
  486. #else /* CONFIG_SMP */
  487. struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
  488. {
  489. struct cpumask *masks;
  490. if (numgrps == 0)
  491. return NULL;
  492. masks = kzalloc_objs(*masks, numgrps);
  493. if (!masks)
  494. return NULL;
  495. /* assign all CPUs(cpu 0) to the 1st group only */
  496. cpumask_copy(&masks[0], cpu_possible_mask);
  497. *nummasks = 1;
  498. return masks;
  499. }
  500. #endif /* CONFIG_SMP */
  501. EXPORT_SYMBOL_GPL(group_cpus_evenly);