cpuset-v1.c 23 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. #include "cgroup-internal.h"
  3. #include "cpuset-internal.h"
  4. /*
  5. * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
  6. */
  7. struct cpuset_remove_tasks_struct {
  8. struct work_struct work;
  9. struct cpuset *cs;
  10. };
  11. /*
  12. * Frequency meter - How fast is some event occurring?
  13. *
  14. * These routines manage a digitally filtered, constant time based,
  15. * event frequency meter. There are four routines:
  16. * fmeter_init() - initialize a frequency meter.
  17. * fmeter_markevent() - called each time the event happens.
  18. * fmeter_getrate() - returns the recent rate of such events.
  19. * fmeter_update() - internal routine used to update fmeter.
  20. *
  21. * A common data structure is passed to each of these routines,
  22. * which is used to keep track of the state required to manage the
  23. * frequency meter and its digital filter.
  24. *
  25. * The filter works on the number of events marked per unit time.
  26. * The filter is single-pole low-pass recursive (IIR). The time unit
  27. * is 1 second. Arithmetic is done using 32-bit integers scaled to
  28. * simulate 3 decimal digits of precision (multiplied by 1000).
  29. *
  30. * With an FM_COEF of 933, and a time base of 1 second, the filter
  31. * has a half-life of 10 seconds, meaning that if the events quit
  32. * happening, then the rate returned from the fmeter_getrate()
  33. * will be cut in half each 10 seconds, until it converges to zero.
  34. *
  35. * It is not worth doing a real infinitely recursive filter. If more
  36. * than FM_MAXTICKS ticks have elapsed since the last filter event,
  37. * just compute FM_MAXTICKS ticks worth, by which point the level
  38. * will be stable.
  39. *
  40. * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
  41. * arithmetic overflow in the fmeter_update() routine.
  42. *
  43. * Given the simple 32 bit integer arithmetic used, this meter works
  44. * best for reporting rates between one per millisecond (msec) and
  45. * one per 32 (approx) seconds. At constant rates faster than one
  46. * per msec it maxes out at values just under 1,000,000. At constant
  47. * rates between one per msec, and one per second it will stabilize
  48. * to a value N*1000, where N is the rate of events per second.
  49. * At constant rates between one per second and one per 32 seconds,
  50. * it will be choppy, moving up on the seconds that have an event,
  51. * and then decaying until the next event. At rates slower than
  52. * about one in 32 seconds, it decays all the way back to zero between
  53. * each event.
  54. */
  55. #define FM_COEF 933 /* coefficient for half-life of 10 secs */
  56. #define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */
  57. #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
  58. #define FM_SCALE 1000 /* faux fixed point scale */
  59. /* Initialize a frequency meter */
  60. static void fmeter_init(struct fmeter *fmp)
  61. {
  62. fmp->cnt = 0;
  63. fmp->val = 0;
  64. fmp->time = 0;
  65. spin_lock_init(&fmp->lock);
  66. }
  67. /* Internal meter update - process cnt events and update value */
  68. static void fmeter_update(struct fmeter *fmp)
  69. {
  70. time64_t now;
  71. u32 ticks;
  72. now = ktime_get_seconds();
  73. ticks = now - fmp->time;
  74. if (ticks == 0)
  75. return;
  76. ticks = min(FM_MAXTICKS, ticks);
  77. while (ticks-- > 0)
  78. fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
  79. fmp->time = now;
  80. fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
  81. fmp->cnt = 0;
  82. }
  83. /* Process any previous ticks, then bump cnt by one (times scale). */
  84. static void fmeter_markevent(struct fmeter *fmp)
  85. {
  86. spin_lock(&fmp->lock);
  87. fmeter_update(fmp);
  88. fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
  89. spin_unlock(&fmp->lock);
  90. }
  91. /* Process any previous ticks, then return current value. */
  92. static int fmeter_getrate(struct fmeter *fmp)
  93. {
  94. int val;
  95. spin_lock(&fmp->lock);
  96. fmeter_update(fmp);
  97. val = fmp->val;
  98. spin_unlock(&fmp->lock);
  99. return val;
  100. }
  101. /*
  102. * Collection of memory_pressure is suppressed unless
  103. * this flag is enabled by writing "1" to the special
  104. * cpuset file 'memory_pressure_enabled' in the root cpuset.
  105. */
  106. int cpuset_memory_pressure_enabled __read_mostly;
  107. /*
  108. * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
  109. *
  110. * Keep a running average of the rate of synchronous (direct)
  111. * page reclaim efforts initiated by tasks in each cpuset.
  112. *
  113. * This represents the rate at which some task in the cpuset
  114. * ran low on memory on all nodes it was allowed to use, and
  115. * had to enter the kernels page reclaim code in an effort to
  116. * create more free memory by tossing clean pages or swapping
  117. * or writing dirty pages.
  118. *
  119. * Display to user space in the per-cpuset read-only file
  120. * "memory_pressure". Value displayed is an integer
  121. * representing the recent rate of entry into the synchronous
  122. * (direct) page reclaim by any task attached to the cpuset.
  123. */
  124. void __cpuset_memory_pressure_bump(void)
  125. {
  126. rcu_read_lock();
  127. fmeter_markevent(&task_cs(current)->fmeter);
  128. rcu_read_unlock();
  129. }
  130. static int update_relax_domain_level(struct cpuset *cs, s64 val)
  131. {
  132. #ifdef CONFIG_SMP
  133. if (val < -1 || val > sched_domain_level_max + 1)
  134. return -EINVAL;
  135. #endif
  136. if (val != cs->relax_domain_level) {
  137. cs->relax_domain_level = val;
  138. if (!cpumask_empty(cs->cpus_allowed) &&
  139. is_sched_load_balance(cs))
  140. rebuild_sched_domains_locked();
  141. }
  142. return 0;
  143. }
  144. static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
  145. s64 val)
  146. {
  147. struct cpuset *cs = css_cs(css);
  148. cpuset_filetype_t type = cft->private;
  149. int retval = -ENODEV;
  150. cpuset_full_lock();
  151. if (!is_cpuset_online(cs))
  152. goto out_unlock;
  153. switch (type) {
  154. case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  155. pr_info_once("cpuset.%s is deprecated\n", cft->name);
  156. retval = update_relax_domain_level(cs, val);
  157. break;
  158. default:
  159. retval = -EINVAL;
  160. break;
  161. }
  162. out_unlock:
  163. cpuset_full_unlock();
  164. return retval;
  165. }
  166. static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
  167. {
  168. struct cpuset *cs = css_cs(css);
  169. cpuset_filetype_t type = cft->private;
  170. switch (type) {
  171. case FILE_SCHED_RELAX_DOMAIN_LEVEL:
  172. return cs->relax_domain_level;
  173. default:
  174. BUG();
  175. }
  176. /* Unreachable but makes gcc happy */
  177. return 0;
  178. }
  179. /*
  180. * update task's spread flag if cpuset's page/slab spread flag is set
  181. *
  182. * Call with callback_lock or cpuset_mutex held. The check can be skipped
  183. * if on default hierarchy.
  184. */
  185. void cpuset1_update_task_spread_flags(struct cpuset *cs,
  186. struct task_struct *tsk)
  187. {
  188. if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
  189. return;
  190. if (is_spread_page(cs))
  191. task_set_spread_page(tsk);
  192. else
  193. task_clear_spread_page(tsk);
  194. if (is_spread_slab(cs))
  195. task_set_spread_slab(tsk);
  196. else
  197. task_clear_spread_slab(tsk);
  198. }
  199. /**
  200. * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
  201. * @cs: the cpuset in which each task's spread flags needs to be changed
  202. *
  203. * Iterate through each task of @cs updating its spread flags. As this
  204. * function is called with cpuset_mutex held, cpuset membership stays
  205. * stable.
  206. */
  207. void cpuset1_update_tasks_flags(struct cpuset *cs)
  208. {
  209. struct css_task_iter it;
  210. struct task_struct *task;
  211. css_task_iter_start(&cs->css, 0, &it);
  212. while ((task = css_task_iter_next(&it)))
  213. cpuset1_update_task_spread_flags(cs, task);
  214. css_task_iter_end(&it);
  215. }
  216. /*
  217. * If CPU and/or memory hotplug handlers, below, unplug any CPUs
  218. * or memory nodes, we need to walk over the cpuset hierarchy,
  219. * removing that CPU or node from all cpusets. If this removes the
  220. * last CPU or node from a cpuset, then move the tasks in the empty
  221. * cpuset to its next-highest non-empty parent.
  222. */
  223. static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  224. {
  225. struct cpuset *parent;
  226. /*
  227. * Find its next-highest non-empty parent, (top cpuset
  228. * has online cpus, so can't be empty).
  229. */
  230. parent = parent_cs(cs);
  231. while (cpumask_empty(parent->cpus_allowed) ||
  232. nodes_empty(parent->mems_allowed))
  233. parent = parent_cs(parent);
  234. if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
  235. pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
  236. pr_cont_cgroup_name(cs->css.cgroup);
  237. pr_cont("\n");
  238. }
  239. }
  240. static void cpuset_migrate_tasks_workfn(struct work_struct *work)
  241. {
  242. struct cpuset_remove_tasks_struct *s;
  243. s = container_of(work, struct cpuset_remove_tasks_struct, work);
  244. remove_tasks_in_empty_cpuset(s->cs);
  245. css_put(&s->cs->css);
  246. kfree(s);
  247. }
  248. void cpuset1_hotplug_update_tasks(struct cpuset *cs,
  249. struct cpumask *new_cpus, nodemask_t *new_mems,
  250. bool cpus_updated, bool mems_updated)
  251. {
  252. bool is_empty;
  253. cpuset_callback_lock_irq();
  254. cpumask_copy(cs->cpus_allowed, new_cpus);
  255. cpumask_copy(cs->effective_cpus, new_cpus);
  256. cs->mems_allowed = *new_mems;
  257. cs->effective_mems = *new_mems;
  258. cpuset_callback_unlock_irq();
  259. /*
  260. * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
  261. * as the tasks will be migrated to an ancestor.
  262. */
  263. if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
  264. cpuset_update_tasks_cpumask(cs, new_cpus);
  265. if (mems_updated && !nodes_empty(cs->mems_allowed))
  266. cpuset_update_tasks_nodemask(cs);
  267. is_empty = cpumask_empty(cs->cpus_allowed) ||
  268. nodes_empty(cs->mems_allowed);
  269. /*
  270. * Move tasks to the nearest ancestor with execution resources,
  271. * This is full cgroup operation which will also call back into
  272. * cpuset. Execute it asynchronously using workqueue.
  273. */
  274. if (is_empty && cs->css.cgroup->nr_populated_csets &&
  275. css_tryget_online(&cs->css)) {
  276. struct cpuset_remove_tasks_struct *s;
  277. s = kzalloc_obj(*s);
  278. if (WARN_ON_ONCE(!s)) {
  279. css_put(&cs->css);
  280. return;
  281. }
  282. s->cs = cs;
  283. INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
  284. schedule_work(&s->work);
  285. }
  286. }
  287. /*
  288. * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
  289. *
  290. * One cpuset is a subset of another if all its allowed CPUs and
  291. * Memory Nodes are a subset of the other, and its exclusive flags
  292. * are only set if the other's are set. Call holding cpuset_mutex.
  293. */
  294. static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
  295. {
  296. return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
  297. nodes_subset(p->mems_allowed, q->mems_allowed) &&
  298. is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
  299. is_mem_exclusive(p) <= is_mem_exclusive(q);
  300. }
  301. /*
  302. * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
  303. * behavior.
  304. */
  305. int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
  306. {
  307. struct cgroup_subsys_state *css;
  308. struct cpuset *c, *par;
  309. int ret;
  310. WARN_ON_ONCE(!rcu_read_lock_held());
  311. /* Each of our child cpusets must be a subset of us */
  312. ret = -EBUSY;
  313. cpuset_for_each_child(c, css, cur)
  314. if (!is_cpuset_subset(c, trial))
  315. goto out;
  316. /* On legacy hierarchy, we must be a subset of our parent cpuset. */
  317. ret = -EACCES;
  318. par = parent_cs(cur);
  319. if (par && !is_cpuset_subset(trial, par))
  320. goto out;
  321. /*
  322. * Cpusets with tasks - existing or newly being attached - can't
  323. * be changed to have empty cpus_allowed or mems_allowed.
  324. */
  325. ret = -ENOSPC;
  326. if (cpuset_is_populated(cur)) {
  327. if (!cpumask_empty(cur->cpus_allowed) &&
  328. cpumask_empty(trial->cpus_allowed))
  329. goto out;
  330. if (!nodes_empty(cur->mems_allowed) &&
  331. nodes_empty(trial->mems_allowed))
  332. goto out;
  333. }
  334. ret = 0;
  335. out:
  336. return ret;
  337. }
  338. /*
  339. * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts
  340. * to legacy (v1)
  341. * @cs1: first cpuset to check
  342. * @cs2: second cpuset to check
  343. *
  344. * Returns: true if CPU exclusivity conflict exists, false otherwise
  345. *
  346. * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
  347. */
  348. bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
  349. {
  350. if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
  351. return cpumask_intersects(cs1->cpus_allowed,
  352. cs2->cpus_allowed);
  353. return false;
  354. }
  355. #ifdef CONFIG_PROC_PID_CPUSET
  356. /*
  357. * proc_cpuset_show()
  358. * - Print tasks cpuset path into seq_file.
  359. * - Used for /proc/<pid>/cpuset.
  360. */
  361. int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
  362. struct pid *pid, struct task_struct *tsk)
  363. {
  364. char *buf;
  365. struct cgroup_subsys_state *css;
  366. int retval;
  367. retval = -ENOMEM;
  368. buf = kmalloc(PATH_MAX, GFP_KERNEL);
  369. if (!buf)
  370. goto out;
  371. rcu_read_lock();
  372. spin_lock_irq(&css_set_lock);
  373. css = task_css(tsk, cpuset_cgrp_id);
  374. retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
  375. current->nsproxy->cgroup_ns);
  376. spin_unlock_irq(&css_set_lock);
  377. rcu_read_unlock();
  378. if (retval == -E2BIG)
  379. retval = -ENAMETOOLONG;
  380. if (retval < 0)
  381. goto out_free;
  382. seq_puts(m, buf);
  383. seq_putc(m, '\n');
  384. retval = 0;
  385. out_free:
  386. kfree(buf);
  387. out:
  388. return retval;
  389. }
  390. #endif /* CONFIG_PROC_PID_CPUSET */
  391. static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
  392. {
  393. struct cpuset *cs = css_cs(css);
  394. cpuset_filetype_t type = cft->private;
  395. switch (type) {
  396. case FILE_CPU_EXCLUSIVE:
  397. return is_cpu_exclusive(cs);
  398. case FILE_MEM_EXCLUSIVE:
  399. return is_mem_exclusive(cs);
  400. case FILE_MEM_HARDWALL:
  401. return is_mem_hardwall(cs);
  402. case FILE_SCHED_LOAD_BALANCE:
  403. return is_sched_load_balance(cs);
  404. case FILE_MEMORY_MIGRATE:
  405. return is_memory_migrate(cs);
  406. case FILE_MEMORY_PRESSURE_ENABLED:
  407. return cpuset_memory_pressure_enabled;
  408. case FILE_MEMORY_PRESSURE:
  409. return fmeter_getrate(&cs->fmeter);
  410. case FILE_SPREAD_PAGE:
  411. return is_spread_page(cs);
  412. case FILE_SPREAD_SLAB:
  413. return is_spread_slab(cs);
  414. default:
  415. BUG();
  416. }
  417. /* Unreachable but makes gcc happy */
  418. return 0;
  419. }
  420. static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
  421. u64 val)
  422. {
  423. struct cpuset *cs = css_cs(css);
  424. cpuset_filetype_t type = cft->private;
  425. int retval = 0;
  426. cpuset_full_lock();
  427. if (!is_cpuset_online(cs)) {
  428. retval = -ENODEV;
  429. goto out_unlock;
  430. }
  431. switch (type) {
  432. case FILE_CPU_EXCLUSIVE:
  433. retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
  434. break;
  435. case FILE_MEM_EXCLUSIVE:
  436. pr_info_once("cpuset.%s is deprecated\n", cft->name);
  437. retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
  438. break;
  439. case FILE_MEM_HARDWALL:
  440. pr_info_once("cpuset.%s is deprecated\n", cft->name);
  441. retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
  442. break;
  443. case FILE_SCHED_LOAD_BALANCE:
  444. pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
  445. retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
  446. break;
  447. case FILE_MEMORY_MIGRATE:
  448. pr_info_once("cpuset.%s is deprecated\n", cft->name);
  449. retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
  450. break;
  451. case FILE_MEMORY_PRESSURE_ENABLED:
  452. pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
  453. cpuset_memory_pressure_enabled = !!val;
  454. break;
  455. case FILE_SPREAD_PAGE:
  456. pr_info_once("cpuset.%s is deprecated\n", cft->name);
  457. retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
  458. break;
  459. case FILE_SPREAD_SLAB:
  460. pr_warn_once("cpuset.%s is deprecated\n", cft->name);
  461. retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
  462. break;
  463. default:
  464. retval = -EINVAL;
  465. break;
  466. }
  467. out_unlock:
  468. cpuset_full_unlock();
  469. return retval;
  470. }
  471. void cpuset1_init(struct cpuset *cs)
  472. {
  473. fmeter_init(&cs->fmeter);
  474. cs->relax_domain_level = -1;
  475. }
  476. void cpuset1_online_css(struct cgroup_subsys_state *css)
  477. {
  478. struct cpuset *tmp_cs;
  479. struct cgroup_subsys_state *pos_css;
  480. struct cpuset *cs = css_cs(css);
  481. struct cpuset *parent = parent_cs(cs);
  482. lockdep_assert_cpus_held();
  483. lockdep_assert_cpuset_lock_held();
  484. if (is_spread_page(parent))
  485. set_bit(CS_SPREAD_PAGE, &cs->flags);
  486. if (is_spread_slab(parent))
  487. set_bit(CS_SPREAD_SLAB, &cs->flags);
  488. if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
  489. return;
  490. /*
  491. * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
  492. * set. This flag handling is implemented in cgroup core for
  493. * historical reasons - the flag may be specified during mount.
  494. *
  495. * Currently, if any sibling cpusets have exclusive cpus or mem, we
  496. * refuse to clone the configuration - thereby refusing the task to
  497. * be entered, and as a result refusing the sys_unshare() or
  498. * clone() which initiated it. If this becomes a problem for some
  499. * users who wish to allow that scenario, then this could be
  500. * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
  501. * (and likewise for mems) to the new cgroup.
  502. */
  503. rcu_read_lock();
  504. cpuset_for_each_child(tmp_cs, pos_css, parent) {
  505. if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
  506. rcu_read_unlock();
  507. return;
  508. }
  509. }
  510. rcu_read_unlock();
  511. cpuset_callback_lock_irq();
  512. cs->mems_allowed = parent->mems_allowed;
  513. cs->effective_mems = parent->mems_allowed;
  514. cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
  515. cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
  516. cpuset_callback_unlock_irq();
  517. }
  518. static void
  519. update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  520. {
  521. if (dattr->relax_domain_level < c->relax_domain_level)
  522. dattr->relax_domain_level = c->relax_domain_level;
  523. }
  524. static void update_domain_attr_tree(struct sched_domain_attr *dattr,
  525. struct cpuset *root_cs)
  526. {
  527. struct cpuset *cp;
  528. struct cgroup_subsys_state *pos_css;
  529. rcu_read_lock();
  530. cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
  531. /* skip the whole subtree if @cp doesn't have any CPU */
  532. if (cpumask_empty(cp->cpus_allowed)) {
  533. pos_css = css_rightmost_descendant(pos_css);
  534. continue;
  535. }
  536. if (is_sched_load_balance(cp))
  537. update_domain_attr(dattr, cp);
  538. }
  539. rcu_read_unlock();
  540. }
  541. /*
  542. * cpuset1_generate_sched_domains()
  543. *
  544. * Finding the best partition (set of domains):
  545. * The double nested loops below over i, j scan over the load
  546. * balanced cpusets (using the array of cpuset pointers in csa[])
  547. * looking for pairs of cpusets that have overlapping cpus_allowed
  548. * and merging them using a union-find algorithm.
  549. *
  550. * The union of the cpus_allowed masks from the set of all cpusets
  551. * having the same root then form the one element of the partition
  552. * (one sched domain) to be passed to partition_sched_domains().
  553. */
  554. int cpuset1_generate_sched_domains(cpumask_var_t **domains,
  555. struct sched_domain_attr **attributes)
  556. {
  557. struct cpuset *cp; /* top-down scan of cpusets */
  558. struct cpuset **csa; /* array of all cpuset ptrs */
  559. int csn; /* how many cpuset ptrs in csa so far */
  560. int i, j; /* indices for partition finding loops */
  561. cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
  562. struct sched_domain_attr *dattr; /* attributes for custom domains */
  563. int ndoms = 0; /* number of sched domains in result */
  564. int nslot; /* next empty doms[] struct cpumask slot */
  565. struct cgroup_subsys_state *pos_css;
  566. int nslot_update;
  567. lockdep_assert_cpuset_lock_held();
  568. doms = NULL;
  569. dattr = NULL;
  570. csa = NULL;
  571. /* Special case for the 99% of systems with one, full, sched domain */
  572. if (is_sched_load_balance(&top_cpuset)) {
  573. ndoms = 1;
  574. doms = alloc_sched_domains(ndoms);
  575. if (!doms)
  576. goto done;
  577. dattr = kmalloc_obj(struct sched_domain_attr);
  578. if (dattr) {
  579. *dattr = SD_ATTR_INIT;
  580. update_domain_attr_tree(dattr, &top_cpuset);
  581. }
  582. cpumask_and(doms[0], top_cpuset.effective_cpus,
  583. housekeeping_cpumask(HK_TYPE_DOMAIN));
  584. goto done;
  585. }
  586. csa = kmalloc_objs(cp, nr_cpusets());
  587. if (!csa)
  588. goto done;
  589. csn = 0;
  590. rcu_read_lock();
  591. cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
  592. if (cp == &top_cpuset)
  593. continue;
  594. /*
  595. * Continue traversing beyond @cp iff @cp has some CPUs and
  596. * isn't load balancing. The former is obvious. The
  597. * latter: All child cpusets contain a subset of the
  598. * parent's cpus, so just skip them, and then we call
  599. * update_domain_attr_tree() to calc relax_domain_level of
  600. * the corresponding sched domain.
  601. */
  602. if (!cpumask_empty(cp->cpus_allowed) &&
  603. !(is_sched_load_balance(cp) &&
  604. cpumask_intersects(cp->cpus_allowed,
  605. housekeeping_cpumask(HK_TYPE_DOMAIN))))
  606. continue;
  607. if (is_sched_load_balance(cp) &&
  608. !cpumask_empty(cp->effective_cpus))
  609. csa[csn++] = cp;
  610. /* skip @cp's subtree */
  611. pos_css = css_rightmost_descendant(pos_css);
  612. continue;
  613. }
  614. rcu_read_unlock();
  615. for (i = 0; i < csn; i++)
  616. uf_node_init(&csa[i]->node);
  617. /* Merge overlapping cpusets */
  618. for (i = 0; i < csn; i++) {
  619. for (j = i + 1; j < csn; j++) {
  620. if (cpusets_overlap(csa[i], csa[j]))
  621. uf_union(&csa[i]->node, &csa[j]->node);
  622. }
  623. }
  624. /* Count the total number of domains */
  625. for (i = 0; i < csn; i++) {
  626. if (uf_find(&csa[i]->node) == &csa[i]->node)
  627. ndoms++;
  628. }
  629. /*
  630. * Now we know how many domains to create.
  631. * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
  632. */
  633. doms = alloc_sched_domains(ndoms);
  634. if (!doms)
  635. goto done;
  636. /*
  637. * The rest of the code, including the scheduler, can deal with
  638. * dattr==NULL case. No need to abort if alloc fails.
  639. */
  640. dattr = kmalloc_objs(struct sched_domain_attr, ndoms);
  641. for (nslot = 0, i = 0; i < csn; i++) {
  642. nslot_update = 0;
  643. for (j = i; j < csn; j++) {
  644. if (uf_find(&csa[j]->node) == &csa[i]->node) {
  645. struct cpumask *dp = doms[nslot];
  646. if (i == j) {
  647. nslot_update = 1;
  648. cpumask_clear(dp);
  649. if (dattr)
  650. *(dattr + nslot) = SD_ATTR_INIT;
  651. }
  652. cpumask_or(dp, dp, csa[j]->effective_cpus);
  653. cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
  654. if (dattr)
  655. update_domain_attr_tree(dattr + nslot, csa[j]);
  656. }
  657. }
  658. if (nslot_update)
  659. nslot++;
  660. }
  661. BUG_ON(nslot != ndoms);
  662. done:
  663. kfree(csa);
  664. /*
  665. * Fallback to the default domain if kmalloc() failed.
  666. * See comments in partition_sched_domains().
  667. */
  668. if (doms == NULL)
  669. ndoms = 1;
  670. *domains = doms;
  671. *attributes = dattr;
  672. return ndoms;
  673. }
  674. /*
  675. * for the common functions, 'private' gives the type of file
  676. */
  677. struct cftype cpuset1_files[] = {
  678. {
  679. .name = "cpus",
  680. .seq_show = cpuset_common_seq_show,
  681. .write = cpuset_write_resmask,
  682. .max_write_len = (100U + 6 * NR_CPUS),
  683. .private = FILE_CPULIST,
  684. },
  685. {
  686. .name = "mems",
  687. .seq_show = cpuset_common_seq_show,
  688. .write = cpuset_write_resmask,
  689. .max_write_len = (100U + 6 * MAX_NUMNODES),
  690. .private = FILE_MEMLIST,
  691. },
  692. {
  693. .name = "effective_cpus",
  694. .seq_show = cpuset_common_seq_show,
  695. .private = FILE_EFFECTIVE_CPULIST,
  696. },
  697. {
  698. .name = "effective_mems",
  699. .seq_show = cpuset_common_seq_show,
  700. .private = FILE_EFFECTIVE_MEMLIST,
  701. },
  702. {
  703. .name = "cpu_exclusive",
  704. .read_u64 = cpuset_read_u64,
  705. .write_u64 = cpuset_write_u64,
  706. .private = FILE_CPU_EXCLUSIVE,
  707. },
  708. {
  709. .name = "mem_exclusive",
  710. .read_u64 = cpuset_read_u64,
  711. .write_u64 = cpuset_write_u64,
  712. .private = FILE_MEM_EXCLUSIVE,
  713. },
  714. {
  715. .name = "mem_hardwall",
  716. .read_u64 = cpuset_read_u64,
  717. .write_u64 = cpuset_write_u64,
  718. .private = FILE_MEM_HARDWALL,
  719. },
  720. {
  721. .name = "sched_load_balance",
  722. .read_u64 = cpuset_read_u64,
  723. .write_u64 = cpuset_write_u64,
  724. .private = FILE_SCHED_LOAD_BALANCE,
  725. },
  726. {
  727. .name = "sched_relax_domain_level",
  728. .read_s64 = cpuset_read_s64,
  729. .write_s64 = cpuset_write_s64,
  730. .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
  731. },
  732. {
  733. .name = "memory_migrate",
  734. .read_u64 = cpuset_read_u64,
  735. .write_u64 = cpuset_write_u64,
  736. .private = FILE_MEMORY_MIGRATE,
  737. },
  738. {
  739. .name = "memory_pressure",
  740. .read_u64 = cpuset_read_u64,
  741. .private = FILE_MEMORY_PRESSURE,
  742. },
  743. {
  744. .name = "memory_spread_page",
  745. .read_u64 = cpuset_read_u64,
  746. .write_u64 = cpuset_write_u64,
  747. .private = FILE_SPREAD_PAGE,
  748. },
  749. {
  750. /* obsolete, may be removed in the future */
  751. .name = "memory_spread_slab",
  752. .read_u64 = cpuset_read_u64,
  753. .write_u64 = cpuset_write_u64,
  754. .private = FILE_SPREAD_SLAB,
  755. },
  756. {
  757. .name = "memory_pressure_enabled",
  758. .flags = CFTYPE_ONLY_ON_ROOT,
  759. .read_u64 = cpuset_read_u64,
  760. .write_u64 = cpuset_write_u64,
  761. .private = FILE_MEMORY_PRESSURE_ENABLED,
  762. },
  763. { } /* terminate */
  764. };