| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875 |
- // SPDX-License-Identifier: GPL-2.0-or-later
- #include "cgroup-internal.h"
- #include "cpuset-internal.h"
- /*
- * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchrously
- */
- struct cpuset_remove_tasks_struct {
- struct work_struct work;
- struct cpuset *cs;
- };
- /*
- * Frequency meter - How fast is some event occurring?
- *
- * These routines manage a digitally filtered, constant time based,
- * event frequency meter. There are four routines:
- * fmeter_init() - initialize a frequency meter.
- * fmeter_markevent() - called each time the event happens.
- * fmeter_getrate() - returns the recent rate of such events.
- * fmeter_update() - internal routine used to update fmeter.
- *
- * A common data structure is passed to each of these routines,
- * which is used to keep track of the state required to manage the
- * frequency meter and its digital filter.
- *
- * The filter works on the number of events marked per unit time.
- * The filter is single-pole low-pass recursive (IIR). The time unit
- * is 1 second. Arithmetic is done using 32-bit integers scaled to
- * simulate 3 decimal digits of precision (multiplied by 1000).
- *
- * With an FM_COEF of 933, and a time base of 1 second, the filter
- * has a half-life of 10 seconds, meaning that if the events quit
- * happening, then the rate returned from the fmeter_getrate()
- * will be cut in half each 10 seconds, until it converges to zero.
- *
- * It is not worth doing a real infinitely recursive filter. If more
- * than FM_MAXTICKS ticks have elapsed since the last filter event,
- * just compute FM_MAXTICKS ticks worth, by which point the level
- * will be stable.
- *
- * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
- * arithmetic overflow in the fmeter_update() routine.
- *
- * Given the simple 32 bit integer arithmetic used, this meter works
- * best for reporting rates between one per millisecond (msec) and
- * one per 32 (approx) seconds. At constant rates faster than one
- * per msec it maxes out at values just under 1,000,000. At constant
- * rates between one per msec, and one per second it will stabilize
- * to a value N*1000, where N is the rate of events per second.
- * At constant rates between one per second and one per 32 seconds,
- * it will be choppy, moving up on the seconds that have an event,
- * and then decaying until the next event. At rates slower than
- * about one in 32 seconds, it decays all the way back to zero between
- * each event.
- */
/* Frequency meter tuning; values are fixed point, scaled by FM_SCALE. */
#define FM_SCALE	1000		/* faux fixed point scale */
#define FM_COEF		933		/* coefficient for half-life of 10 secs */
#define FM_MAXTICKS	((u32)99)	/* useless computing more ticks than this */
#define FM_MAXCNT	1000000		/* limit cnt to avoid overflow */
- /* Initialize a frequency meter */
- static void fmeter_init(struct fmeter *fmp)
- {
- fmp->cnt = 0;
- fmp->val = 0;
- fmp->time = 0;
- spin_lock_init(&fmp->lock);
- }
- /* Internal meter update - process cnt events and update value */
- static void fmeter_update(struct fmeter *fmp)
- {
- time64_t now;
- u32 ticks;
- now = ktime_get_seconds();
- ticks = now - fmp->time;
- if (ticks == 0)
- return;
- ticks = min(FM_MAXTICKS, ticks);
- while (ticks-- > 0)
- fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
- fmp->time = now;
- fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
- fmp->cnt = 0;
- }
- /* Process any previous ticks, then bump cnt by one (times scale). */
- static void fmeter_markevent(struct fmeter *fmp)
- {
- spin_lock(&fmp->lock);
- fmeter_update(fmp);
- fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
- spin_unlock(&fmp->lock);
- }
- /* Process any previous ticks, then return current value. */
- static int fmeter_getrate(struct fmeter *fmp)
- {
- int val;
- spin_lock(&fmp->lock);
- fmeter_update(fmp);
- val = fmp->val;
- spin_unlock(&fmp->lock);
- return val;
- }
- /*
- * Collection of memory_pressure is suppressed unless
- * this flag is enabled by writing "1" to the special
- * cpuset file 'memory_pressure_enabled' in the root cpuset.
- */
- int cpuset_memory_pressure_enabled __read_mostly;
- /*
- * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
- *
- * Keep a running average of the rate of synchronous (direct)
- * page reclaim efforts initiated by tasks in each cpuset.
- *
- * This represents the rate at which some task in the cpuset
- * ran low on memory on all nodes it was allowed to use, and
- * had to enter the kernels page reclaim code in an effort to
- * create more free memory by tossing clean pages or swapping
- * or writing dirty pages.
- *
- * Display to user space in the per-cpuset read-only file
- * "memory_pressure". Value displayed is an integer
- * representing the recent rate of entry into the synchronous
- * (direct) page reclaim by any task attached to the cpuset.
- */
- void __cpuset_memory_pressure_bump(void)
- {
- rcu_read_lock();
- fmeter_markevent(&task_cs(current)->fmeter);
- rcu_read_unlock();
- }
- static int update_relax_domain_level(struct cpuset *cs, s64 val)
- {
- #ifdef CONFIG_SMP
- if (val < -1 || val > sched_domain_level_max + 1)
- return -EINVAL;
- #endif
- if (val != cs->relax_domain_level) {
- cs->relax_domain_level = val;
- if (!cpumask_empty(cs->cpus_allowed) &&
- is_sched_load_balance(cs))
- rebuild_sched_domains_locked();
- }
- return 0;
- }
- static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
- s64 val)
- {
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- int retval = -ENODEV;
- cpuset_full_lock();
- if (!is_cpuset_online(cs))
- goto out_unlock;
- switch (type) {
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- pr_info_once("cpuset.%s is deprecated\n", cft->name);
- retval = update_relax_domain_level(cs, val);
- break;
- default:
- retval = -EINVAL;
- break;
- }
- out_unlock:
- cpuset_full_unlock();
- return retval;
- }
- static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
- {
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- switch (type) {
- case FILE_SCHED_RELAX_DOMAIN_LEVEL:
- return cs->relax_domain_level;
- default:
- BUG();
- }
- /* Unreachable but makes gcc happy */
- return 0;
- }
- /*
- * update task's spread flag if cpuset's page/slab spread flag is set
- *
- * Call with callback_lock or cpuset_mutex held. The check can be skipped
- * if on default hierarchy.
- */
- void cpuset1_update_task_spread_flags(struct cpuset *cs,
- struct task_struct *tsk)
- {
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
- return;
- if (is_spread_page(cs))
- task_set_spread_page(tsk);
- else
- task_clear_spread_page(tsk);
- if (is_spread_slab(cs))
- task_set_spread_slab(tsk);
- else
- task_clear_spread_slab(tsk);
- }
- /**
- * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
- * @cs: the cpuset in which each task's spread flags needs to be changed
- *
- * Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
- * stable.
- */
- void cpuset1_update_tasks_flags(struct cpuset *cs)
- {
- struct css_task_iter it;
- struct task_struct *task;
- css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
- cpuset1_update_task_spread_flags(cs, task);
- css_task_iter_end(&it);
- }
- /*
- * If CPU and/or memory hotplug handlers, below, unplug any CPUs
- * or memory nodes, we need to walk over the cpuset hierarchy,
- * removing that CPU or node from all cpusets. If this removes the
- * last CPU or node from a cpuset, then move the tasks in the empty
- * cpuset to its next-highest non-empty parent.
- */
- static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
- {
- struct cpuset *parent;
- /*
- * Find its next-highest non-empty parent, (top cpuset
- * has online cpus, so can't be empty).
- */
- parent = parent_cs(cs);
- while (cpumask_empty(parent->cpus_allowed) ||
- nodes_empty(parent->mems_allowed))
- parent = parent_cs(parent);
- if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
- pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
- pr_cont_cgroup_name(cs->css.cgroup);
- pr_cont("\n");
- }
- }
- static void cpuset_migrate_tasks_workfn(struct work_struct *work)
- {
- struct cpuset_remove_tasks_struct *s;
- s = container_of(work, struct cpuset_remove_tasks_struct, work);
- remove_tasks_in_empty_cpuset(s->cs);
- css_put(&s->cs->css);
- kfree(s);
- }
- void cpuset1_hotplug_update_tasks(struct cpuset *cs,
- struct cpumask *new_cpus, nodemask_t *new_mems,
- bool cpus_updated, bool mems_updated)
- {
- bool is_empty;
- cpuset_callback_lock_irq();
- cpumask_copy(cs->cpus_allowed, new_cpus);
- cpumask_copy(cs->effective_cpus, new_cpus);
- cs->mems_allowed = *new_mems;
- cs->effective_mems = *new_mems;
- cpuset_callback_unlock_irq();
- /*
- * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migrated to an ancestor.
- */
- if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
- cpuset_update_tasks_cpumask(cs, new_cpus);
- if (mems_updated && !nodes_empty(cs->mems_allowed))
- cpuset_update_tasks_nodemask(cs);
- is_empty = cpumask_empty(cs->cpus_allowed) ||
- nodes_empty(cs->mems_allowed);
- /*
- * Move tasks to the nearest ancestor with execution resources,
- * This is full cgroup operation which will also call back into
- * cpuset. Execute it asynchronously using workqueue.
- */
- if (is_empty && cs->css.cgroup->nr_populated_csets &&
- css_tryget_online(&cs->css)) {
- struct cpuset_remove_tasks_struct *s;
- s = kzalloc_obj(*s);
- if (WARN_ON_ONCE(!s)) {
- css_put(&cs->css);
- return;
- }
- s->cs = cs;
- INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
- schedule_work(&s->work);
- }
- }
- /*
- * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
- *
- * One cpuset is a subset of another if all its allowed CPUs and
- * Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
- */
- static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
- {
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
- nodes_subset(p->mems_allowed, q->mems_allowed) &&
- is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
- is_mem_exclusive(p) <= is_mem_exclusive(q);
- }
- /*
- * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
- * behavior.
- */
- int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
- {
- struct cgroup_subsys_state *css;
- struct cpuset *c, *par;
- int ret;
- WARN_ON_ONCE(!rcu_read_lock_held());
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
- /* On legacy hierarchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- par = parent_cs(cur);
- if (par && !is_cpuset_subset(trial, par))
- goto out;
- /*
- * Cpusets with tasks - existing or newly being attached - can't
- * be changed to have empty cpus_allowed or mems_allowed.
- */
- ret = -ENOSPC;
- if (cpuset_is_populated(cur)) {
- if (!cpumask_empty(cur->cpus_allowed) &&
- cpumask_empty(trial->cpus_allowed))
- goto out;
- if (!nodes_empty(cur->mems_allowed) &&
- nodes_empty(trial->mems_allowed))
- goto out;
- }
- ret = 0;
- out:
- return ret;
- }
- /*
- * cpuset1_cpus_excl_conflict() - Check if two cpusets have exclusive CPU conflicts
- * to legacy (v1)
- * @cs1: first cpuset to check
- * @cs2: second cpuset to check
- *
- * Returns: true if CPU exclusivity conflict exists, false otherwise
- *
- * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
- */
- bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
- {
- if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
- return cpumask_intersects(cs1->cpus_allowed,
- cs2->cpus_allowed);
- return false;
- }
#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print a task's cpuset path into a seq_file.
 *  - Used for /proc/<pid>/cpuset.
 *
 * The path is resolved relative to the cgroup namespace of the current
 * task.  Returns 0 on success, -ENOMEM if the path buffer cannot be
 * allocated, -ENAMETOOLONG if path lookup reports -E2BIG, or any other
 * negative error from the path lookup.
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
		     struct pid *pid, struct task_struct *tsk)
{
	struct cgroup_subsys_state *css;
	char *path;
	int err;

	path = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!path)
		return -ENOMEM;

	rcu_read_lock();
	spin_lock_irq(&css_set_lock);
	css = task_css(tsk, cpuset_cgrp_id);
	err = cgroup_path_ns_locked(css->cgroup, path, PATH_MAX,
				    current->nsproxy->cgroup_ns);
	spin_unlock_irq(&css_set_lock);
	rcu_read_unlock();

	if (err == -E2BIG)
		err = -ENAMETOOLONG;

	if (err >= 0) {
		seq_puts(m, path);
		seq_putc(m, '\n');
		err = 0;
	}

	kfree(path);
	return err;
}
#endif /* CONFIG_PROC_PID_CPUSET */
- static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
- {
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- switch (type) {
- case FILE_CPU_EXCLUSIVE:
- return is_cpu_exclusive(cs);
- case FILE_MEM_EXCLUSIVE:
- return is_mem_exclusive(cs);
- case FILE_MEM_HARDWALL:
- return is_mem_hardwall(cs);
- case FILE_SCHED_LOAD_BALANCE:
- return is_sched_load_balance(cs);
- case FILE_MEMORY_MIGRATE:
- return is_memory_migrate(cs);
- case FILE_MEMORY_PRESSURE_ENABLED:
- return cpuset_memory_pressure_enabled;
- case FILE_MEMORY_PRESSURE:
- return fmeter_getrate(&cs->fmeter);
- case FILE_SPREAD_PAGE:
- return is_spread_page(cs);
- case FILE_SPREAD_SLAB:
- return is_spread_slab(cs);
- default:
- BUG();
- }
- /* Unreachable but makes gcc happy */
- return 0;
- }
- static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
- u64 val)
- {
- struct cpuset *cs = css_cs(css);
- cpuset_filetype_t type = cft->private;
- int retval = 0;
- cpuset_full_lock();
- if (!is_cpuset_online(cs)) {
- retval = -ENODEV;
- goto out_unlock;
- }
- switch (type) {
- case FILE_CPU_EXCLUSIVE:
- retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
- break;
- case FILE_MEM_EXCLUSIVE:
- pr_info_once("cpuset.%s is deprecated\n", cft->name);
- retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
- break;
- case FILE_MEM_HARDWALL:
- pr_info_once("cpuset.%s is deprecated\n", cft->name);
- retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
- break;
- case FILE_SCHED_LOAD_BALANCE:
- pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
- retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
- break;
- case FILE_MEMORY_MIGRATE:
- pr_info_once("cpuset.%s is deprecated\n", cft->name);
- retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
- break;
- case FILE_MEMORY_PRESSURE_ENABLED:
- pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
- cpuset_memory_pressure_enabled = !!val;
- break;
- case FILE_SPREAD_PAGE:
- pr_info_once("cpuset.%s is deprecated\n", cft->name);
- retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
- break;
- case FILE_SPREAD_SLAB:
- pr_warn_once("cpuset.%s is deprecated\n", cft->name);
- retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
- break;
- default:
- retval = -EINVAL;
- break;
- }
- out_unlock:
- cpuset_full_unlock();
- return retval;
- }
- void cpuset1_init(struct cpuset *cs)
- {
- fmeter_init(&cs->fmeter);
- cs->relax_domain_level = -1;
- }
- void cpuset1_online_css(struct cgroup_subsys_state *css)
- {
- struct cpuset *tmp_cs;
- struct cgroup_subsys_state *pos_css;
- struct cpuset *cs = css_cs(css);
- struct cpuset *parent = parent_cs(cs);
- lockdep_assert_cpus_held();
- lockdep_assert_cpuset_lock_held();
- if (is_spread_page(parent))
- set_bit(CS_SPREAD_PAGE, &cs->flags);
- if (is_spread_slab(parent))
- set_bit(CS_SPREAD_SLAB, &cs->flags);
- if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
- return;
- /*
- * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
- * set. This flag handling is implemented in cgroup core for
- * historical reasons - the flag may be specified during mount.
- *
- * Currently, if any sibling cpusets have exclusive cpus or mem, we
- * refuse to clone the configuration - thereby refusing the task to
- * be entered, and as a result refusing the sys_unshare() or
- * clone() which initiated it. If this becomes a problem for some
- * users who wish to allow that scenario, then this could be
- * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
- * (and likewise for mems) to the new cgroup.
- */
- rcu_read_lock();
- cpuset_for_each_child(tmp_cs, pos_css, parent) {
- if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
- rcu_read_unlock();
- return;
- }
- }
- rcu_read_unlock();
- cpuset_callback_lock_irq();
- cs->mems_allowed = parent->mems_allowed;
- cs->effective_mems = parent->mems_allowed;
- cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
- cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
- cpuset_callback_unlock_irq();
- }
- static void
- update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
- {
- if (dattr->relax_domain_level < c->relax_domain_level)
- dattr->relax_domain_level = c->relax_domain_level;
- }
- static void update_domain_attr_tree(struct sched_domain_attr *dattr,
- struct cpuset *root_cs)
- {
- struct cpuset *cp;
- struct cgroup_subsys_state *pos_css;
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
- /* skip the whole subtree if @cp doesn't have any CPU */
- if (cpumask_empty(cp->cpus_allowed)) {
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
- if (is_sched_load_balance(cp))
- update_domain_attr(dattr, cp);
- }
- rcu_read_unlock();
- }
- /*
- * cpuset1_generate_sched_domains()
- *
- * Finding the best partition (set of domains):
- * The double nested loops below over i, j scan over the load
- * balanced cpusets (using the array of cpuset pointers in csa[])
- * looking for pairs of cpusets that have overlapping cpus_allowed
- * and merging them using a union-find algorithm.
- *
- * The union of the cpus_allowed masks from the set of all cpusets
- * having the same root then form the one element of the partition
- * (one sched domain) to be passed to partition_sched_domains().
- */
- int cpuset1_generate_sched_domains(cpumask_var_t **domains,
- struct sched_domain_attr **attributes)
- {
- struct cpuset *cp; /* top-down scan of cpusets */
- struct cpuset **csa; /* array of all cpuset ptrs */
- int csn; /* how many cpuset ptrs in csa so far */
- int i, j; /* indices for partition finding loops */
- cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- struct sched_domain_attr *dattr; /* attributes for custom domains */
- int ndoms = 0; /* number of sched domains in result */
- int nslot; /* next empty doms[] struct cpumask slot */
- struct cgroup_subsys_state *pos_css;
- int nslot_update;
- lockdep_assert_cpuset_lock_held();
- doms = NULL;
- dattr = NULL;
- csa = NULL;
- /* Special case for the 99% of systems with one, full, sched domain */
- if (is_sched_load_balance(&top_cpuset)) {
- ndoms = 1;
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
- dattr = kmalloc_obj(struct sched_domain_attr);
- if (dattr) {
- *dattr = SD_ATTR_INIT;
- update_domain_attr_tree(dattr, &top_cpuset);
- }
- cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_TYPE_DOMAIN));
- goto done;
- }
- csa = kmalloc_objs(cp, nr_cpusets());
- if (!csa)
- goto done;
- csn = 0;
- rcu_read_lock();
- cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
- if (cp == &top_cpuset)
- continue;
- /*
- * Continue traversing beyond @cp iff @cp has some CPUs and
- * isn't load balancing. The former is obvious. The
- * latter: All child cpusets contain a subset of the
- * parent's cpus, so just skip them, and then we call
- * update_domain_attr_tree() to calc relax_domain_level of
- * the corresponding sched domain.
- */
- if (!cpumask_empty(cp->cpus_allowed) &&
- !(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_TYPE_DOMAIN))))
- continue;
- if (is_sched_load_balance(cp) &&
- !cpumask_empty(cp->effective_cpus))
- csa[csn++] = cp;
- /* skip @cp's subtree */
- pos_css = css_rightmost_descendant(pos_css);
- continue;
- }
- rcu_read_unlock();
- for (i = 0; i < csn; i++)
- uf_node_init(&csa[i]->node);
- /* Merge overlapping cpusets */
- for (i = 0; i < csn; i++) {
- for (j = i + 1; j < csn; j++) {
- if (cpusets_overlap(csa[i], csa[j]))
- uf_union(&csa[i]->node, &csa[j]->node);
- }
- }
- /* Count the total number of domains */
- for (i = 0; i < csn; i++) {
- if (uf_find(&csa[i]->node) == &csa[i]->node)
- ndoms++;
- }
- /*
- * Now we know how many domains to create.
- * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
- */
- doms = alloc_sched_domains(ndoms);
- if (!doms)
- goto done;
- /*
- * The rest of the code, including the scheduler, can deal with
- * dattr==NULL case. No need to abort if alloc fails.
- */
- dattr = kmalloc_objs(struct sched_domain_attr, ndoms);
- for (nslot = 0, i = 0; i < csn; i++) {
- nslot_update = 0;
- for (j = i; j < csn; j++) {
- if (uf_find(&csa[j]->node) == &csa[i]->node) {
- struct cpumask *dp = doms[nslot];
- if (i == j) {
- nslot_update = 1;
- cpumask_clear(dp);
- if (dattr)
- *(dattr + nslot) = SD_ATTR_INIT;
- }
- cpumask_or(dp, dp, csa[j]->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
- if (dattr)
- update_domain_attr_tree(dattr + nslot, csa[j]);
- }
- }
- if (nslot_update)
- nslot++;
- }
- BUG_ON(nslot != ndoms);
- done:
- kfree(csa);
- /*
- * Fallback to the default domain if kmalloc() failed.
- * See comments in partition_sched_domains().
- */
- if (doms == NULL)
- ndoms = 1;
- *domains = doms;
- *attributes = dattr;
- return ndoms;
- }
- /*
- * for the common functions, 'private' gives the type of file
- */
- struct cftype cpuset1_files[] = {
- {
- .name = "cpus",
- .seq_show = cpuset_common_seq_show,
- .write = cpuset_write_resmask,
- .max_write_len = (100U + 6 * NR_CPUS),
- .private = FILE_CPULIST,
- },
- {
- .name = "mems",
- .seq_show = cpuset_common_seq_show,
- .write = cpuset_write_resmask,
- .max_write_len = (100U + 6 * MAX_NUMNODES),
- .private = FILE_MEMLIST,
- },
- {
- .name = "effective_cpus",
- .seq_show = cpuset_common_seq_show,
- .private = FILE_EFFECTIVE_CPULIST,
- },
- {
- .name = "effective_mems",
- .seq_show = cpuset_common_seq_show,
- .private = FILE_EFFECTIVE_MEMLIST,
- },
- {
- .name = "cpu_exclusive",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_CPU_EXCLUSIVE,
- },
- {
- .name = "mem_exclusive",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEM_EXCLUSIVE,
- },
- {
- .name = "mem_hardwall",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEM_HARDWALL,
- },
- {
- .name = "sched_load_balance",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SCHED_LOAD_BALANCE,
- },
- {
- .name = "sched_relax_domain_level",
- .read_s64 = cpuset_read_s64,
- .write_s64 = cpuset_write_s64,
- .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
- },
- {
- .name = "memory_migrate",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_MIGRATE,
- },
- {
- .name = "memory_pressure",
- .read_u64 = cpuset_read_u64,
- .private = FILE_MEMORY_PRESSURE,
- },
- {
- .name = "memory_spread_page",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SPREAD_PAGE,
- },
- {
- /* obsolete, may be removed in the future */
- .name = "memory_spread_slab",
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_SPREAD_SLAB,
- },
- {
- .name = "memory_pressure_enabled",
- .flags = CFTYPE_ONLY_ON_ROOT,
- .read_u64 = cpuset_read_u64,
- .write_u64 = cpuset_write_u64,
- .private = FILE_MEMORY_PRESSURE_ENABLED,
- },
- { } /* terminate */
- };
|