diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 8703a8452c33..90ee0e4d8d7e 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -71,7 +71,7 @@ DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
 
 /*
  * There could be abnormal cpuset configurations for cpu or memory
- * node binding, add this key to provide a quick low-cost judgement
+ * node binding, add this key to provide a quick low-cost judgment
  * of the situation.
  */
 DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
 
@@ -93,16 +93,20 @@ enum prs_errcode {
 	PERR_INVCPUS,
 	PERR_INVPARENT,
 	PERR_NOTPART,
+	PERR_NOTEXCL,
 	PERR_NOCPUS,
 	PERR_HOTPLUG,
+	PERR_CPUSEMPTY,
 };
 
 static const char * const perr_strings[] = {
-	[PERR_INVCPUS]   = "Invalid change to cpuset.cpus",
+	[PERR_INVCPUS]   = "Invalid cpu list in cpuset.cpus",
 	[PERR_INVPARENT] = "Parent is an invalid partition root",
 	[PERR_NOTPART]   = "Parent is not a partition root",
+	[PERR_NOTEXCL]   = "Cpu list in cpuset.cpus not exclusive",
 	[PERR_NOCPUS]    = "Parent unable to distribute cpu downstream",
 	[PERR_HOTPLUG]   = "No cpu available due to hotplug",
+	[PERR_CPUSEMPTY] = "cpuset.cpus is empty",
 };
 
 struct cpuset {
@@ -198,23 +202,22 @@ struct cpuset {
 
 /*
  * Partition root states:
  *
- *   0 - not a partition root
- *
+ *   0 - member (not a partition root)
  *   1 - partition root
- *
  *   2 - partition root without load balancing (isolated)
- *
  *  -1 - invalid partition root
- *       None of the cpus in cpus_allowed can be put into the parent's
- *       subparts_cpus. In this case, the cpuset is not a real partition
- *       root anymore. However, the CPU_EXCLUSIVE bit will still be set
- *       and the cpuset can be restored back to a partition root if the
- *       parent cpuset can give more CPUs back to this child cpuset.
+ *  -2 - invalid isolated partition root
  */
-#define PRS_DISABLED		0
-#define PRS_ENABLED		1
+#define PRS_MEMBER		0
+#define PRS_ROOT		1
 #define PRS_ISOLATED		2
-#define PRS_ERROR		-1
+#define PRS_INVALID_ROOT	-1
+#define PRS_INVALID_ISOLATED	-2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+	return prs_state < 0;
+}
 
 /*
  * Temporary cpumasks for working with partitions that are passed among
@@ -294,30 +297,40 @@ static inline int is_spread_slab(const struct cpuset *cs)
 	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
 {
 	return cs->partition_root_state > 0;
 }
 
+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+	return cs->partition_root_state < 0;
+}
+
+static inline void set_partition_invalid(struct cpuset *cs)
+{
+	if (is_partition_valid(cs))
+		cs->partition_root_state = -cs->partition_root_state;
+}
+
 /*
  * Send notification event of whenever partition_root_state changes.
  */
-static inline void notify_partition_change(struct cpuset *cs,
-					   int old_prs, int new_prs)
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
 {
-	if (old_prs == new_prs)
+	if (old_prs == cs->partition_root_state)
 		return;
 	cgroup_file_notify(&cs->partition_file);
 
 	/* Reset prs_err if not invalid */
-	if (new_prs != PRS_ERROR)
+	if (is_partition_valid(cs))
 		WRITE_ONCE(cs->prs_err, PERR_NONE);
 }
 
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
 		  (1 << CS_MEM_EXCLUSIVE)),
-	.partition_root_state = PRS_ENABLED,
+	.partition_root_state = PRS_ROOT,
};
 
 /**
@@ -459,7 +472,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
 	cpuset_for_each_child(child, css, cs) {
 		if (child == excluded_child)
 			continue;
-		if (is_partition_root(child))
+		if (is_partition_valid(child))
 			continue;
 		if (cgroup_is_populated(child->css.cgroup)) {
 			rcu_read_unlock();
@@ -656,6 +669,35 @@ static inline void free_cpuset(struct cpuset *cs)
 	kfree(cs);
 }
 
+/*
+ * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ *			      behavior.
+ */
+static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+{
+	struct cgroup_subsys_state *css;
+	struct cpuset *c, *par;
+	int ret;
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	/* Each of our child cpusets must be a subset of us */
+	ret = -EBUSY;
+	cpuset_for_each_child(c, css, cur)
+		if (!is_cpuset_subset(c, trial))
+			goto out;
+
+	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
+	ret = -EACCES;
+	par = parent_cs(cur);
+	if (par && !is_cpuset_subset(trial, par))
+		goto out;
+
+	ret = 0;
+out:
+	return ret;
+}
+
 /*
  * validate_change() - Used to validate that any proposed cpuset change
  *		       follows the structural rules for cpusets.
@@ -680,20 +722,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 {
 	struct cgroup_subsys_state *css;
 	struct cpuset *c, *par;
-	int ret;
-
-	/* The checks don't apply to root cpuset */
-	if (cur == &top_cpuset)
-		return 0;
+	int ret = 0;
 
 	rcu_read_lock();
-	par = parent_cs(cur);
 
-	/* On legacy hierarchy, we must be a subset of our parent cpuset. */
-	ret = -EACCES;
-	if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
+	if (!is_in_v2_mode())
+		ret = validate_change_legacy(cur, trial);
+	if (ret)
+		goto out;
+
+	/* Remaining checks don't apply to root cpuset */
+	if (cur == &top_cpuset)
 		goto out;
 
+	par = parent_cs(cur);
+
 	/*
 	 * If either I or some sibling (!= me) is exclusive, we can't
 	 * overlap
@@ -869,7 +912,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		cpumask_and(doms[0], top_cpuset.effective_cpus,
-			    housekeeping_cpumask(HK_FLAG_DOMAIN));
+			    housekeeping_cpumask(HK_TYPE_DOMAIN));
 
 		goto done;
 	}
@@ -899,7 +942,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    !(is_sched_load_balance(cp) &&
 		      cpumask_intersects(cp->cpus_allowed,
-					 housekeeping_cpumask(HK_FLAG_DOMAIN))))
+					 housekeeping_cpumask(HK_TYPE_DOMAIN))))
 			continue;
 
 		if (root_load_balance &&
@@ -911,7 +954,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 			csa[csn++] = cp;
 
 		/* skip @cp's subtree if not a partition root */
-		if (!is_partition_root(cp))
+		if (!is_partition_valid(cp))
 			pos_css = css_rightmost_descendant(pos_css);
 	}
 	rcu_read_unlock();
@@ -988,7 +1031,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
 
 			if (apn == b->pn) {
 				cpumask_or(dp, dp, b->effective_cpus);
-				cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+				cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
 				if (dattr)
 					update_domain_attr_tree(dattr + nslot, b);
 
@@ -1117,7 +1160,7 @@ static void rebuild_sched_domains_locked(void)
 	if (top_cpuset.nr_subparts_cpus) {
 		rcu_read_lock();
 		cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
-			if (!is_partition_root(cs)) {
+			if (!is_partition_valid(cs)) {
 				pos_css = css_rightmost_descendant(pos_css);
 				continue;
 			}
@@ -1164,6 +1207,14 @@ static void update_tasks_cpumask(struct cpuset *cs)
 	struct css_task_iter it;
 	struct task_struct *task;
 
+	/*
+	 * TODO: With cpuset partition that takes CPUs away from the top
+	 * cpuset, we may want to properly adjust the cpus_allowed mask of
+	 * tasks in the top cpuset as well.
+	 */
+	if (cs == &top_cpuset)
+		return;
+
 	css_task_iter_start(&cs->css, 0, &it);
 	while ((task = css_task_iter_next(&it)))
 		set_cpus_allowed_ptr(task, cs->effective_cpus);
@@ -1203,13 +1254,15 @@ enum subparts_cmd {
 	partcmd_update,		/* Update parent's subparts_cpus */
 };
 
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+		       int turning_on);
 /**
  * update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
  * @cpuset:  The cpuset that requests change in partition root state
  * @cmd:     Partition root state change command
  * @newmask: Optional new cpumask for partcmd_update
  * @tmp:     Temporary addmask and delmask
- * Return:   0, 1 or an error code
+ * Return:   0 or a partition root state error code
 *
 * For partcmd_enable, the cpuset is being transformed from a non-partition
 * root to a partition root. The cpus_allowed mask of the given cpuset will
@@ -1217,43 +1270,32 @@ enum subparts_cmd {
 * effective_cpus. The function will return 0 if all the CPUs listed in
 * cpus_allowed can be granted or an error code will be returned.
 *
- * For partcmd_disable, the cpuset is being transofrmed from a partition
+ * For partcmd_disable, the cpuset is being transformed from a partition
 * root back to a non-partition root. Any CPUs in cpus_allowed that are in
 * parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
+ * into parent's effective_cpus. 0 will always be returned.
 *
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will only be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
 *
 * The partcmd_enable and partcmd_disable commands are used by
 * update_prstate(). The partcmd_update command is used by
 * update_cpumasks_hier() with newmask NULL and update_cpumask() with
 * newmask set.
- *
- * The checking is more strict when enabling partition root than the
- * other two commands.
- *
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
 */
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
 					   struct cpumask *newmask,
 					   struct tmpmasks *tmp)
 {
-	struct cpuset *parent = parent_cs(cpuset);
+	struct cpuset *parent = parent_cs(cs);
 	int adding;	/* Moving cpus from effective_cpus to subparts_cpus */
 	int deleting;	/* Moving cpus from subparts_cpus to effective_cpus */
 	int old_prs, new_prs;
-	bool part_error = false;	/* Partition error? */
+	int part_error = PERR_NONE;	/* Partition error? */
 
 	percpu_rwsem_assert_held(&cpuset_rwsem);
 
@@ -1262,39 +1304,41 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 	 * The new cpumask, if present, or the current cpus_allowed must
 	 * not be empty.
 	 */
-	if (!is_partition_root(parent) ||
-	    (newmask && cpumask_empty(newmask)) ||
-	    (!newmask && cpumask_empty(cpuset->cpus_allowed)))
-		return -EINVAL;
-
-	/*
-	 * Enabling partition root is not allowed if there are online children.
-	 */
-	if ((cmd == partcmd_enable) && css_has_online_children(&cpuset->css))
-		return -EBUSY;
+	if (!is_partition_valid(parent)) {
+		return is_partition_invalid(parent)
+		       ? PERR_INVPARENT : PERR_NOTPART;
+	}
+	if ((newmask && cpumask_empty(newmask)) ||
+	    (!newmask && cpumask_empty(cs->cpus_allowed)))
+		return PERR_CPUSEMPTY;
 
 	adding = deleting = false;
-	old_prs = new_prs = cpuset->partition_root_state;
+	old_prs = new_prs = cs->partition_root_state;
 	if (cmd == partcmd_enable) {
 		/*
 		 * Enabling partition root is not allowed if cpus_allowed
 		 * doesn't overlap parent's cpus_allowed.
		 */
-		if (!cpumask_intersects(cpuset->cpus_allowed, parent->cpus_allowed))
-			return -EINVAL;
+		if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+			return PERR_INVCPUS;
 
 		/*
 		 * A parent can be left with no CPU as long as there is no
 		 * task directly associated with the parent partition.
 		 */
-		if (partition_is_populated(parent, cpuset) &&
-		    cpumask_subset(parent->effective_cpus, cpuset->cpus_allowed))
-			return -EINVAL;
+		if (partition_is_populated(parent, cs) &&
+		    !cpumask_intersects(cs->cpus_allowed, parent->effective_cpus))
+			return PERR_NOCPUS;
 
-		cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+		cpumask_copy(tmp->addmask, cs->cpus_allowed);
 		adding = true;
 	} else if (cmd == partcmd_disable) {
-		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+		/*
+		 * Need to remove cpus from parent's subparts_cpus for valid
+		 * partition root.
+		 */
+		deleting = !is_prs_invalid(old_prs) &&
+			   cpumask_and(tmp->delmask, cs->cpus_allowed,
 				       parent->subparts_cpus);
 	} else if (newmask) {
 		/*
@@ -1306,7 +1350,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 		 * addmask = newmask & parent->cpus_allowed
 		 *	     & ~parent->subparts_cpus
 		 */
-		cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+		cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
 		deleting = cpumask_and(tmp->delmask, tmp->delmask,
 				       parent->subparts_cpus);
 
@@ -1317,77 +1361,85 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 		 * Make partition invalid if parent's effective_cpus could
 		 * become empty and there are tasks in the parent.
 		 */
-		part_error = partition_is_populated(parent, cpuset) &&
-			     cpumask_subset(parent->effective_cpus, tmp->addmask) &&
-			     !cpumask_intersects(tmp->delmask, cpu_active_mask);
-
-		if ((READ_ONCE(cpuset->prs_err) == PERR_NONE) && part_error)
-			WRITE_ONCE(cpuset->prs_err, PERR_INVCPUS);
+		if (adding && partition_is_populated(parent, cs) &&
+		    cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+		    !cpumask_intersects(tmp->delmask, cpu_active_mask)) {
+			part_error = PERR_NOCPUS;
+			adding = false;
+			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+					       parent->subparts_cpus);
+		}
 	} else {
 		/*
 		 * partcmd_update w/o newmask:
 		 *
-		 * addmask = cpus_allowed & parent->effective_cpus
-		 *
-		 * This gets invoked either due to a hotplug event or
-		 * from update_cpumasks_hier() where we can't return an
-		 * error. This can cause a partition root to become invalid
-		 * in the case of a hotplug.
+		 * delmask = cpus_allowed & parent->subparts_cpus
+		 * addmask = cpus_allowed & parent->cpus_allowed
+		 *	     & ~parent->subparts_cpus
 		 *
+		 * This gets invoked either due to a hotplug event or from
+		 * update_cpumasks_hier(). This can cause the state of a
+		 * partition root to transition from valid to invalid or vice
+		 * versa. So we still need to compute the addmask and delmask.
+		 * A partition error happens when:
 		 * 1) Cpuset is valid partition, but parent does not distribute
 		 *    out any CPUs.
 		 * 2) Parent has tasks and all its effective CPUs will have
 		 *    to be distributed out.
		 */
-		adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
-				     parent->effective_cpus);
-		part_error = (is_partition_root(cpuset) &&
-			      !parent->nr_subparts_cpus) ||
-			     (cpumask_equal(parent->effective_cpus, tmp->addmask) &&
-			      partition_is_populated(parent, cpuset));
+		cpumask_and(tmp->addmask, cs->cpus_allowed,
+			    parent->cpus_allowed);
+		adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+					parent->subparts_cpus);
+		if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
+		    (adding &&
+		     cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+		     partition_is_populated(parent, cs))) {
+			part_error = PERR_NOCPUS;
+			adding = false;
+		}
 
-		if (is_partition_root(cpuset) && part_error)
-			WRITE_ONCE(cpuset->prs_err, PERR_NOCPUS);
+		if (part_error && is_partition_valid(cs) &&
+		    parent->nr_subparts_cpus)
+			deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+					       parent->subparts_cpus);
 	}
+	if (part_error)
+		WRITE_ONCE(cs->prs_err, part_error);
 
 	if (cmd == partcmd_update) {
 		/*
-		 * Check for possible transition between PRS_ERROR and
-		 * PRS_ENABLED/PRS_ISOLATED.
+		 * Check for possible transition between valid and invalid
+		 * partition root.
 		 */
-		switch (cpuset->partition_root_state) {
-		case PRS_ENABLED:
+		switch (cs->partition_root_state) {
+		case PRS_ROOT:
 		case PRS_ISOLATED:
 			if (part_error)
-				new_prs = PRS_ERROR;
+				new_prs = -old_prs;
 			break;
-		case PRS_ERROR:
-			if (part_error)
-				break;
-			if (is_sched_load_balance(cpuset))
-				new_prs = PRS_ENABLED;
-			else
-				new_prs = PRS_ISOLATED;
+		case PRS_INVALID_ROOT:
+		case PRS_INVALID_ISOLATED:
+			if (!part_error)
+				new_prs = -old_prs;
 			break;
 		}
 	}
 
-	if ((old_prs == PRS_ERROR) && (new_prs == PRS_ERROR))
-		return 0;	/* Nothing need to be done */
-
-	if (new_prs == PRS_ERROR) {
-		/*
-		 * Remove all its cpus from parent's subparts_cpus.
-		 */
-		adding = false;
-		deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
-				       parent->subparts_cpus);
-	}
-
 	if (!adding && !deleting && (new_prs == old_prs))
 		return 0;
 
+	/*
+	 * Transitioning from invalid to valid (partcmd_update) may require
+	 * setting CS_CPU_EXCLUSIVE and clearing CS_SCHED_LOAD_BALANCE later.
+	 */
+	if ((old_prs != new_prs) && is_prs_invalid(old_prs)) {
+		if (!is_cpu_exclusive(cs) &&
+		    (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
+			return PERR_NOTEXCL;
+	}
+
 	/*
 	 * Change the parent's subparts_cpus.
 	 * Newly added CPUs will be removed from effective_cpus and
@@ -1414,12 +1466,25 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
 		parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
 
 	if (old_prs != new_prs)
-		cpuset->partition_root_state = new_prs;
+		cs->partition_root_state = new_prs;
 
 	spin_unlock_irq(&callback_lock);
-	notify_partition_change(cpuset, old_prs, new_prs);
 
-	return cmd == partcmd_update;
+	if (adding || deleting)
+		update_tasks_cpumask(parent);
+
+	/*
+	 * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
+	 * rebuild_sched_domains_locked() may be called.
+	 */
+	if ((old_prs != new_prs) && (cmd == partcmd_update)) {
+		if (old_prs == PRS_ISOLATED)
+			update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+		else if (new_prs == PRS_ISOLATED)
+			update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+	}
+
+	notify_partition_change(cs, old_prs);
+	return 0;
 }
 
 /*
@@ -1457,7 +1522,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		 * out all its CPUs.
		 */
 		if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
-			if (is_partition_root(cp) &&
+			if (is_partition_valid(cp) &&
 			    cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
 				goto update_parent_subparts;
 
@@ -1474,7 +1539,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 
 		/*
 		 * Skip the whole subtree if the cpumask remains the same
-		 * with no partition root state and force flag not set.
+		 * and has no partition root state and force flag not set.
 		 */
 		if (!cp->partition_root_state && !force &&
 		    cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
@@ -1492,21 +1557,21 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		old_prs = new_prs = cp->partition_root_state;
 		if ((cp != cs) && old_prs) {
 			switch (parent->partition_root_state) {
-			case PRS_ENABLED:
+			case PRS_ROOT:
 			case PRS_ISOLATED:
 				update_parent = true;
 				break;
 
-			case PRS_DISABLED:
-			case PRS_ERROR:
+			default:
 				/*
 				 * When parent is not a partition root or is
 				 * invalid, child partition roots become
 				 * invalid too.
 				 */
-				new_prs = PRS_ERROR;
+				if (is_partition_valid(cp))
+					new_prs = -cp->partition_root_state;
 				WRITE_ONCE(cp->prs_err,
-					   (parent->partition_root_state == PRS_ERROR)
+					   is_partition_invalid(parent)
 					   ? PERR_INVPARENT : PERR_NOTPART);
 				break;
 			}
@@ -1517,25 +1582,25 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		rcu_read_unlock();
 
 		if (update_parent) {
-			if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
-				update_tasks_cpumask(parent);
+			update_parent_subparts_cpumask(cp, partcmd_update, NULL,
+						       tmp);
 			/*
-			 * The cpuset partition_root_state may be changed
-			 * to PRS_ERROR. Capture it.
+			 * The cpuset partition_root_state may become
+			 * invalid. Capture it.
 			 */
 			new_prs = cp->partition_root_state;
 		}
 
 		spin_lock_irq(&callback_lock);
 
-		if (cp->nr_subparts_cpus && (new_prs <= 0)) {
+		if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
 			/*
 			 * Put all active subparts_cpus back to effective_cpus.
 			 */
 			cpumask_or(tmp->new_cpus, tmp->new_cpus,
 				   cp->subparts_cpus);
 			cpumask_and(tmp->new_cpus, tmp->new_cpus,
-				   cpu_active_mask);
+				    cpu_active_mask);
 			cp->nr_subparts_cpus = 0;
 			cpumask_clear(cp->subparts_cpus);
 		}
@@ -1552,7 +1617,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 			cp->partition_root_state = new_prs;
 
 		spin_unlock_irq(&callback_lock);
-		notify_partition_change(cp, old_prs, new_prs);
+
+		notify_partition_change(cp, old_prs);
 
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1568,7 +1634,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 		if (!cpumask_empty(cp->cpus_allowed) &&
 		    is_sched_load_balance(cp) &&
 		    (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
-		     is_partition_root(cp)))
+		     is_partition_valid(cp)))
 			need_rebuild_sched_domains = true;
 
 		rcu_read_lock();
@@ -1592,10 +1658,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 	struct cpuset *sibling;
 	struct cgroup_subsys_state *pos_css;
 
+	percpu_rwsem_assert_held(&cpuset_rwsem);
+
 	/*
 	 * Check all its siblings and call update_cpumasks_hier()
 	 * if their use_parent_ecpus flag is set in order for them
 	 * to use the right effective_cpus value.
+	 *
+	 * The update_cpumasks_hier() function may sleep. So we have to
+	 * release the RCU read lock before calling it.
	 */
 	rcu_read_lock();
 	cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1603,8 +1674,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
 			continue;
 		if (!sibling->use_parent_ecpus)
 			continue;
+		if (!css_tryget_online(&sibling->css))
+			continue;
 
+		rcu_read_unlock();
 		update_cpumasks_hier(sibling, tmp, false);
+		rcu_read_lock();
+		css_put(&sibling->css);
 	}
 	rcu_read_unlock();
 }
@@ -1662,27 +1738,35 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 #endif
 
 	if (cs->partition_root_state) {
-		/* Cpumask of a partition root cannot be empty */
-		if (cpumask_empty(trialcs->cpus_allowed))
-			return -EINVAL;
-		if (update_parent_subparts_cpumask(cs, partcmd_update,
-					trialcs->cpus_allowed, &tmp) < 0)
-			return -EINVAL;
+		update_parent_subparts_cpumask(cs, partcmd_update,
+					       trialcs->cpus_allowed, &tmp);
 	}
 
+	compute_effective_cpumask(trialcs->effective_cpus, trialcs,
+				  parent_cs(cs));
 	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
 
 	/*
-	 * Make sure that subparts_cpus is a subset of cpus_allowed.
+	 * Make sure that subparts_cpus, if not empty, is a subset of
+	 * cpus_allowed. Clear subparts_cpus if there is an error or
+	 * empty effective cpus with tasks.
 	 */
 	if (cs->nr_subparts_cpus) {
-		cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
-			    cs->cpus_allowed);
-		cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+		if (cs->prs_err ||
+		    (partition_is_populated(cs, NULL) &&
+		     cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus))) {
+			cs->nr_subparts_cpus = 0;
+			cpumask_clear(cs->subparts_cpus);
+		} else {
+			cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
+				    cs->cpus_allowed);
+			cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+		}
 	}
 	spin_unlock_irq(&callback_lock);
 
+	/* effective_cpus will be updated here */
 	update_cpumasks_hier(cs, &tmp, false);
 
 	if (cs->partition_root_state) {
@@ -2059,16 +2143,17 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	return err;
 }
 
-/*
- * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * new_prs: new partition root state
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
 *
 * Call with cpuset_rwsem held.
 */
 static int update_prstate(struct cpuset *cs, int new_prs)
 {
-	int err, old_prs = cs->partition_root_state;
+	int err = PERR_NONE, old_prs = cs->partition_root_state;
 	bool sched_domain_rebuilt = false;
 	struct cpuset *parent = parent_cs(cs);
 	struct tmpmasks tmpmask;
@@ -2077,28 +2162,33 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		return 0;
 
 	/*
-	 * Cannot force a partial or invalid partition root to a full
-	 * partition root.
+	 * For a previously invalid partition root, leave it at being
+	 * invalid if new_prs is not "member".
 	 */
-	if (new_prs && (old_prs == PRS_ERROR))
-		return -EINVAL;
+	if (new_prs && is_prs_invalid(old_prs)) {
+		cs->partition_root_state = -new_prs;
+		return 0;
+	}
 
 	if (alloc_cpumasks(NULL, &tmpmask))
 		return -ENOMEM;
 
-	err = -EINVAL;
 	if (!old_prs) {
 		/*
 		 * Turning on partition root requires setting the
 		 * CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
-		 * cannot be NULL.
+		 * cannot be empty.
		 */
-		if (cpumask_empty(cs->cpus_allowed))
+		if (cpumask_empty(cs->cpus_allowed)) {
+			err = PERR_CPUSEMPTY;
 			goto out;
+		}
 
 		err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
-		if (err)
+		if (err) {
+			err = PERR_NOTEXCL;
 			goto out;
+		}
 
 		err = update_parent_subparts_cpumask(cs, partcmd_enable,
 						     NULL, &tmpmask);
@@ -2120,16 +2210,16 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		 * A change in load balance state only, no change in cpumasks.
 		 */
 		update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
-		err = 0;
+		sched_domain_rebuilt = true;
 		goto out;	/* Sched domain is rebuilt in update_flag() */
 	} else {
 		/*
-		 * Switch back to member is always allowed even if it
+		 * Switching back to member is always allowed even if it
 		 * disables child partitions.
 		 */
-		err = 0;
 		update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
 					       &tmpmask);
+
 		/*
 		 * If there are child partitions, they will all become invalid.
 		 */
@@ -2151,12 +2241,7 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 		}
 	}
 
-	/*
-	 * Update cpumask of parent's tasks except when it is the top
-	 * cpuset as some system daemons cannot be mapped to other CPUs.
-	 */
-	if (parent != &top_cpuset)
-		update_tasks_cpumask(parent);
+	update_tasks_cpumask(parent);
 
 	if (parent->child_ecpus_count)
 		update_sibling_cpumasks(parent, cs, &tmpmask);
@@ -2164,20 +2249,24 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 	if (!sched_domain_rebuilt)
 		rebuild_sched_domains_locked();
out:
-	if (!err) {
-		spin_lock_irq(&callback_lock);
-		cs->partition_root_state = new_prs;
-		spin_unlock_irq(&callback_lock);
-		/*
-		 * Update child cpusets when disabling partition.
-		 */
-		if (new_prs == PRS_DISABLED && !list_empty(&cs->css.children))
-			update_cpumasks_hier(cs, &tmpmask, true);
-		notify_partition_change(cs, old_prs, new_prs);
-	}
+	/*
+	 * Make partition invalid if an error happens
+	 */
+	if (err)
+		new_prs = -new_prs;
+	spin_lock_irq(&callback_lock);
+	cs->partition_root_state = new_prs;
+	spin_unlock_irq(&callback_lock);
+
+	/*
+	 * Update child cpusets, if present.
+	 * Force update if switching back to member.
+	 */
+	if (!list_empty(&cs->css.children))
+		update_cpumasks_hier(cs, &tmpmask, !new_prs);
+
+	notify_partition_change(cs, old_prs);
 	free_cpumasks(NULL, &tmpmask);
-	return err;
+	return 0;
 }
 
 /*
@@ -2361,6 +2450,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 	cgroup_taskset_first(tset, &css);
 	cs = css_cs(css);
 
+	cpus_read_lock();
 	percpu_down_write(&cpuset_rwsem);
 
 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
@@ -2414,6 +2504,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
 		wake_up(&cpuset_attach_wq);
 
 	percpu_up_write(&cpuset_rwsem);
+	cpus_read_unlock();
 }
 
 /* The various types of files and directories in a cpuset file system */
@@ -2669,26 +2760,32 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
 static int sched_partition_show(struct seq_file *seq, void *v)
 {
 	struct cpuset *cs = css_cs(seq_css(seq));
-	const char *err, *type;
+	const char *err, *type = NULL;
 
 	switch (cs->partition_root_state) {
-	case PRS_ENABLED:
+	case PRS_ROOT:
 		seq_puts(seq, "root\n");
 		break;
 	case PRS_ISOLATED:
 		seq_puts(seq, "isolated\n");
 		break;
-	case PRS_DISABLED:
+	case PRS_MEMBER:
 		seq_puts(seq, "member\n");
 		break;
-	case PRS_ERROR:
-		type = is_sched_load_balance(cs) ? "root" : "isolated";
"root" : "isolated"; + case PRS_INVALID_ROOT: + type = "root"; + fallthrough; + case PRS_INVALID_ISOLATED: + if (!type) + type = "isolated"; err = perr_strings[READ_ONCE(cs->prs_err)]; if (err) seq_printf(seq, "%s invalid (%s)\n", type, err); else seq_printf(seq, "%s invalid\n", type); break; + seq_puts(seq, "isolated invalid\n"); + break; } return 0; } @@ -2706,9 +2803,9 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf, * Convert "root" to ENABLED, and convert "member" to DISABLED. */ if (!strcmp(buf, "root")) - val = PRS_ENABLED; + val = PRS_ROOT; else if (!strcmp(buf, "member")) - val = PRS_DISABLED; + val = PRS_MEMBER; else if (!strcmp(buf, "isolated")) val = PRS_ISOLATED; else @@ -2960,7 +3057,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) /* * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * set. This flag handling is implemented in cgroup core for - * histrical reasons - the flag may be specified during mount. + * historical reasons - the flag may be specified during mount. * * Currently, if any sibling cpusets have exclusive cpus or mem, we * refuse to clone the configuration - thereby refusing the task to @@ -3009,7 +3106,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) cpus_read_lock(); percpu_down_write(&cpuset_rwsem); - if (is_partition_root(cs)) + if (is_partition_valid(cs)) update_prstate(cs, 0); if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) && @@ -3157,7 +3254,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs, /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, - * as the tasks will be migratecd to an ancestor. + * as the tasks will be migrated to an ancestor. */ if (cpus_updated && !cpumask_empty(cs->cpus_allowed)) update_tasks_cpumask(cs); @@ -3186,7 +3283,7 @@ hotplug_update_tasks(struct cpuset *cs, bool cpus_updated, bool mems_updated) { /* A partition root is allowed to have empty effective cpus */ - if (cpumask_empty(new_cpus) && !is_partition_root(cs)) + if (cpumask_empty(new_cpus) && !is_partition_valid(cs)) cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus); if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; @@ -3259,8 +3356,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) * partitions, if present, by setting nr_subparts_cpus to 0 to * reclaim their cpus. */ - if (is_partition_root(cs) && cpumask_empty(&new_cpus) && - cs->nr_subparts_cpus && partition_is_populated(cs, NULL)) { + if (cs->nr_subparts_cpus && is_partition_valid(cs) && + cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) { spin_lock_irq(&callback_lock); cs->nr_subparts_cpus = 0; cpumask_clear(cs->subparts_cpus); @@ -3271,16 +3368,15 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) /* * Force the partition to become invalid if either one of * the following conditions hold: - * 1) empty effective cpus with tasks in partition - * 2) parent is invalid or doesn't grant any cpus to child partitions. + * 1) empty effective cpus but not valid empty partition. + * 2) parent is invalid or doesn't grant any cpus to child + * partitions. 
	 */
-	if (is_partition_root(cs) &&
-	    ((cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) ||
-	     !parent->nr_subparts_cpus)) {
+	if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
+	   (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
 		int old_prs, parent_prs;
 
-		update_parent_subparts_cpumask(cs, partcmd_disable,
-					       NULL, tmp);
+		update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
 		if (cs->nr_subparts_cpus) {
 			spin_lock_irq(&callback_lock);
 			cs->nr_subparts_cpus = 0;
 			cpumask_clear(cs->subparts_cpus);
@@ -3291,29 +3387,30 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
 
 		old_prs = cs->partition_root_state;
 		parent_prs = parent->partition_root_state;
-		if (old_prs != PRS_ERROR) {
+		if (is_partition_valid(cs)) {
 			spin_lock_irq(&callback_lock);
-			cs->partition_root_state = PRS_ERROR;
+			set_partition_invalid(cs);
 			spin_unlock_irq(&callback_lock);
-			if (parent_prs == PRS_ERROR)
+			if (is_prs_invalid(parent_prs))
 				WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
 			else if (!parent_prs)
 				WRITE_ONCE(cs->prs_err, PERR_NOTPART);
 			else
 				WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
-			notify_partition_change(cs, old_prs, PRS_ERROR);
+			notify_partition_change(cs, old_prs);
 		}
 		cpuset_force_rebuild();
 	}
 
 	/*
-	 * On the other hand, an erroneous partition root may be transitioned
+	 * On the other hand, an invalid partition root may be transitioned
	 * back to a regular one.
 	 */
-	else if (is_partition_root(parent) &&
-		 (cs->partition_root_state == PRS_ERROR) &&
-		 update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
-		cpuset_force_rebuild();
+	else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
+		update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
+		if (is_partition_valid(cs))
+			cpuset_force_rebuild();
+	}
 
update_tasks:
 	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
@@ -3619,8 +3716,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 	return cs;
 }
 
-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
+/*
+ * __cpuset_node_allowed - Can we allocate on a memory node?
 * @node:	is this an allowed node?
 * @gfp_mask:	memory allocation flags
 *
@@ -3662,7 +3759,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;	/* current cpuset ancestors */
-	int allowed;		/* is allocation in zone z allowed? */
+	bool allowed;		/* is allocation in zone z allowed? */
 	unsigned long flags;
 
 	if (in_interrupt())
@@ -3791,8 +3888,8 @@ void cpuset_print_current_mems_allowed(void)
 
 int cpuset_memory_pressure_enabled __read_mostly;
 
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
@@ -3807,7 +3904,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
 *   "memory_pressure".  Value displayed is an integer
 *   representing the recent rate of entry into the synchronous
 *   (direct) page reclaim by any task attached to the cpuset.
- **/
+ */
 
 void __cpuset_memory_pressure_bump(void)
 {
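
A note on the state encoding the patch introduces: an invalid partition state is the arithmetic negation of its valid counterpart (PRS_ROOT 1 / PRS_INVALID_ROOT -1, PRS_ISOLATED 2 / PRS_INVALID_ISOLATED -2), so "new_prs = -old_prs" flips validity in either direction and "prs_state < 0" tests for it. Below is a minimal stand-alone user-space sketch of that encoding; the PRS_* values and the is_prs_invalid() predicate are copied from the hunks above, while the main() harness is illustrative only and not part of the patch or the kernel.

	#include <stdio.h>
	#include <stdbool.h>

	/* Values copied from the patch. */
	#define PRS_MEMBER		0
	#define PRS_ROOT		1
	#define PRS_ISOLATED		2
	#define PRS_INVALID_ROOT	-1
	#define PRS_INVALID_ISOLATED	-2

	/* Same predicate as is_prs_invalid() in the patch. */
	static bool is_prs_invalid(int prs_state)
	{
		return prs_state < 0;
	}

	int main(void)
	{
		int prs = PRS_ISOLATED;

		prs = -prs;	/* invalidate: becomes PRS_INVALID_ISOLATED (-2) */
		printf("invalid: %d\n", is_prs_invalid(prs));	/* prints 1 */
		prs = -prs;	/* revalidate: back to PRS_ISOLATED (2) */
		printf("invalid: %d\n", is_prs_invalid(prs));	/* prints 0 */
		return 0;
	}

Note that PRS_MEMBER (0) negates to itself, which matches the patch's behavior of a plain member having no invalid variant.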