Fix sched-domains partitioning by cpusets.  Walk the whole cpuset tree
whenever anything relevant changes (a cpuset's cpus_allowed, its
cpu_exclusive flag, or removal of a cpu_exclusive cpuset), and recreate
all partitions from the top down.

Index: linux-2.6/kernel/cpuset.c
===================================================================
--- linux-2.6.orig/kernel/cpuset.c	2006-10-19 19:26:54.000000000 +1000
+++ linux-2.6/kernel/cpuset.c	2006-10-19 20:21:29.000000000 +1000
@@ -751,6 +751,24 @@ static int validate_change(const struct
 	return 0;
 }
 
+static void update_cpu_domains_children(struct cpuset *par,
+					cpumask_t *non_partitioned)
+{
+	struct cpuset *c;
+
+	list_for_each_entry(c, &par->children, sibling) {
+		if (cpus_empty(c->cpus_allowed))
+			continue;
+		if (is_cpu_exclusive(c)) {
+			if (!partition_sched_domains(&c->cpus_allowed)) {
+				cpus_andnot(*non_partitioned,
+					*non_partitioned, c->cpus_allowed);
+			}
+		} else
+			update_cpu_domains_children(c, non_partitioned);
+	}
+}
+
 /*
  * For a given cpuset cur, partition the system as follows
  * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
@@ -760,53 +778,38 @@ static int validate_change(const struct
  * Build these two partitions by calling partition_sched_domains
  *
  * Call with manage_mutex held.  May nest a call to the
- * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
- * Must not be called holding callback_mutex, because we must
- * not call lock_cpu_hotplug() while holding callback_mutex.
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.  Must not be called holding
+ * callback_mutex, because we must not call lock_cpu_hotplug() while holding
+ * callback_mutex.
  */
-static void update_cpu_domains(struct cpuset *cur)
+static void update_cpu_domains(void)
 {
-	struct cpuset *c, *par = cur->parent;
-	cpumask_t pspan, cspan;
+	cpumask_t non_partitioned;
 
-	if (par == NULL || cpus_empty(cur->cpus_allowed))
-		return;
-
-	/*
-	 * Get all cpus from parent's cpus_allowed not part of exclusive
-	 * children
-	 */
-	pspan = par->cpus_allowed;
-	list_for_each_entry(c, &par->children, sibling) {
-		if (is_cpu_exclusive(c))
-			cpus_andnot(pspan, pspan, c->cpus_allowed);
-	}
-	if (!is_cpu_exclusive(cur)) {
-		cpus_or(pspan, pspan, cur->cpus_allowed);
-		if (cpus_equal(pspan, cur->cpus_allowed))
-			return;
-		cspan = CPU_MASK_NONE;
-	} else {
-		if (cpus_empty(pspan))
-			return;
-		cspan = cur->cpus_allowed;
-		/*
-		 * Get all cpus from current cpuset's cpus_allowed not part
-		 * of exclusive children
-		 */
-		list_for_each_entry(c, &cur->children, sibling) {
-			if (is_cpu_exclusive(c))
-				cpus_andnot(cspan, cspan, c->cpus_allowed);
-		}
-	}
+	BUG_ON(!mutex_is_locked(&manage_mutex));
 
 	lock_cpu_hotplug();
-	partition_sched_domains(&pspan, &cspan);
+	non_partitioned = top_cpuset.cpus_allowed;
+	update_cpu_domains_children(&top_cpuset, &non_partitioned);
+	partition_sched_domains(&non_partitioned);
 	unlock_cpu_hotplug();
 }
 
 /*
+ * Same as above except called with lock_cpu_hotplug and without manage_mutex.
+ */
+
+int cpuset_hotplug_update_sched_domains(void)
+{
+	cpumask_t non_partitioned;
+
+	non_partitioned = top_cpuset.cpus_allowed;
+	update_cpu_domains_children(&top_cpuset, &non_partitioned);
+	return partition_sched_domains(&non_partitioned);
+}
+
+/*
  * Call with manage_mutex held.  May take callback_mutex during call.
  */
@@ -833,8 +836,8 @@ static int update_cpumask(struct cpuset
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
-	if (is_cpu_exclusive(cs) && !cpus_unchanged)
-		update_cpu_domains(cs);
+	if (!cpus_unchanged)
+		update_cpu_domains();
 	return 0;
 }
 
@@ -1067,7 +1070,7 @@ static int update_flag(cpuset_flagbits_t
 	mutex_unlock(&callback_mutex);
 
 	if (cpu_exclusive_changed)
-		update_cpu_domains(cs);
+		update_cpu_domains();
 	return 0;
 }
 
@@ -1931,19 +1934,9 @@ static int cpuset_mkdir(struct inode *di
 	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
-/*
- * Locking note on the strange update_flag() call below:
- *
- * If the cpuset being removed is marked cpu_exclusive, then simulate
- * turning cpu_exclusive off, which will call update_cpu_domains().
- * The lock_cpu_hotplug() call in update_cpu_domains() must not be
- * made while holding callback_mutex.  Elsewhere the kernel nests
- * callback_mutex inside lock_cpu_hotplug() calls.  So the reverse
- * nesting would risk an ABBA deadlock.
- */
-
 static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
+	int is_exclusive;
 	struct cpuset *cs = dentry->d_fsdata;
 	struct dentry *d;
 	struct cpuset *parent;
@@ -1961,13 +1954,8 @@ static int cpuset_rmdir(struct inode *un
 		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
-	if (is_cpu_exclusive(cs)) {
-		int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
-		if (retval < 0) {
-			mutex_unlock(&manage_mutex);
-			return retval;
-		}
-	}
+	is_exclusive = is_cpu_exclusive(cs);
+
 	parent = cs->parent;
 	mutex_lock(&callback_mutex);
 	set_bit(CS_REMOVED, &cs->flags);
@@ -1982,8 +1970,13 @@ static int cpuset_rmdir(struct inode *un
 	mutex_unlock(&callback_mutex);
 	if (list_empty(&parent->children))
 		check_for_release(parent, &pathbuf);
+
+	if (is_exclusive)
+		update_cpu_domains();
+
 	mutex_unlock(&manage_mutex);
 	cpuset_release_agent(pathbuf);
+
 	return 0;
 }
 
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c	2006-10-19 19:24:48.000000000 +1000
+++ linux-2.6/kernel/sched.c	2006-10-19 20:21:50.000000000 +1000
@@ -6586,6 +6586,9 @@ error:
  */
 static int arch_init_sched_domains(const cpumask_t *cpu_map)
 {
+#ifdef CONFIG_CPUSETS
+	return cpuset_hotplug_update_sched_domains();
+#else
 	cpumask_t cpu_default_map;
 	int err;
 
@@ -6599,6 +6602,7 @@ static int arch_init_sched_domains(const
 	err = build_sched_domains(&cpu_default_map);
 
 	return err;
+#endif
 }
 
 static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
@@ -6622,29 +6626,26 @@ static void detach_destroy_domains(const
 
 /*
  * Partition sched domains as specified by the cpumasks below.
- * This attaches all cpus from the cpumasks to the NULL domain,
+ * This attaches all cpus from the partition to the NULL domain,
  * waits for a RCU quiescent period, recalculates sched
- * domain information and then attaches them back to the
- * correct sched domains
- * Call with hotplug lock held
+ * domain information and then attaches them back to their own
+ * isolated partition.
+ *
+ * Called with hotplug lock held
+ *
+ * Returns 0 on success.
  */
-int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
+int partition_sched_domains(cpumask_t *partition)
 {
+	cpumask_t non_isolated_cpus;
 	cpumask_t change_map;
-	int err = 0;
 
-	cpus_and(*partition1, *partition1, cpu_online_map);
-	cpus_and(*partition2, *partition2, cpu_online_map);
-	cpus_or(change_map, *partition1, *partition2);
+	cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+	cpus_and(change_map, *partition, non_isolated_cpus);
 
 	/* Detach sched domains from all of the affected cpus */
 	detach_destroy_domains(&change_map);
-	if (!cpus_empty(*partition1))
-		err = build_sched_domains(partition1);
-	if (!err && !cpus_empty(*partition2))
-		err = build_sched_domains(partition2);
-
-	return err;
+	return build_sched_domains(&change_map);
 }
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Index: linux-2.6/include/linux/sched.h
===================================================================
--- linux-2.6.orig/include/linux/sched.h	2006-10-19 20:02:24.000000000 +1000
+++ linux-2.6/include/linux/sched.h	2006-10-19 20:02:30.000000000 +1000
@@ -707,8 +707,7 @@ struct sched_domain {
 #endif
 };
 
-extern int partition_sched_domains(cpumask_t *partition1,
-				    cpumask_t *partition2);
+extern int partition_sched_domains(cpumask_t *partition);
 
 /*
  * Maximum cache size the migration-costs auto-tuning code will
Index: linux-2.6/include/linux/cpuset.h
===================================================================
--- linux-2.6.orig/include/linux/cpuset.h	2006-10-19 20:07:24.000000000 +1000
+++ linux-2.6/include/linux/cpuset.h	2006-10-19 20:21:08.000000000 +1000
@@ -14,6 +14,8 @@
 
 #ifdef CONFIG_CPUSETS
 
+extern int cpuset_hotplug_update_sched_domains(void);
+
 extern int number_of_cpusets;	/* How many cpusets are defined in system? */
 
 extern int cpuset_init_early(void);
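
For reviewers, below is a small self-contained userspace sketch (not part of
the patch; the ex_* names and the 8-bit mask are invented for illustration)
of the walk the new update_cpu_domains_children()/update_cpu_domains() pair
performs: every non-empty cpu_exclusive child becomes its own partition and
is removed from the "non partitioned" mask, non-exclusive children are
recursed into, and whatever is left over becomes one final default partition.

/*
 * Illustrative sketch only -- NOT part of the patch.  Models the cpuset
 * tree walk in userspace with a hypothetical 8-cpu bitmask.
 */
#include <stdio.h>

struct ex_cpuset {
	unsigned int cpus_allowed;	/* one bit per cpu, 8 cpus assumed */
	int cpu_exclusive;
	struct ex_cpuset *children[4];	/* NULL-terminated child list */
};

/* stands in for partition_sched_domains(): just report the partition */
static int ex_partition(unsigned int mask)
{
	printf("partition: 0x%02x\n", mask);
	return 0;	/* 0 == success, as in the patched kernel */
}

/*
 * Mirrors update_cpu_domains_children(): exclusive children get their
 * own partition and are cleared from *non_partitioned; non-exclusive
 * children are recursed into.
 */
static void ex_walk(struct ex_cpuset *par, unsigned int *non_partitioned)
{
	int i;

	for (i = 0; par->children[i]; i++) {
		struct ex_cpuset *c = par->children[i];

		if (!c->cpus_allowed)
			continue;
		if (c->cpu_exclusive) {
			if (!ex_partition(c->cpus_allowed))
				*non_partitioned &= ~c->cpus_allowed;
		} else
			ex_walk(c, non_partitioned);
	}
}

int main(void)
{
	/* two exclusive children covering cpus 0-1 and 2-3 */
	struct ex_cpuset a = { 0x03, 1, { NULL } };
	struct ex_cpuset b = { 0x0c, 1, { NULL } };
	/* top cpuset owns all 8 cpus */
	struct ex_cpuset top = { 0xff, 0, { &a, &b, NULL } };
	unsigned int non_partitioned = top.cpus_allowed;

	ex_walk(&top, &non_partitioned);
	/* whatever remains becomes the final, default partition */
	ex_partition(non_partitioned);
	return 0;
}

With cpus 0-1 and 2-3 in two exclusive children of the top cpuset, the sketch
prints partitions 0x03, 0x0c and 0xf0, which mirrors the top-down result that
update_cpu_domains()/cpuset_hotplug_update_sched_domains() would hand to
partition_sched_domains() in the patched kernel.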