lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20241110025023.664487-3-longman@redhat.com>
Date: Sat,  9 Nov 2024 21:50:22 -0500
From: Waiman Long <longman@...hat.com>
To: Tejun Heo <tj@...nel.org>,
	Johannes Weiner <hannes@...xchg.org>,
	Michal Koutný <mkoutny@...e.com>
Cc: cgroups@...r.kernel.org,
	linux-kernel@...r.kernel.org,
	Juri Lelli <juri.lelli@...hat.com>,
	Waiman Long <longman@...hat.com>
Subject: [PATCH 2/3] cgroup/cpuset: Enforce at most one rebuild_sched_domains_locked() call per operation

Since commit ff0ce721ec21 ("cgroup/cpuset: Eliminate unncessary
sched domains rebuilds in hotplug"), there is only one
rebuild_sched_domains_locked() call per hotplug operation. However,
writing to the various cpuset control files may still casue more than
one rebuild_sched_domains_locked() call to happen in some cases.

Juri had found that two rebuild_sched_domains_locked() calls in
update_prstate(), one from update_cpumasks_hier() and another one from
update_partition_sd_lb() could cause cpuset partition to be created
with null total_bw for DL tasks. IOW, DL tasks may not be scheduled
correctly in such a partition.

A sample command sequence that can reproduce null total_bw is as
follows.

  # echo Y >/sys/kernel/debug/sched/verbose
  # echo +cpuset >/sys/fs/cgroup/cgroup.subtree_control
  # mkdir /sys/fs/cgroup/test
  # echo 0-7 > /sys/fs/cgroup/test/cpuset.cpus
  # echo 6-7 > /sys/fs/cgroup/test/cpuset.cpus.exclusive
  # echo root >/sys/fs/cgroup/test/cpuset.cpus.partition

Fix this double rebuild_sched_domains_locked() calls problem
by replacing existing calls with cpuset_force_rebuild() except
the rebuild_sched_domains_cpuslocked() call at the end of
cpuset_handle_hotplug(). Checking of the force_sd_rebuild flag is
now done at the end of cpuset_write_resmask() and update_prstate()
to determine if rebuild_sched_domains_locked() should be called or not.

The cpuset v1 code can still call rebuild_sched_domains_locked()
directly as double rebuild_sched_domains_locked() calls is not possible.

Reported-by: Juri Lelli <juri.lelli@...hat.com>
Closes: https://lore.kernel.org/lkml/ZyuUcJDPBln1BK1Y@jlelli-thinkpadt14gen4.remote.csb/
Signed-off-by: Waiman Long <longman@...hat.com>
---
 kernel/cgroup/cpuset.c | 49 ++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 565280193922..0d56a226c522 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -84,9 +84,19 @@ static bool		have_boot_isolcpus;
 static struct list_head remote_children;
 
 /*
- * A flag to force sched domain rebuild at the end of an operation while
- * inhibiting it in the intermediate stages when set. Currently it is only
- * set in hotplug code.
+ * A flag to force sched domain rebuild at the end of an operation.
+ * It can be set in
+ *  - update_partition_sd_lb()
+ *  - remote_partition_check()
+ *  - update_cpumasks_hier()
+ *  - cpuset_update_flag()
+ *  - cpuset_hotplug_update_tasks()
+ *  - cpuset_handle_hotplug()
+ *
+ * Protected by cpuset_mutex (with cpus_read_lock held) or cpus_write_lock.
+ *
+ * Note that update_relax_domain_level() in cpuset-v1.c can still call
+ * rebuild_sched_domains_locked() directly without using this flag.
  */
 static bool force_sd_rebuild;
 
@@ -990,6 +1000,7 @@ void rebuild_sched_domains_locked(void)
 
 	lockdep_assert_cpus_held();
 	lockdep_assert_held(&cpuset_mutex);
+	force_sd_rebuild = false;
 
 	/*
 	 * If we have raced with CPU hotplug, return early to avoid
@@ -1164,8 +1175,8 @@ static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
 			clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	}
 
-	if (rebuild_domains && !force_sd_rebuild)
-		rebuild_sched_domains_locked();
+	if (rebuild_domains)
+		cpuset_force_rebuild();
 }
 
 /*
@@ -1512,8 +1523,8 @@ static void remote_partition_check(struct cpuset *cs, struct cpumask *newmask,
 			remote_partition_disable(child, tmp);
 			disable_cnt++;
 		}
-	if (disable_cnt && !force_sd_rebuild)
-		rebuild_sched_domains_locked();
+	if (disable_cnt)
+		cpuset_force_rebuild();
 }
 
 /*
@@ -2106,8 +2117,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
 	}
 	rcu_read_unlock();
 
-	if (need_rebuild_sched_domains && !force_sd_rebuild)
-		rebuild_sched_domains_locked();
+	if (need_rebuild_sched_domains)
+		cpuset_force_rebuild();
 }
 
 /**
@@ -2726,9 +2737,13 @@ int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	cs->flags = trialcs->flags;
 	spin_unlock_irq(&callback_lock);
 
-	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed &&
-	    !force_sd_rebuild)
-		rebuild_sched_domains_locked();
+	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
+		if (!IS_ENABLED(CONFIG_CPUSETS_V1) ||
+		    cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+			cpuset_force_rebuild();
+		else
+			rebuild_sched_domains_locked();
+	}
 
 	if (spread_flag_changed)
 		cpuset1_update_tasks_flags(cs);
@@ -2848,6 +2863,8 @@ static int update_prstate(struct cpuset *cs, int new_prs)
 	update_partition_sd_lb(cs, old_prs);
 
 	notify_partition_change(cs, old_prs);
+	if (force_sd_rebuild)
+		rebuild_sched_domains_locked();
 	free_cpumasks(NULL, &tmpmask);
 	return 0;
 }
@@ -3141,6 +3158,8 @@ ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
 	}
 
 	free_cpuset(trialcs);
+	if (force_sd_rebuild)
+		rebuild_sched_domains_locked();
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	cpus_read_unlock();
@@ -3885,11 +3904,9 @@ static void cpuset_handle_hotplug(void)
 		rcu_read_unlock();
 	}
 
-	/* rebuild sched domains if cpus_allowed has changed */
-	if (force_sd_rebuild) {
-		force_sd_rebuild = false;
+	/* rebuild sched domains if necessary */
+	if (force_sd_rebuild)
 		rebuild_sched_domains_cpuslocked();
-	}
 
 	free_cpumasks(NULL, ptmp);
 }
-- 
2.47.0


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ