Message-ID: <20240325172707.73966-5-peternewman@google.com>
Date: Mon, 25 Mar 2024 10:27:05 -0700
From: Peter Newman <peternewman@...gle.com>
To: Fenghua Yu <fenghua.yu@...el.com>, Reinette Chatre <reinette.chatre@...el.com>, 
	James Morse <james.morse@....com>
Cc: Stephane Eranian <eranian@...gle.com>, Thomas Gleixner <tglx@...utronix.de>, 
	Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>, 
	Dave Hansen <dave.hansen@...ux.intel.com>, x86@...nel.org, 
	"H. Peter Anvin" <hpa@...or.com>, Peter Zijlstra <peterz@...radead.org>, Juri Lelli <juri.lelli@...hat.com>, 
	Vincent Guittot <vincent.guittot@...aro.org>, Dietmar Eggemann <dietmar.eggemann@....com>, 
	Steven Rostedt <rostedt@...dmis.org>, Ben Segall <bsegall@...gle.com>, Mel Gorman <mgorman@...e.de>, 
	Daniel Bristot de Oliveira <bristot@...hat.com>, Valentin Schneider <vschneid@...hat.com>, 
	Uros Bizjak <ubizjak@...il.com>, Mike Rapoport <rppt@...nel.org>, 
	"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>, 
	Rick Edgecombe <rick.p.edgecombe@...el.com>, Xin Li <xin3.li@...el.com>, 
	Babu Moger <babu.moger@....com>, Shaopeng Tan <tan.shaopeng@...itsu.com>, 
	Maciej Wieczor-Retman <maciej.wieczor-retman@...el.com>, Jens Axboe <axboe@...nel.dk>, 
	Christian Brauner <brauner@...nel.org>, Oleg Nesterov <oleg@...hat.com>, 
	Andrew Morton <akpm@...ux-foundation.org>, Tycho Andersen <tandersen@...flix.com>, 
	Nicholas Piggin <npiggin@...il.com>, Beau Belgrave <beaub@...ux.microsoft.com>, 
	"Matthew Wilcox (Oracle)" <willy@...radead.org>, linux-kernel@...r.kernel.org, 
	Peter Newman <peternewman@...gle.com>
Subject: [PATCH v1 4/6] x86/resctrl: Use rdtgroup pointer to indicate task membership

Caching the CLOSID and RMID values in every member task makes changing
either ID for a group expensive: every task_struct in the system must
be inspected while tasklist_lock is held for reading.
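
For illustration, changing a group's ID under the old scheme amounts to
the following system-wide walk (condensed from the rdt_move_group_tasks()
hunk below):

	read_lock(&tasklist_lock);
	for_each_process_thread(p, t) {
		if (is_closid_match(t, from) || is_rmid_match(t, from))
			resctrl_arch_set_closid_rmid(t, to->closid,
						     to->mon.rmid);
	}
	read_unlock(&tasklist_lock);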

A single rdtgroup reference in the task_struct is enough to indicate a
task's mongroup and ctrl group membership. In the case of mongroups,
the parent pointer can be used to determine the CLOSID indirectly,
avoiding the need to invalidate a cached CLOSID in every task_struct.
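
Condensed from the __resctrl_sched_in() hunk below, the indirection
amounts to:

	rgrp = READ_ONCE(tsk->rdt_group);
	if (rgrp->type == RDTMON_GROUP)
		closid = rgrp->mon.parent->closid;	/* inherited */
	else
		closid = rgrp->closid;
	rmid = rgrp->mon.rmid;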

This also solves the problem of tearing PARTID/PMG values in MPAM, as
the parent pointer of a mongroup does not change. An atomic read of the
rdt_group pointer therefore provides a consistent view of a task's
current mongroup and control group membership, making
__resctrl_sched_in() portable.
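
To spell out the tearing hazard: with two separate fields, a context
switch can land between the two stores and observe a mismatched pair
(sketch of the old scheme):

	WRITE_ONCE(tsk->closid, new_closid);
	/* switch-in here reads new closid paired with old rmid */
	WRITE_ONCE(tsk->rmid, new_rmid);

With a single pointer store, both IDs are derived from whichever
rdt_group value the switch-in observes, so the pair is always
consistent.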

Care must be taken to ensure that __resctrl_sched_in() never
dereferences a pointer to a freed rdtgroup struct. An exiting task may
no longer be reachable via for_each_process_thread() but can still be
switched in, so the rdt_group pointer must be updated before the thread
is removed from the tasklist.
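
This is the job of the exit_resctrl() hook below, which parks the
exiting task on the never-freed default group:

	void exit_resctrl(struct task_struct *tsk)
	{
		WRITE_ONCE(tsk->rdt_group, &rdtgroup_default);
	}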

Co-developed-by: Stephane Eranian <eranian@...gle.com>
Signed-off-by: Stephane Eranian <eranian@...gle.com>
Signed-off-by: Peter Newman <peternewman@...gle.com>
---
 arch/x86/include/asm/resctrl.h         |  18 ---
 arch/x86/kernel/cpu/resctrl/core.c     |   3 +-
 arch/x86/kernel/cpu/resctrl/internal.h |  13 +-
 arch/x86/kernel/cpu/resctrl/rdtgroup.c | 205 +++++++++++++------------
 include/linux/sched.h                  |   3 +-
 5 files changed, 110 insertions(+), 132 deletions(-)

diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index 99ba8c0dc155..be4afbc6180f 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -64,24 +64,6 @@ static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
 	return val * scale;
 }
 
-static inline void resctrl_arch_set_closid_rmid(struct task_struct *tsk,
-						u32 closid, u32 rmid)
-{
-	WRITE_ONCE(tsk->closid, closid);
-	WRITE_ONCE(tsk->rmid, rmid);
-}
-
-static inline bool resctrl_arch_match_closid(struct task_struct *tsk, u32 closid)
-{
-	return READ_ONCE(tsk->closid) == closid;
-}
-
-static inline bool resctrl_arch_match_rmid(struct task_struct *tsk, u32 ignored,
-					   u32 rmid)
-{
-	return READ_ONCE(tsk->rmid) == rmid;
-}
-
 static inline u32 resctrl_arch_system_num_rmid_idx(void)
 {
 	/* RMID are independent numbers for x86. num_rmid_idx == num_rmid */
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 83e40341583e..ae5878d748fc 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -600,8 +600,7 @@ static void clear_closid_rmid(int cpu)
 {
 	struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
 
-	state->default_closid = RESCTRL_RESERVED_CLOSID;
-	state->default_rmid = RESCTRL_RESERVED_RMID;
+	state->default_group = &rdtgroup_default;
 	state->cur_closid = RESCTRL_RESERVED_CLOSID;
 	state->cur_rmid = RESCTRL_RESERVED_RMID;
 	wrmsr(MSR_IA32_PQR_ASSOC, RESCTRL_RESERVED_RMID,
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 56a68e542572..0ba0d2428780 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -334,14 +334,8 @@ struct rftype {
 /**
  * struct resctrl_pqr_state - State cache for the PQR MSR
  * @cur_rmid:		The cached Resource Monitoring ID
- * @cur_closid:	The cached Class Of Service ID
- * @default_rmid:	The user assigned Resource Monitoring ID
- * @default_closid:	The user assigned cached Class Of Service ID
- *
- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
- * contains both parts, so we need to cache them. This also
- * stores the user configured per cpu CLOSID and RMID.
+ * @cur_closid:		The cached Class Of Service ID
+ * @default_group:	The user assigned rdtgroup
  *
  * The cache also helps to avoid pointless updates if the value does
  * not change.
@@ -349,8 +343,7 @@ struct rftype {
 struct resctrl_pqr_state {
 	u32			cur_rmid;
 	u32			cur_closid;
-	u32			default_rmid;
-	u32			default_closid;
+	struct rdtgroup		*default_group;
 };
 
 DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 8d6979dbfd02..badf181c8cbb 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -348,25 +348,55 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 void __resctrl_sched_in(struct task_struct *tsk)
 {
 	struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
-	u32 closid = state->default_closid;
-	u32 rmid = state->default_rmid;
-	u32 tmp;
+	u32 closid = state->cur_closid;
+	u32 rmid = state->cur_rmid;
+	struct rdtgroup *rgrp;
 
 	/*
-	 * If this task has a closid/rmid assigned, use it.
-	 * Else use the closid/rmid assigned to this cpu.
+	 * A task's group assignment can change concurrently, but the CLOSID or
+	 * RMID assigned to a group cannot change.
 	 */
+	rgrp = READ_ONCE(tsk->rdt_group);
+	if (!rgrp || rgrp == &rdtgroup_default)
+		/*
+		 * If this task is a member of a control or monitoring group,
+		 * use the IDs assigned to these groups. Else use the
+		 * closid/rmid assigned to this cpu.
+		 */
+		rgrp = state->default_group;
+
+	/*
+	 * Context switches are possible before the cpuonline handler
+	 * initializes default_group.
+	 */
+	if (!rgrp)
+		rgrp = &rdtgroup_default;
+
 	if (static_branch_likely(&rdt_alloc_enable_key)) {
-		tmp = READ_ONCE(tsk->closid);
-		if (tmp)
-			closid = tmp;
+		/*
+		 * If the task is assigned to a monitoring group, the CLOSID is
+		 * determined by the parent control group.
+		 */
+		if (rgrp->type == RDTMON_GROUP) {
+			if (!WARN_ON(!rgrp->mon.parent))
+				/*
+				 * The parent rdtgroup cannot be freed until
+				 * after the mon group is freed. In the event
+				 * that the parent rdtgroup is removed (by
+				 * rdtgroup_rmdir_ctrl()), the task's rdt_group
+				 * would be redirected to rdtgroup_default,
+				 * followed by a full barrier and synchronous
+				 * IPI broadcast before proceeding to free the
+				 * group.
+				 */
+				closid = rgrp->mon.parent->closid;
+		} else {
+			closid = rgrp->closid;
+		}
 	}
 
-	if (static_branch_likely(&rdt_mon_enable_key)) {
-		tmp = READ_ONCE(tsk->rmid);
-		if (tmp)
-			rmid = tmp;
-	}
+	if (static_branch_likely(&rdt_mon_enable_key))
+		rmid = rgrp->mon.rmid;
 
 	if (closid != state->cur_closid || rmid != state->cur_rmid) {
 		state->cur_closid = closid;
@@ -385,10 +415,8 @@ static void update_cpu_closid_rmid(void *info)
 {
 	struct rdtgroup *r = info;
 
-	if (r) {
-		this_cpu_write(pqr_state.default_closid, r->closid);
-		this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
-	}
+	if (r)
+		this_cpu_write(pqr_state.default_group, r);
 
 	/*
 	 * We cannot unconditionally write the MSR because the current
@@ -624,49 +652,61 @@ static void update_task_closid_rmid(struct task_struct *t)
 
 static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp)
 {
-	u32 closid, rmid = rdtgrp->mon.rmid;
+	struct rdtgroup *task_group = READ_ONCE(tsk->rdt_group);
 
-	if (rdtgrp->type == RDTCTRL_GROUP)
-		closid = rdtgrp->closid;
-	else if (rdtgrp->type == RDTMON_GROUP)
-		closid = rdtgrp->mon.parent->closid;
-	else
-		return false;
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	/* Uninitialized rdt_group pointer implies rdtgroup_default. */
+	if (!task_group)
+		task_group = &rdtgroup_default;
+
+	if (rdtgrp == task_group)
+		return true;
+
+	/* Tasks in child mongroups are members of the parent ctrlmon group. */
+	if (task_group->type == RDTMON_GROUP &&
+	    task_group->mon.parent == rdtgrp)
+		return true;
 
-	return resctrl_arch_match_closid(tsk, closid) &&
-	       resctrl_arch_match_rmid(tsk, closid, rmid);
+	return false;
 }
 
 static int __rdtgroup_move_task(struct task_struct *tsk,
 				struct rdtgroup *rdtgrp)
 {
+	struct rdtgroup *task_group = READ_ONCE(tsk->rdt_group);
+
 	/* If the task is already in rdtgrp, no need to move the task. */
 	if (task_in_rdtgroup(tsk, rdtgrp))
 		return 0;
 
 	/*
-	 * Set the task's closid/rmid before the PQR_ASSOC MSR can be
-	 * updated by them.
+	 * NULL is used in the task_struct so it can be overridden by a CPU's
+	 * default_group.
+	 */
+	if (!task_group)
+		task_group = &rdtgroup_default;
+
+	/*
+	 * Set the task's group before PQR_ASSOC can be updated from it.
 	 *
 	 * For ctrl_mon groups, move both closid and rmid.
 	 * For monitor groups, can move the tasks only from
-	 * their parent CTRL group.
+	 * their parent CTRL group or another mon group under the same parent.
 	 */
-	if (rdtgrp->type == RDTMON_GROUP &&
-	    !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
+	if (rdtgrp->type == RDTCTRL_GROUP) {
+		WRITE_ONCE(tsk->rdt_group, rdtgrp);
+	} else if (rdtgrp->type == RDTMON_GROUP &&
+		   (task_group == rdtgrp->mon.parent ||
+		    task_group->mon.parent == rdtgrp->mon.parent)) {
+		WRITE_ONCE(tsk->rdt_group, rdtgrp);
+	} else {
 		rdt_last_cmd_puts("Can't move task to different control group\n");
 		return -EINVAL;
 	}
 
-	if (rdtgrp->type == RDTMON_GROUP)
-		resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
-					     rdtgrp->mon.rmid);
-	else
-		resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
-					     rdtgrp->mon.rmid);
-
 	/*
-	 * Ensure the task's closid and rmid are written before determining if
+	 * Ensure the task's group is written before determining if
 	 * the task is current that will decide if it will be interrupted.
 	 * This pairs with the full barrier between the rq->curr update and
 	 * resctrl_sched_in() during context switch.
@@ -684,19 +724,6 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
 	return 0;
 }
 
-static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
-{
-	return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
-		resctrl_arch_match_closid(t, r->closid));
-}
-
-static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
-{
-	return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
-		resctrl_arch_match_rmid(t, r->mon.parent->closid,
-					r->mon.rmid));
-}
-
 /**
  * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
  * @r: Resource group
@@ -712,7 +739,7 @@ int rdtgroup_tasks_assigned(struct rdtgroup *r)
 
 	rcu_read_lock();
 	for_each_process_thread(p, t) {
-		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
+		if (task_in_rdtgroup(t, r)) {
 			ret = 1;
 			break;
 		}
@@ -830,7 +857,7 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 
 	rcu_read_lock();
 	for_each_process_thread(p, t) {
-		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
+		if (task_in_rdtgroup(t, r)) {
 			pid = task_pid_vnr(t);
 			if (pid)
 				seq_printf(s, "%d\n", pid);
@@ -924,53 +951,34 @@ int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns,
 		      struct pid *pid, struct task_struct *tsk)
 {
 	struct rdtgroup *rdtg;
-	int ret = 0;
-
-	mutex_lock(&rdtgroup_mutex);
+	struct rdtgroup *crg;
+	struct rdtgroup *mrg;
 
 	/* Return empty if resctrl has not been mounted. */
 	if (!resctrl_mounted) {
 		seq_puts(s, "res:\nmon:\n");
-		goto unlock;
+		return 0;
 	}
 
-	list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) {
-		struct rdtgroup *crg;
+	mutex_lock(&rdtgroup_mutex);
 
-		/*
-		 * Task information is only relevant for shareable
-		 * and exclusive groups.
-		 */
-		if (rdtg->mode != RDT_MODE_SHAREABLE &&
-		    rdtg->mode != RDT_MODE_EXCLUSIVE)
-			continue;
+	rdtg = READ_ONCE(tsk->rdt_group);
+	if (!rdtg)
+		rdtg = &rdtgroup_default;
 
-		if (!resctrl_arch_match_closid(tsk, rdtg->closid))
-			continue;
+	mrg = rdtg;
+	crg = rdtg;
+	if (rdtg->type == RDTMON_GROUP)
+		crg = rdtg->mon.parent;
+
+	seq_printf(s, "res:%s%s\n", (crg == &rdtgroup_default) ? "/" : "",
+		   crg->kn->name);
+	seq_printf(s, "mon:%s%s\n", (mrg == &rdtgroup_default) ? "/" : "",
+		   mrg->kn->name);
 
-		seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "",
-			   rdtg->kn->name);
-		seq_puts(s, "mon:");
-		list_for_each_entry(crg, &rdtg->mon.crdtgrp_list,
-				    mon.crdtgrp_list) {
-			if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid,
-						     crg->mon.rmid))
-				continue;
-			seq_printf(s, "%s", crg->kn->name);
-			break;
-		}
-		seq_putc(s, '\n');
-		goto unlock;
-	}
-	/*
-	 * The above search should succeed. Otherwise return
-	 * with an error.
-	 */
-	ret = -ENOENT;
-unlock:
 	mutex_unlock(&rdtgroup_mutex);
 
-	return ret;
+	return 0;
 }
 #endif
 
@@ -2904,13 +2912,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
 
 	read_lock(&tasklist_lock);
 	for_each_process_thread(p, t) {
-		if (!from || is_closid_match(t, from) ||
-		    is_rmid_match(t, from)) {
-			resctrl_arch_set_closid_rmid(t, to->closid,
-						     to->mon.rmid);
+		if (!from || task_in_rdtgroup(t, from)) {
+			WRITE_ONCE(t->rdt_group, to);
 
 			/*
-			 * Order the closid/rmid stores above before the loads
+			 * Order the group store above before the loads
 			 * in task_curr(). This pairs with the full barrier
 			 * between the rq->curr update and resctrl_sched_in()
 			 * during context switch.
@@ -2939,6 +2945,7 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
  */
 void exit_resctrl(struct task_struct *tsk)
 {
+	WRITE_ONCE(tsk->rdt_group, &rdtgroup_default);
 }
 
 static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
@@ -3681,7 +3688,7 @@ static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
 
 	/* Update per cpu rmid of the moved CPUs first */
 	for_each_cpu(cpu, &rdtgrp->cpu_mask)
-		per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
+		per_cpu(pqr_state.default_group, cpu) = prdtgrp;
 	/*
 	 * Update the MSR on moved CPUs and CPUs which have moved
 	 * task running on them.
@@ -3724,10 +3731,8 @@ static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask)
 		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
 
 	/* Update per cpu closid and rmid of the moved CPUs first */
-	for_each_cpu(cpu, &rdtgrp->cpu_mask) {
-		per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
-		per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
-	}
+	for_each_cpu(cpu, &rdtgrp->cpu_mask)
+		per_cpu(pqr_state.default_group, cpu) = &rdtgroup_default;
 
 	/*
 	 * Update the MSR on moved CPUs and CPUs which have moved
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3c2abbc587b4..d07d7a80006b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1236,8 +1236,7 @@ struct task_struct {
 	struct list_head		cg_list;
 #endif
 #ifdef CONFIG_X86_CPU_RESCTRL
-	u32				closid;
-	u32				rmid;
+	struct rdtgroup			*rdt_group;
 #endif
 #ifdef CONFIG_FUTEX
 	struct robust_list_head __user	*robust_list;
-- 
2.44.0.396.g6e790dbe36-goog

