linux-kernel - [PATCH v5 12/24] x86/resctrl: Make resctrl_arch_rmid

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20230728164254.27562-13-james.morse@arm.com>
Date:   Fri, 28 Jul 2023 16:42:42 +0000
From:   James Morse <james.morse@....com>
To:     x86@...nel.org, linux-kernel@...r.kernel.org
Cc:     Fenghua Yu <fenghua.yu@...el.com>,
        Reinette Chatre <reinette.chatre@...el.com>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        H Peter Anvin <hpa@...or.com>,
        Babu Moger <Babu.Moger@....com>,
        James Morse <james.morse@....com>,
        shameerali.kolothum.thodi@...wei.com,
        D Scott Phillips OS <scott@...amperecomputing.com>,
        carl@...amperecomputing.com, lcherian@...vell.com,
        bobo.shaobowang@...wei.com, tan.shaopeng@...itsu.com,
        xingxin.hx@...nanolis.org, baolin.wang@...ux.alibaba.com,
        Jamie Iles <quic_jiles@...cinc.com>,
        Xin Hao <xhao@...ux.alibaba.com>, peternewman@...gle.com,
        dfustini@...libre.com
Subject: [PATCH v5 12/24] x86/resctrl: Make resctrl_arch_rmid_read() retry when it is interrupted

resctrl_arch_rmid_read() could be called by resctrl in process context,
and then called by the PMU driver from irq context on the same CPU.
This could cause struct arch_mbm_state's prev_msr value to go backwards,
leading to the chunks value being incremented multiple times.

The struct arch_mbm_state holds both the previous msr value, and a count
of the number of chunks. These two fields need to be updated atomically.
Similarly __rmid_read() must write to one MSR and read from another,
this must be proteted from re-entrance.

Read the prev_msr before accessing the hardware, and cmpxchg() the value
back. If the value has changed, the whole thing is re-attempted. To protect
the MSR, __rmid_read() will retry reads for QM_CTR if QM_EVTSEL has changed
from the selected value.

Signed-off-by: James Morse <james.morse@....com>

---
Changes since v4:
 * Added retry loop in __rmid_read() to protect the CPU MSRs.
---
 arch/x86/kernel/cpu/resctrl/internal.h |  5 +--
 arch/x86/kernel/cpu/resctrl/monitor.c  | 45 ++++++++++++++++++++------
 2 files changed, 38 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index a32d307292a1..7012f42a82ee 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -2,6 +2,7 @@
 #ifndef _ASM_X86_RESCTRL_INTERNAL_H
 #define _ASM_X86_RESCTRL_INTERNAL_H
 
+#include <linux/atomic.h>
 #include <linux/resctrl.h>
 #include <linux/sched.h>
 #include <linux/kernfs.h>
@@ -338,8 +339,8 @@ struct mbm_state {
  *		find this struct.
  */
 struct arch_mbm_state {
-	u64	chunks;
-	u64	prev_msr;
+	atomic64_t	chunks;
+	atomic64_t	prev_msr;
 };
 
 /**
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index f0670795b446..62350bbd23e0 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -16,6 +16,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/percpu.h>
 #include <linux/sizes.h>
 #include <linux/slab.h>
 
@@ -24,6 +25,9 @@
 
 #include "internal.h"
 
+/* Sequence number for writes to IA32 QM_EVTSEL */
+static DEFINE_PER_CPU(u64, qm_evtsel_seq);
+
 struct rmid_entry {
 	/*
 	 * Some architectures's resctrl_arch_rmid_read() needs the CLOSID value
@@ -178,7 +182,7 @@ static inline struct rmid_entry *__rmid_entry(u32 idx)
 
 static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 {
-	u64 msr_val;
+	u64 msr_val, seq;
 
 	/*
 	 * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -187,9 +191,16 @@ static int __rmid_read(u32 rmid, enum resctrl_event_id eventid, u64 *val)
 	 * IA32_QM_CTR.data (bits 61:0) reports the monitored data.
 	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
 	 * are error bits.
+	 * A per-cpu sequence counter is incremented each time QM_EVTSEL is
+	 * written. This is used to detect if this function was interrupted by
+	 * another call without re-reading the MSRs. Retry the MSR read when
+	 * this happens as the QM_CTR value may belong to a different event.
 	 */
-	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
-	rdmsrl(MSR_IA32_QM_CTR, msr_val);
+	do {
+		seq = this_cpu_inc_return(qm_evtsel_seq);
+		wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
+		rdmsrl(MSR_IA32_QM_CTR, msr_val);
+	} while (seq != this_cpu_read(qm_evtsel_seq));
 
 	if (msr_val & RMID_VAL_ERROR)
 		return -EIO;
@@ -225,13 +236,15 @@ void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
 {
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
 	struct arch_mbm_state *am;
+	u64 msr_val;
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
 	if (am) {
 		memset(am, 0, sizeof(*am));
 
 		/* Record any initial, non-zero count value. */
-		__rmid_read(rmid, eventid, &am->prev_msr);
+		__rmid_read(rmid, eventid, &msr_val);
+		atomic64_set(&am->prev_msr, msr_val);
 	}
 }
 
@@ -266,23 +279,35 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
 {
 	struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
 	struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+	u64 start_msr_val, old_msr_val, msr_val, chunks;
 	struct arch_mbm_state *am;
-	u64 msr_val, chunks;
-	int ret;
+	int ret = 0;
 
 	if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
 		return -EINVAL;
 
+interrupted:
+	am = get_arch_mbm_state(hw_dom, rmid, eventid);
+	if (am)
+		start_msr_val = atomic64_read(&am->prev_msr);
+
 	ret = __rmid_read(rmid, eventid, &msr_val);
 	if (ret)
 		return ret;
 
 	am = get_arch_mbm_state(hw_dom, rmid, eventid);
 	if (am) {
-		am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
-						 hw_res->mbm_width);
-		chunks = get_corrected_mbm_count(rmid, am->chunks);
-		am->prev_msr = msr_val;
+		old_msr_val = atomic64_cmpxchg(&am->prev_msr, start_msr_val,
+					       msr_val);
+		if (old_msr_val != start_msr_val)
+			goto interrupted;
+
+		chunks = mbm_overflow_count(start_msr_val, msr_val,
+					    hw_res->mbm_width);
+		atomic64_add(chunks, &am->chunks);
+
+		chunks = get_corrected_mbm_count(rmid,
+						 atomic64_read(&am->chunks));
 	} else {
 		chunks = msr_val;
 	}
-- 
2.39.2