linux-kernel - [PATCH 07/12] x86/rdt,cqm: Scheduling support update

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1483740005-23499-8-git-send-email-vikas.shivappa@linux.intel.com>
Date:   Fri,  6 Jan 2017 14:00:00 -0800
From:   Vikas Shivappa <vikas.shivappa@...ux.intel.com>
To:     vikas.shivappa@...el.com, vikas.shivappa@...ux.intel.com
Cc:     davidcc@...gle.com, eranian@...gle.com,
        linux-kernel@...r.kernel.org, x86@...nel.org, hpa@...or.com,
        tglx@...utronix.de, mingo@...nel.org, peterz@...radead.org,
        ravi.v.shankar@...el.com, tony.luck@...el.com,
        fenghua.yu@...el.com, andi.kleen@...el.com, h.peter.anvin@...el.com
Subject: [PATCH 07/12] x86/rdt,cqm: Scheduling support update

Introduce a scheduling hook, finish_arch_pre_lock_switch(), which is
called just after the perf sched_in during a context switch. This hook
handles both the CAT and CQM sched_in scenarios.
The IA32_PQR_ASSOC MSR is used by both CAT (cache allocation) and CQM,
and this patch merges the two MSR writes into one. The common sched_in
code checks whether the per-CPU cached state has a different RMID or
CLOSid than the task and, if so, does the MSR write.

During sched_in, a task uses its own RMID if the task is being
monitored; otherwise it uses the RMID of the task's cgroup.

This patch is based on David Carrillo-Cisneros' <davidcc@...gle.com>
patches in the cqm2 series.

Signed-off-by: Vikas Shivappa <vikas.shivappa@...ux.intel.com>
---
 arch/x86/events/intel/cqm.c              | 45 ++++++++++-----
 arch/x86/include/asm/intel_pqr_common.h  | 38 +++++++++++++
 arch/x86/include/asm/intel_rdt.h         | 39 -------------
 arch/x86/include/asm/intel_rdt_common.h  | 11 ++++
 arch/x86/include/asm/processor.h         |  4 ++
 arch/x86/kernel/cpu/Makefile             |  1 +
 arch/x86/kernel/cpu/intel_rdt_common.c   | 98 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/intel_rdt_rdtgroup.c |  4 +-
 arch/x86/kernel/process_32.c             |  4 --
 arch/x86/kernel/process_64.c             |  4 --
 kernel/sched/core.c                      |  1 +
 kernel/sched/sched.h                     |  3 +
 12 files changed, 188 insertions(+), 64 deletions(-)
 create mode 100644 arch/x86/include/asm/intel_pqr_common.h
 create mode 100644 arch/x86/kernel/cpu/intel_rdt_common.c

diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c
index c6479ae..597a184 100644
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -28,13 +28,6 @@
 static bool cqm_enabled, mbm_enabled;
 unsigned int cqm_socket_max;
 
-/*
- * The cached intel_pqr_state is strictly per CPU and can never be
- * updated from a remote CPU. Both functions which modify the state
- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
- * interrupts disabled, which is sufficient for the protection.
- */
-DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
 static struct hrtimer *mbm_timers;
 /**
  * struct sample - mbm event's (local or total) data
@@ -74,6 +67,8 @@ struct sample {
 static DEFINE_MUTEX(cache_mutex);
 static DEFINE_RAW_SPINLOCK(cache_lock);
 
+DEFINE_STATIC_KEY_FALSE(cqm_enable_key);
+
 /*
  * Groups of events that have the same target(s), one RMID per group.
  */
@@ -108,7 +103,7 @@ struct sample {
  * Likewise, an rmid value of -1 is used to indicate "no rmid currently
  * assigned" and is used as part of the rotation code.
  */
-static inline bool __rmid_valid(u32 rmid)
+bool __rmid_valid(u32 rmid)
 {
 	if (!rmid || rmid > cqm_max_rmid)
 		return false;
@@ -161,7 +156,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid, int domain)
  *
  * We expect to be called with cache_mutex held.
  */
-static u32 __get_rmid(int domain)
+u32 __get_rmid(int domain)
 {
 	struct list_head *cqm_flist;
 	struct cqm_rmid_entry *entry;
@@ -368,6 +363,23 @@ static void init_mbm_sample(u32 *rmid, u32 evt_type)
 	on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
 }
 
+#ifdef CONFIG_CGROUP_PERF
+struct cgrp_cqm_info *cqminfo_from_tsk(struct task_struct *tsk)
+{
+	struct cgrp_cqm_info *ccinfo = NULL;
+	struct perf_cgroup *pcgrp;
+
+	pcgrp = perf_cgroup_from_task(tsk, NULL);
+
+	if (!pcgrp)
+		return NULL;
+	else
+		ccinfo = cgrp_to_cqm_info(pcgrp);
+
+	return ccinfo;
+}
+#endif
+
 static inline void cqm_enable_mon(struct cgrp_cqm_info *cqm_info, u32 *rmid)
 {
 	if (rmid != NULL) {
@@ -713,26 +725,27 @@ void alloc_needed_pkg_rmid(u32 *cqm_rmid)
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-	u32 rmid;
 
 	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
 		return;
 
 	event->hw.cqm_state &= ~PERF_HES_STOPPED;
 
-	alloc_needed_pkg_rmid(event->hw.cqm_rmid);
-
-	rmid = event->hw.cqm_rmid[pkg_id];
-	state->rmid = rmid;
-	wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
+	if (is_task_event(event)) {
+		alloc_needed_pkg_rmid(event->hw.cqm_rmid);
+		state->next_task_rmid = event->hw.cqm_rmid[pkg_id];
+	}
 }
 
 static void intel_cqm_event_stop(struct perf_event *event, int mode)
 {
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+
 	if (event->hw.cqm_state & PERF_HES_STOPPED)
 		return;
 
 	event->hw.cqm_state |= PERF_HES_STOPPED;
+	state->next_task_rmid = 0;
 }
 
 static int intel_cqm_event_add(struct perf_event *event, int mode)
@@ -1366,6 +1379,8 @@ static int __init intel_cqm_init(void)
 	if (mbm_enabled)
 		pr_info("Intel MBM enabled\n");
 
+	static_branch_enable(&cqm_enable_key);
+
 	/*
 	 * Setup the hot cpu notifier once we are sure cqm
 	 * is enabled to avoid notifier leak.
diff --git a/arch/x86/include/asm/intel_pqr_common.h b/arch/x86/include/asm/intel_pqr_common.h
new file mode 100644
index 0000000..8fe9d8e
--- /dev/null
+++ b/arch/x86/include/asm/intel_pqr_common.h
@@ -0,0 +1,38 @@
+#ifndef _ASM_X86_INTEL_PQR_COMMON_H
+#define _ASM_X86_INTEL_PQR_COMMON_H
+
+#ifdef CONFIG_INTEL_RDT
+
+#include <linux/jump_label.h>
+#include <linux/types.h>
+#include <asm/percpu.h>
+#include <asm/msr.h>
+#include <asm/intel_rdt_common.h>
+
+void __intel_rdt_sched_in(void);
+
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ *   which supports resource control and we enable by mounting the
+ *   resctrl file system.
+ * - Caches the per cpu CLOSid values and does the MSR write only
+ *   when a task with a different CLOSid is scheduled in.
+ */
+static inline void intel_rdt_sched_in(void)
+{
+	if (static_branch_likely(&rdt_enable_key) ||
+		static_branch_unlikely(&cqm_enable_key)) {
+		__intel_rdt_sched_in();
+	}
+}
+
+#else
+
+static inline void intel_rdt_sched_in(void) {}
+
+#endif
+#endif
diff --git a/arch/x86/include/asm/intel_rdt.h b/arch/x86/include/asm/intel_rdt.h
index 95ce5c8..3b4a099 100644
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -5,7 +5,6 @@
 
 #include <linux/kernfs.h>
 #include <linux/jump_label.h>
-
 #include <asm/intel_rdt_common.h>
 
 #define IA32_L3_QOS_CFG		0xc81
@@ -182,43 +181,5 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 int rdtgroup_schemata_show(struct kernfs_open_file *of,
 			   struct seq_file *s, void *v);
 
-/*
- * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
- *
- * Following considerations are made so that this has minimal impact
- * on scheduler hot path:
- * - This will stay as no-op unless we are running on an Intel SKU
- *   which supports resource control and we enable by mounting the
- *   resctrl file system.
- * - Caches the per cpu CLOSid values and does the MSR write only
- *   when a task with a different CLOSid is scheduled in.
- *
- * Must be called with preemption disabled.
- */
-static inline void intel_rdt_sched_in(void)
-{
-	if (static_branch_likely(&rdt_enable_key)) {
-		struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
-		int closid;
-
-		/*
-		 * If this task has a closid assigned, use it.
-		 * Else use the closid assigned to this cpu.
-		 */
-		closid = current->closid;
-		if (closid == 0)
-			closid = this_cpu_read(cpu_closid);
-
-		if (closid != state->closid) {
-			state->closid = closid;
-			wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
-		}
-	}
-}
-
-#else
-
-static inline void intel_rdt_sched_in(void) {}
-
 #endif /* CONFIG_INTEL_RDT_A */
 #endif /* _ASM_X86_INTEL_RDT_H */
diff --git a/arch/x86/include/asm/intel_rdt_common.h b/arch/x86/include/asm/intel_rdt_common.h
index e11ed5e..544acaa 100644
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ b/arch/x86/include/asm/intel_rdt_common.h
@@ -18,12 +18,23 @@
  */
 struct intel_pqr_state {
 	u32			rmid;
+	u32			next_task_rmid;
 	u32			closid;
 	int			rmid_usecnt;
 };
 
 DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
 
+u32 __get_rmid(int domain);
+bool __rmid_valid(u32 rmid);
+void alloc_needed_pkg_rmid(u32 *cqm_rmid);
+struct cgrp_cqm_info *cqminfo_from_tsk(struct task_struct *tsk);
+
+extern struct cgrp_cqm_info cqm_rootcginfo;
+
+DECLARE_STATIC_KEY_FALSE(cqm_enable_key);
+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
+
 /**
  * struct cgrp_cqm_info - perf_event cgroup metadata for cqm
  * @cont_mon     Continuous monitoring flag
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index eaf1005..ec4beed 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -22,6 +22,7 @@
 #include <asm/nops.h>
 #include <asm/special_insns.h>
 #include <asm/fpu/types.h>
+#include <asm/intel_pqr_common.h>
 
 #include <linux/personality.h>
 #include <linux/cache.h>
@@ -903,4 +904,7 @@ static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves)
 
 void stop_this_cpu(void *dummy);
 void df_debug(struct pt_regs *regs, long error_code);
+
+#define finish_arch_pre_lock_switch intel_rdt_sched_in
+
 #endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 5200001..d354e84 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
+obj-$(CONFIG_INTEL_RDT)		+= intel_rdt_common.o
 obj-$(CONFIG_INTEL_RDT_A)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
 
 obj-$(CONFIG_X86_MCE)			+= mcheck/
diff --git a/arch/x86/kernel/cpu/intel_rdt_common.c b/arch/x86/kernel/cpu/intel_rdt_common.c
new file mode 100644
index 0000000..c3c50cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_rdt_common.c
@@ -0,0 +1,98 @@
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/cacheinfo.h>
+#include <linux/cpuhotplug.h>
+#include <asm/intel-family.h>
+#include <asm/intel_rdt.h>
+
+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Both functions which modify the state
+ * (intel_cqm_event_start and intel_cqm_event_stop) are called with
+ * interrupts disabled, which is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+
+#define pkg_id	topology_physical_package_id(smp_processor_id())
+
+#ifdef CONFIG_INTEL_RDT_M
+static inline int get_cgroup_sched_rmid(void)
+{
+#ifdef CONFIG_CGROUP_PERF
+	struct cgrp_cqm_info *ccinfo = NULL;
+
+	ccinfo = cqminfo_from_tsk(current);
+
+	if (!ccinfo)
+		return 0;
+
+	/*
+	 * A cgroup is always monitoring for itself or
+	 * for an ancestor(default is root).
+	 */
+	if (ccinfo->mon_enabled) {
+		alloc_needed_pkg_rmid(ccinfo->rmid);
+		return ccinfo->rmid[pkg_id];
+	} else {
+		alloc_needed_pkg_rmid(ccinfo->mfa->rmid);
+		return ccinfo->mfa->rmid[pkg_id];
+	}
+#endif
+
+	return 0;
+}
+
+static inline int get_sched_in_rmid(void)
+{
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+	u32 rmid = 0;
+
+	rmid = state->next_task_rmid;
+
+	return rmid ? rmid : get_cgroup_sched_rmid();
+}
+#endif
+
+/*
+ * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
+ *
+ * Following considerations are made so that this has minimal impact
+ * on scheduler hot path:
+ * - This will stay as no-op unless we are running on an Intel SKU
+ *   which supports resource control and we enable by mounting the
+ *   resctrl file system or it supports resource monitoring.
+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only
+ *   when a task with a different CLOSid/RMID is scheduled in.
+ */
+void __intel_rdt_sched_in(void)
+{
+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+	int closid = 0;
+	u32 rmid = 0;
+
+#ifdef CONFIG_INTEL_RDT_A
+	if (static_branch_likely(&rdt_enable_key)) {
+		/*
+		 * If this task has a closid assigned, use it.
+		 * Else use the closid assigned to this cpu.
+		 */
+		closid = current->closid;
+		if (closid == 0)
+			closid = this_cpu_read(cpu_closid);
+	}
+#endif
+
+#ifdef CONFIG_INTEL_RDT_M
+	if (static_branch_unlikely(&cqm_enable_key))
+		rmid = get_sched_in_rmid();
+#endif
+
+	if (closid != state->closid || rmid != state->rmid) {
+
+		state->closid = closid;
+		state->rmid = rmid;
+		wrmsr(MSR_IA32_PQR_ASSOC, rmid, closid);
+	}
+}
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index 8af04af..8b6b429 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -206,7 +206,7 @@ static void rdt_update_cpu_closid(void *closid)
 	 * executing task might have its own closid selected. Just reuse
 	 * the context switch code.
 	 */
-	intel_rdt_sched_in();
+	__intel_rdt_sched_in();
 }
 
 /*
@@ -328,7 +328,7 @@ static void move_myself(struct callback_head *head)
 
 	preempt_disable();
 	/* update PQR_ASSOC MSR to make resource group go into effect */
-	intel_rdt_sched_in();
+	__intel_rdt_sched_in();
 	preempt_enable();
 
 	kfree(callback);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a0ac3e8..d0d7441 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,7 +53,6 @@
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
 #include <asm/vm86.h>
-#include <asm/intel_rdt.h>
 
 void __show_regs(struct pt_regs *regs, int all)
 {
@@ -297,8 +296,5 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 
 	this_cpu_write(current_task, next_p);
 
-	/* Load the Intel cache allocation PQR MSR. */
-	intel_rdt_sched_in();
-
 	return prev_p;
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a61e141..a76b65e 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,7 +49,6 @@
 #include <asm/switch_to.h>
 #include <asm/xen/hypervisor.h>
 #include <asm/vdso.h>
-#include <asm/intel_rdt.h>
 
 __visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
@@ -477,9 +476,6 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 			loadsegment(ss, __KERNEL_DS);
 	}
 
-	/* Load the Intel cache allocation PQR MSR. */
-	intel_rdt_sched_in();
-
 	return prev_p;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c56fb57..bf970ab 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2767,6 +2767,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	prev_state = prev->state;
 	vtime_task_switch(prev);
 	perf_event_task_sched_in(prev, current);
+	finish_arch_pre_lock_switch();
 	finish_lock_switch(rq, prev);
 	finish_arch_post_lock_switch();
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7b34c78..61b47a5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1121,6 +1121,9 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
+#ifndef finish_arch_pre_lock_switch
+# define finish_arch_pre_lock_switch()	do { } while (0)
+#endif
 #ifndef finish_arch_post_lock_switch
 # define finish_arch_post_lock_switch()	do { } while (0)
 #endif
-- 
1.9.1