lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1329196009-25268-4-git-send-email-konrad.wilk@oracle.com>
Date:	Tue, 14 Feb 2012 00:06:49 -0500
From:	Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
To:	ke.yu@...el.com, kevin.tian@...el.com, JBeulich@...ell.com,
	linux-kernel@...r.kernel.org, linux-acpi@...r.kernel.org
Cc:	xen-devel@...ts.xensource.com,
	Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
Subject: [PATCH 3/3] xen/acpi/cpufreq: Provide a driver that passes struct acpi_processor data to the hypervisor.

The ACPI processor processes the _Pxx and the _Cx state information
which are populated in the 'struct acpi_processor' per-cpu structure.
We read the contents of that structure and pass it up the Xen hypervisor.

The ACPI processor driver, along with the cpufreq driver, does all the
heavy lifting for us (filtering, calling ACPI functions, etc.) so that the
contents are correct. After we are done parsing the information, we wait in
case hotplug CPUs get onlined and then pass that information to the hypervisor.

Note: There is no good way to deal with dom0_max_vcpus=X parameter where
the initial domain is limited to a smaller subset of CPUs.

A lot of the code to pass the information to the hypervisor was copied from
https://lkml.org/lkml/2011/11/30/245 so many thanks to

 Yu Ke <ke.yu@...el.com>
 Tian Kevin <kevin.tian@...el.com>

[TODO: Should their names be in the code as Authors? Or just have the
same comment as what I mentioned in the commit?]

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
---
 drivers/xen/Kconfig             |   14 ++
 drivers/xen/Makefile            |    2 +-
 drivers/xen/processor-harvest.c |  397 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 412 insertions(+), 1 deletions(-)
 create mode 100644 drivers/xen/processor-harvest.c

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index a1ced52..126183f 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -178,4 +178,18 @@ config XEN_PRIVCMD
 	depends on XEN
 	default m
 
+config XEN_PROCESSOR_HARVEST
+	tristate "Processor passthrough driver for Xen"
+	depends on XEN
+	depends on ACPI_PROCESSOR
+	depends on X86
+	depends on CPU_FREQ
+	help
+	  This driver parses the processor structure and passes the information
+	  to the Xen hypervisor. It is used to allow the Xen hypervisor to have the
+	  full power management data and be able to select proper Cx and Pxx states.
+
+	  This driver should be loaded after the ACPI processor and cpufreq
+	  drivers have been loaded. If unsure, select M here.
+
 endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index aa31337..856cfc6 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -20,7 +20,7 @@ obj-$(CONFIG_SWIOTLB_XEN)		+= swiotlb-xen.o
 obj-$(CONFIG_XEN_DOM0)			+= pci.o
 obj-$(CONFIG_XEN_PCIDEV_BACKEND)	+= xen-pciback/
 obj-$(CONFIG_XEN_PRIVCMD)		+= xen-privcmd.o
-
+obj-$(CONFIG_XEN_PROCESSOR_HARVEST)	+= processor-harvest.o
 xen-evtchn-y				:= evtchn.o
 xen-gntdev-y				:= gntdev.o
 xen-gntalloc-y				:= gntalloc.o
diff --git a/drivers/xen/processor-harvest.c b/drivers/xen/processor-harvest.c
new file mode 100644
index 0000000..50681e2
--- /dev/null
+++ b/drivers/xen/processor-harvest.c
@@ -0,0 +1,397 @@
+/*
+ * Copyright 2012 by Oracle Inc
+ * Author: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+/*
+ *  Known limitations
+ *
+ * The driver can only handle the CPUs covered by for_each_possible_cpu().
+ * Meaning if you boot with dom0_max_vcpus=X it will _only_ parse up to X
+ * processors.
+ */
+
+#include <linux/cpumask.h>
+#include <linux/cpufreq.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <acpi/acpi_bus.h>
+#include <acpi/acpi_drivers.h>
+#include <acpi/processor.h>
+
+#include <xen/interface/platform.h>
+#include <asm/xen/hypercall.h>
+
+/* Name used in log messages and for the harvesting kthread. */
+#define DRV_NAME "processor-passthrough-xen"
+MODULE_AUTHOR("Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>");
+MODULE_DESCRIPTION("ACPI Power Management driver to pass Cx and Pxx data to Xen hypervisor");
+MODULE_LICENSE("GPL");
+
+
+/* "off=1" still parses the ACPI data but skips the actual hypercall. */
+MODULE_PARM_DESC(off, "Inhibit the hypercall.");
+static int no_hypercall;
+module_param_named(off, no_hypercall, int, 0400);
+
+/* Bitmap of CPUs already pushed to the hypervisor; all readers and
+ * writers hold processors_done_mutex (see xen_process_data). */
+static DEFINE_MUTEX(processors_done_mutex);
+static DECLARE_BITMAP(processors_done, NR_CPUS);
+
+/* Re-check interval while waiting for the ACPI/cpufreq drivers to load. */
+#define POLL_TIMER msecs_to_jiffies(5000 /* 5 sec */)
+static struct task_struct *xen_processor_thread;
+
+/*
+ * Copy the ACPI Cx (C-state) data for one processor into the Xen
+ * representation and push it via the XENPF_set_processor_pminfo
+ * platform hypercall.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure, -EINVAL when no
+ * valid Cx state was found, or the hypercall's error code.
+ */
+static int xen_push_cxx_to_hypervisor(struct acpi_processor *_pr)
+{
+	struct xen_platform_op op = {
+		.cmd			= XENPF_set_processor_pminfo,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.set_pminfo.id	= _pr->acpi_id,
+		.u.set_pminfo.type	= XEN_PM_CX,
+	};
+	struct xen_processor_cx *xen_cx, *xen_cx_states = NULL;
+	struct acpi_processor_cx *cx;
+	int i, ok, ret = 0;
+
+	xen_cx_states = kcalloc(_pr->power.count,
+				sizeof(struct xen_processor_cx), GFP_KERNEL);
+	if (!xen_cx_states)
+		return -ENOMEM;
+
+	/* ACPI C-states are 1-based: states[0] is not used. */
+	for (ok = 0, i = 1; i <= _pr->power.count; i++) {
+		cx = &_pr->power.states[i];
+		if (!cx->valid)
+			continue;
+
+		xen_cx = &(xen_cx_states[ok++]);
+
+		xen_cx->reg.space_id = ACPI_ADR_SPACE_SYSTEM_IO;
+		if (cx->entry_method == ACPI_CSTATE_SYSTEMIO) {
+			xen_cx->reg.bit_width = 8;
+			xen_cx->reg.bit_offset = 0;
+			xen_cx->reg.access_size = 1;
+		} else {
+			xen_cx->reg.space_id = ACPI_ADR_SPACE_FIXED_HARDWARE;
+			if (cx->entry_method == ACPI_CSTATE_FFH) {
+				/* NATIVE_CSTATE_BEYOND_HALT */
+				xen_cx->reg.bit_offset = 2;
+				xen_cx->reg.bit_width = 1; /* VENDOR_INTEL */
+			}
+			xen_cx->reg.access_size = 0;
+		}
+		xen_cx->reg.address = cx->address;
+
+		xen_cx->type = cx->type;
+		xen_cx->latency = cx->latency;
+		xen_cx->power = cx->power;
+
+		xen_cx->dpcnt = 0;
+		set_xen_guest_handle(xen_cx->dp, NULL);
+		/* pr_debug() is already compiled out unless DEBUG (or
+		 * dynamic debug) is enabled - no #ifdef wrapper needed. */
+		pr_debug(DRV_NAME ": CX: ID:%d [C%d:%s] entry:%d\n", _pr->acpi_id,
+			 cx->type, cx->desc, cx->entry_method);
+	}
+	if (!ok) {
+		pr_err(DRV_NAME ": No available Cx info for cpu %d\n", _pr->acpi_id);
+		kfree(xen_cx_states);
+		return -EINVAL;
+	}
+	op.u.set_pminfo.power.count = ok;
+	op.u.set_pminfo.power.flags.bm_control = _pr->flags.bm_control;
+	op.u.set_pminfo.power.flags.bm_check = _pr->flags.bm_check;
+	op.u.set_pminfo.power.flags.has_cst = _pr->flags.has_cst;
+	op.u.set_pminfo.power.flags.power_setup_done =
+		_pr->flags.power_setup_done;
+
+	set_xen_guest_handle(op.u.set_pminfo.power.states, xen_cx_states);
+
+	if (!no_hypercall && xen_initial_domain())
+		ret = HYPERVISOR_dom0_op(&op);
+
+	if (ret) {
+		pr_err(DRV_NAME ": Failed to send to hypervisor (rc:%d)\n", ret);
+		print_hex_dump_bytes("OP: ", DUMP_PREFIX_NONE, &op,
+				     sizeof(struct xen_platform_op));
+		/* Dump only the 'ok' entries we actually populated, not the
+		 * full (zeroed) allocation of power.count entries. */
+		print_hex_dump_bytes("Cx: ", DUMP_PREFIX_NONE, xen_cx_states,
+				     ok * sizeof(struct xen_processor_cx));
+	}
+	kfree(xen_cx_states);
+
+	return ret;
+}
+
+
+
+/*
+ * Build a Xen Pxx (performance state) array from the ACPI _PSS data.
+ *
+ * Returns a kcalloc'd array of state_count entries (freed by the
+ * caller), or ERR_PTR(-ENOMEM) on allocation failure.  Also records the
+ * state count in @xen_perf.
+ */
+static struct xen_processor_px *xen_copy_pss_data(struct acpi_processor *_pr,
+						  struct xen_processor_performance *xen_perf)
+{
+	struct xen_processor_px *xen_states = NULL;
+
+	xen_states = kcalloc(_pr->performance->state_count,
+			     sizeof(struct xen_processor_px), GFP_KERNEL);
+	if (!xen_states)
+		return ERR_PTR(-ENOMEM);
+
+	xen_perf->state_count = _pr->performance->state_count;
+
+	/* Fortunately for us both structures have the same size, so the
+	 * whole array can be copied in one go instead of element by
+	 * element. */
+	BUILD_BUG_ON(sizeof(struct xen_processor_px) !=
+		     sizeof(struct acpi_processor_px));
+	memcpy(xen_states, _pr->performance->states,
+	       _pr->performance->state_count *
+	       sizeof(struct acpi_processor_px));
+
+	return xen_states;
+}
+/*
+ * Fill in the _PSD (P-state dependency) portion of @xen_perf from the
+ * ACPI data, or synthesize a fallback on AMD systems.
+ *
+ * Returns 0 on success, -EINVAL when no dependency data is available
+ * and the AMD fallback does not apply.
+ */
+static int xen_copy_psd_data(struct acpi_processor *_pr,
+			     struct xen_processor_performance *xen_perf)
+{
+	BUILD_BUG_ON(sizeof(struct xen_psd_package) !=
+		     sizeof(struct acpi_psd_package));
+
+	if (_pr->performance->shared_type != CPUFREQ_SHARED_TYPE_NONE) {
+		xen_perf->shared_type = _pr->performance->shared_type;
+
+		memcpy(&(xen_perf->domain_info), &(_pr->performance->domain_info),
+		       sizeof(struct acpi_psd_package));
+	} else {
+		/* The fallback below is AMD-specific; everyone else fails. */
+		if ((&cpu_data(0))->x86_vendor != X86_VENDOR_AMD)
+			return -EINVAL;
+
+		/* On AMD, the powernow-k8 is loaded before acpi_cpufreq
+		 * meaning that acpi_processor_preregister_performance never
+		 * gets called which would parse the _CST.
+		 * NOTE(review): _CST is C-state data but this path handles
+		 * _PSD dependency data - confirm which object is meant.
+		 */
+		xen_perf->shared_type = CPUFREQ_SHARED_TYPE_ALL;
+		xen_perf->domain_info.num_processors = num_online_cpus();
+	}
+	return 0;
+}
+/*
+ * Copy an ACPI _PCT register description (@pct, source) into its Xen
+ * counterpart (@_pct, destination), field by field.  Always returns 0.
+ */
+static int xen_copy_pct_data(struct acpi_pct_register *pct,
+			     struct xen_pct_register *_pct)
+{
+	/* It would be nice if you could just do 'memcpy(pct, _pct)' but
+	 * sadly the Xen structure did not have the proper padding
+	 * so the descriptor field takes two (_pct) bytes instead of one (pct).
+	 */
+	_pct->descriptor = pct->descriptor;
+	_pct->length = pct->length;
+	_pct->space_id = pct->space_id;
+	_pct->bit_width = pct->bit_width;
+	_pct->bit_offset = pct->bit_offset;
+	_pct->reserved = pct->reserved;
+	_pct->address = pct->address;
+	return 0;
+}
+/*
+ * Gather the _PPC, _PCT, _PSS and _PSD data for one processor and push
+ * it to the hypervisor via XENPF_set_processor_pminfo.
+ *
+ * Each piece that is copied successfully is flagged in xen_perf->flags
+ * (XEN_PX_PPC/PCT/PSS/PSD); pieces that fail are simply left out.
+ * Returns 0 on success or the hypercall's error code.
+ */
+static int xen_push_pxx_to_hypervisor(struct acpi_processor *_pr)
+{
+	int ret = 0;
+	struct xen_platform_op op = {
+		.cmd			= XENPF_set_processor_pminfo,
+		.interface_version	= XENPF_INTERFACE_VERSION,
+		.u.set_pminfo.id	= _pr->acpi_id,
+		.u.set_pminfo.type	= XEN_PM_PX,
+	};
+	struct xen_processor_performance *xen_perf;
+	struct xen_processor_px *xen_states = NULL;
+
+	xen_perf = &op.u.set_pminfo.perf;
+
+	xen_perf->platform_limit = _pr->performance_platform_limit;
+	xen_perf->flags |= XEN_PX_PPC;
+	xen_copy_pct_data(&(_pr->performance->control_register),
+			  &xen_perf->control_register);
+	xen_copy_pct_data(&(_pr->performance->status_register),
+			  &xen_perf->status_register);
+	xen_perf->flags |= XEN_PX_PCT;
+	/* _PSS copy may return ERR_PTR(-ENOMEM); then PSS is just omitted. */
+	xen_states = xen_copy_pss_data(_pr, xen_perf);
+	if (!IS_ERR_OR_NULL(xen_states)) {
+		set_xen_guest_handle(xen_perf->states, xen_states);
+		xen_perf->flags |= XEN_PX_PSS;
+	}
+	if (!xen_copy_psd_data(_pr, xen_perf))
+		xen_perf->flags |= XEN_PX_PSD;
+
+	if (!no_hypercall && xen_initial_domain())
+		ret = HYPERVISOR_dom0_op(&op);
+
+	if (ret) {
+		pr_err(DRV_NAME ": Failed to send to hypervisor (rc:%d)\n", ret);
+		print_hex_dump_bytes("OP: ", DUMP_PREFIX_NONE, &op,
+				     sizeof(struct xen_platform_op));
+		if (!IS_ERR_OR_NULL(xen_states))
+			print_hex_dump_bytes("Pxx:", DUMP_PREFIX_NONE, xen_states,
+				     _pr->performance->state_count *
+				     sizeof(struct xen_processor_px));
+	}
+	/* The IS_ERR_OR_NULL guard keeps ERR_PTR values out of kfree(). */
+	if (!IS_ERR_OR_NULL(xen_states))
+		kfree(xen_states);
+
+	return ret;
+}
+/*
+ * We read out the struct acpi_processor, and serialize access
+ * so that there is only one caller. This is so that we won't
+ * race with the CPU hotplug code.
+ *
+ * Returns -EBUSY when this CPU was already processed, otherwise the
+ * first error from the Cx/Pxx pushes (0 on full success).  The CPU is
+ * marked done even on error so it is not retried forever.
+ */
+static int xen_process_data(struct acpi_processor *_pr, int cpu)
+{
+	int err = 0, ret;
+
+	mutex_lock(&processors_done_mutex);
+	if (cpumask_test_cpu(cpu, to_cpumask(processors_done))) {
+		mutex_unlock(&processors_done_mutex);
+		return -EBUSY;
+	}
+	if (_pr->flags.power)
+		err = xen_push_cxx_to_hypervisor(_pr);
+
+	if (_pr->performance && _pr->performance->states) {
+		/* Don't OR two errno values together - that yields a
+		 * meaningless code.  Keep the first failure instead. */
+		ret = xen_push_pxx_to_hypervisor(_pr);
+		if (!err)
+			err = ret;
+	}
+
+	cpumask_set_cpu(cpu, to_cpumask(processors_done));
+	mutex_unlock(&processors_done_mutex);
+	return err;
+}
+
+/*
+ * Walk every online CPU and push its Cx/Pxx data to the hypervisor.
+ *
+ * Returns -EBUSY while cpufreq has no policy yet (cpufreq driver not
+ * loaded), 0 otherwise.  Per-CPU push errors are deliberately ignored.
+ */
+static int xen_processor_check(void)
+{
+	struct cpufreq_policy *policy;
+	int cpu;
+
+	/* NOTE(review): smp_processor_id() in a preemptible kthread can
+	 * trigger a debug splat; raw_smp_processor_id() (or any CPU -
+	 * this is only a "is cpufreq up yet" probe) may be intended. */
+	policy = cpufreq_cpu_get(smp_processor_id());
+	if (!policy)
+		return -EBUSY;
+
+	/* Hold off CPU hotplug while we iterate the online mask. */
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		struct acpi_processor *_pr;
+
+		_pr = per_cpu(processors, cpu);
+		if (!_pr)
+			continue;
+
+		(void)xen_process_data(_pr, cpu);
+	}
+	put_online_cpus();
+
+	cpufreq_cpu_put(policy);
+	return 0;
+}
+/*
+ * The purpose of this timer/thread is to wait for the ACPI processor
+ * and CPUfreq drivers to load up and parse the Pxx and Cxx information
+ * before we attempt to read it.
+ */
+/* Timer callback: @arg is the polling kthread; kick it awake. */
+static void xen_processor_timeout(unsigned long arg)
+{
+	struct task_struct *waiter = (struct task_struct *)arg;
+
+	wake_up_process(waiter);
+}
+/*
+ * Kthread body: poll every POLL_TIMER until xen_processor_check()
+ * returns something other than -EBUSY (i.e. cpufreq has registered and
+ * the data was harvested), or until the thread is asked to stop.
+ */
+static int xen_processor_thread_func(void *dummy)
+{
+	struct timer_list timer;
+
+	/* Deferrable timer: firing late is fine, this is only a poll. */
+	setup_deferrable_timer_on_stack(&timer, xen_processor_timeout,
+					(unsigned long)current);
+
+	do {
+		/* Mark ourselves sleeping *before* arming the timer so a
+		 * wakeup between mod_timer() and schedule() is not lost. */
+		__set_current_state(TASK_INTERRUPTIBLE);
+		mod_timer(&timer, jiffies + POLL_TIMER);
+		schedule();
+		if (xen_processor_check() != -EBUSY)
+			break;
+	} while (!kthread_should_stop());
+
+	del_timer_sync(&timer);
+	destroy_timer_on_stack(&timer);
+	return 0;
+}
+
+/* Hotplug notifier: push PM data for a CPU as soon as it comes online. */
+static int xen_cpu_soft_notify(struct notifier_block *nfb,
+			       unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+	struct acpi_processor *_pr;
+
+	if (action != CPU_ONLINE)
+		return NOTIFY_OK;
+
+	_pr = per_cpu(processors, cpu);
+	if (_pr)
+		(void)xen_process_data(_pr, cpu);
+
+	return NOTIFY_OK;
+}
+
+/* Registered in the init path; fires xen_cpu_soft_notify on hotplug. */
+static struct notifier_block xen_cpu_notifier = {
+	.notifier_call = xen_cpu_soft_notify,
+	.priority = -1, /* Be the last one */
+};
+
+/*
+ * Decide whether this module is useful on this system: we must be the
+ * initial (dom0) domain, the FADT must advertise an SMI command port,
+ * and the CPU must be an Intel with Enhanced SpeedStep (EST) or an AMD
+ * whose P-state limit MSR reads non-zero.
+ * Returns 0 when all prerequisites hold, -ENODEV otherwise.
+ */
+static int __init check_prereq(void)
+{
+	struct cpuinfo_x86 *c = &cpu_data(0);
+
+	if (!xen_initial_domain())
+		return -ENODEV;
+
+	/* NOTE(review): presumably no SMI command means no usable ACPI PM
+	 * data to harvest - confirm the intent of this gate. */
+	if (!acpi_gbl_FADT.smi_command)
+		return -ENODEV;
+
+	if (c->x86_vendor == X86_VENDOR_INTEL) {
+		if (!cpu_has(c, X86_FEATURE_EST))
+			return -ENODEV;
+
+		return 0;
+	}
+	if (c->x86_vendor == X86_VENDOR_AMD) {
+		u32 hi = 0, lo = 0;
+		/* Copied from powernow-k8.h, can't include ../cpufreq/powernow
+		 * as we get compile warnings for the static functions.
+		 */
+#define MSR_PSTATE_CUR_LIMIT    0xc0010061 /* pstate current limit MSR */
+		rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
+
+		/* If the MSR cannot provide the data, the powernow-k8
+		 * won't process the data properly either.
+		 */
+		if (hi || lo)
+			return 0;
+	}
+	return -ENODEV;
+}
+
+/*
+ * Module entry point: verify prerequisites and spawn the polling
+ * kthread, which waits for the ACPI/cpufreq drivers before harvesting.
+ */
+static int __init xen_processor_passthrough_init(void)
+{
+	int rc = check_prereq();
+
+	if (rc)
+		return rc;
+
+	xen_processor_thread = kthread_run(xen_processor_thread_func, NULL, DRV_NAME);
+	if (IS_ERR(xen_processor_thread)) {
+		/* Propagate the real error instead of a hard-coded -ENOMEM,
+		 * and clear the stale ERR_PTR so the exit path can never
+		 * hand it to kthread_stop(). */
+		rc = PTR_ERR(xen_processor_thread);
+		xen_processor_thread = NULL;
+		pr_err(DRV_NAME ": Failed to create thread. Aborting.\n");
+		return rc;
+	}
+	register_hotcpu_notifier(&xen_cpu_notifier);
+	return 0;
+}
+/* Module teardown: unhook the hotplug notifier and stop the kthread. */
+static void __exit xen_processor_passthrough_exit(void)
+{
+	unregister_hotcpu_notifier(&xen_cpu_notifier);
+	/* Thread may still be polling if the drivers never showed up. */
+	if (xen_processor_thread)
+		kthread_stop(xen_processor_thread);
+}
+/* late_initcall: run after the ACPI processor and cpufreq drivers. */
+late_initcall(xen_processor_passthrough_init);
+module_exit(xen_processor_passthrough_exit);
-- 
1.7.9.48.g85da4d

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ