linux-kernel - [PATCH 2/5] x86/xen: split smp.c for PV and PVHVM guests

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20170224161440.2136-3-vkuznets@redhat.com>
Date:   Fri, 24 Feb 2017 17:14:37 +0100
From:   Vitaly Kuznetsov <vkuznets@...hat.com>
To:     xen-devel@...ts.xenproject.org
Cc:     x86@...nel.org, linux-kernel@...r.kernel.org,
        Boris Ostrovsky <boris.ostrovsky@...cle.com>,
        Juergen Gross <jgross@...e.com>,
        Andrew Jones <drjones@...hat.com>
Subject: [PATCH 2/5] x86/xen: split smp.c for PV and PVHVM guests

More or less mechanically split smp.c into 3 files. The XEN_PV_SMP and
XEN_PVHVM_SMP config options are added to support the change.

Signed-off-by: Vitaly Kuznetsov <vkuznets@...hat.com>
---
 arch/x86/xen/Kconfig        |   8 +
 arch/x86/xen/Makefile       |   3 +
 arch/x86/xen/enlighten_pv.c |   9 +
 arch/x86/xen/smp.c          | 531 +-------------------------------------------
 arch/x86/xen/smp.h          |  23 ++
 arch/x86/xen/smp_hvm.c      |  58 +++++
 arch/x86/xen/smp_pv.c       | 499 +++++++++++++++++++++++++++++++++++++++++
 7 files changed, 611 insertions(+), 520 deletions(-)
 create mode 100644 arch/x86/xen/smp_hvm.c
 create mode 100644 arch/x86/xen/smp_pv.c

diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c387560..9ebfd77 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -21,6 +21,10 @@ config XEN_PV
 	help
 	  Support running as a Xen PV guest.
 
+config XEN_PV_SMP
+	def_bool y
+	depends on XEN_PV && SMP
+
 config XEN_DOM0
 	bool "Xen PV Dom0 support"
 	default y
@@ -37,6 +41,10 @@ config XEN_PVHVM
 	help
 	  Support running as a Xen PVHVM guest.
 
+config XEN_PVHVM_SMP
+	def_bool y
+	depends on XEN_PVHVM && SMP
+
 config XEN_512GB
 	bool "Limit Xen pv-domain memory to 512GB"
 	depends on XEN_PV && X86_64
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 750727b..ed6f126 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -22,6 +22,9 @@ obj-$(CONFIG_XEN_PVH)		+= enlighten_pvh.o
 obj-$(CONFIG_EVENT_TRACING) += trace.o
 
 obj-$(CONFIG_SMP)		+= smp.o
+obj-$(CONFIG_XEN_PV_SMP)	+= smp_pv.o
+obj-$(CONFIG_XEN_PVHVM_SMP)	+= smp_hvm.o
+
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
 obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
 obj-$(CONFIG_XEN_DOM0)		+= vga.o
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index b9ff23c..acfd896 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -1478,12 +1478,21 @@ static int xen_cpu_up_prepare_pv(unsigned int cpu)
 		     cpu, rc);
 		return rc;
 	}
+
+	rc = xen_smp_intr_init_pv(cpu);
+	if (rc) {
+		WARN(1, "xen_smp_intr_init_pv() for CPU %d failed: %d\n",
+		     cpu, rc);
+		return rc;
+	}
+
 	return 0;
 }
 
 static int xen_cpu_dead_pv(unsigned int cpu)
 {
 	xen_smp_intr_free(cpu);
+	xen_smp_intr_free_pv(cpu);
 
 	xen_teardown_timer(cpu);
 
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 14fd7f3..dd1150e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -1,62 +1,21 @@
-/*
- * Xen SMP support
- *
- * This file implements the Xen versions of smp_ops.  SMP under Xen is
- * very straightforward.  Bringing a CPU up is simply a matter of
- * loading its initial context and setting it running.
- *
- * IPIs are handled through the Xen event mechanism.
- *
- * Because virtual CPUs can be scheduled onto any real CPU, there's no
- * useful topology information for the kernel to make use of.  As a
- * result, all CPUs are treated as if they're single-core and
- * single-threaded.
- */
-#include <linux/sched.h>
-#include <linux/err.h>
-#include <linux/slab.h>
 #include <linux/smp.h>
-#include <linux/irq_work.h>
-#include <linux/tick.h>
-
-#include <asm/paravirt.h>
-#include <asm/desc.h>
-#include <asm/pgtable.h>
-#include <asm/cpu.h>
-
-#include <xen/interface/xen.h>
-#include <xen/interface/vcpu.h>
-#include <xen/interface/xenpmu.h>
-
-#include <asm/xen/interface.h>
-#include <asm/xen/hypercall.h>
+#include <linux/slab.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
 
-#include <xen/xen.h>
-#include <xen/page.h>
 #include <xen/events.h>
 
 #include <xen/hvc-console.h>
 #include "xen-ops.h"
-#include "mmu.h"
 #include "smp.h"
-#include "pmu.h"
-
-cpumask_var_t xen_cpu_initialized_map;
 
-struct xen_common_irq {
-	int irq;
-	char *name;
-};
 static DEFINE_PER_CPU(struct xen_common_irq, xen_resched_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfunc_irq) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_callfuncsingle_irq) = { .irq = -1 };
-static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
 static DEFINE_PER_CPU(struct xen_common_irq, xen_debug_irq) = { .irq = -1 };
-static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
 
 static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
 static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
-static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
 
 /*
  * Reschedule call back.
@@ -69,42 +28,6 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-static void cpu_bringup(void)
-{
-	int cpu;
-
-	cpu_init();
-	touch_softlockup_watchdog();
-	preempt_disable();
-
-	/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
-	if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
-		xen_enable_sysenter();
-		xen_enable_syscall();
-	}
-	cpu = smp_processor_id();
-	smp_store_cpu_info(cpu);
-	cpu_data(cpu).x86_max_cores = 1;
-	set_cpu_sibling_map(cpu);
-
-	xen_setup_cpu_clockevents();
-
-	notify_cpu_starting(cpu);
-
-	set_cpu_online(cpu, true);
-
-	cpu_set_state_online(cpu);  /* Implies full memory barrier. */
-
-	/* We can take interrupts now: we're officially "up". */
-	local_irq_enable();
-}
-
-asmlinkage __visible void cpu_bringup_and_idle(void)
-{
-	cpu_bringup();
-	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-}
-
 void xen_smp_intr_free(unsigned int cpu)
 {
 	if (per_cpu(xen_resched_irq, cpu).irq >= 0) {
@@ -132,27 +55,12 @@ void xen_smp_intr_free(unsigned int cpu)
 		kfree(per_cpu(xen_callfuncsingle_irq, cpu).name);
 		per_cpu(xen_callfuncsingle_irq, cpu).name = NULL;
 	}
-	if (xen_hvm_domain())
-		return;
-
-	if (per_cpu(xen_irq_work, cpu).irq >= 0) {
-		unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
-		per_cpu(xen_irq_work, cpu).irq = -1;
-		kfree(per_cpu(xen_irq_work, cpu).name);
-		per_cpu(xen_irq_work, cpu).name = NULL;
-	}
+}
 
-	if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
-		unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
-		per_cpu(xen_pmu_irq, cpu).irq = -1;
-		kfree(per_cpu(xen_pmu_irq, cpu).name);
-		per_cpu(xen_pmu_irq, cpu).name = NULL;
-	}
-};
 int xen_smp_intr_init(unsigned int cpu)
 {
 	int rc;
-	char *resched_name, *callfunc_name, *debug_name, *pmu_name;
+	char *resched_name, *callfunc_name, *debug_name;
 
 	resched_name = kasprintf(GFP_KERNEL, "resched%d", cpu);
 	rc = bind_ipi_to_irqhandler(XEN_RESCHEDULE_VECTOR,
@@ -199,37 +107,6 @@ int xen_smp_intr_init(unsigned int cpu)
 	per_cpu(xen_callfuncsingle_irq, cpu).irq = rc;
 	per_cpu(xen_callfuncsingle_irq, cpu).name = callfunc_name;
 
-	/*
-	 * The IRQ worker on PVHVM goes through the native path and uses the
-	 * IPI mechanism.
-	 */
-	if (xen_hvm_domain())
-		return 0;
-
-	callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
-	rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
-				    cpu,
-				    xen_irq_work_interrupt,
-				    IRQF_PERCPU|IRQF_NOBALANCING,
-				    callfunc_name,
-				    NULL);
-	if (rc < 0)
-		goto fail;
-	per_cpu(xen_irq_work, cpu).irq = rc;
-	per_cpu(xen_irq_work, cpu).name = callfunc_name;
-
-	if (is_xen_pmu(cpu)) {
-		pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
-		rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
-					     xen_pmu_irq_handler,
-					     IRQF_PERCPU|IRQF_NOBALANCING,
-					     pmu_name, NULL);
-		if (rc < 0)
-			goto fail;
-		per_cpu(xen_pmu_irq, cpu).irq = rc;
-		per_cpu(xen_pmu_irq, cpu).name = pmu_name;
-	}
-
 	return 0;
 
  fail:
@@ -237,345 +114,13 @@ int xen_smp_intr_init(unsigned int cpu)
 	return rc;
 }
 
-#ifdef CONFIG_XEN_PV
-static void __init xen_fill_possible_map(void)
-{
-	int i, rc;
-
-	if (xen_initial_domain())
-		return;
-
-	for (i = 0; i < nr_cpu_ids; i++) {
-		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-		if (rc >= 0) {
-			num_processors++;
-			set_cpu_possible(i, true);
-		}
-	}
-}
-
-static void __init xen_filter_cpu_maps(void)
-{
-	int i, rc;
-	unsigned int subtract = 0;
-
-	if (!xen_initial_domain())
-		return;
-
-	num_processors = 0;
-	disabled_cpus = 0;
-	for (i = 0; i < nr_cpu_ids; i++) {
-		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
-		if (rc >= 0) {
-			num_processors++;
-			set_cpu_possible(i, true);
-		} else {
-			set_cpu_possible(i, false);
-			set_cpu_present(i, false);
-			subtract++;
-		}
-	}
-#ifdef CONFIG_HOTPLUG_CPU
-	/* This is akin to using 'nr_cpus' on the Linux command line.
-	 * Which is OK as when we use 'dom0_max_vcpus=X' we can only
-	 * have up to X, while nr_cpu_ids is greater than X. This
-	 * normally is not a problem, except when CPU hotplugging
-	 * is involved and then there might be more than X CPUs
-	 * in the guest - which will not work as there is no
-	 * hypercall to expand the max number of VCPUs an already
-	 * running guest has. So cap it up to X. */
-	if (subtract)
-		nr_cpu_ids = nr_cpu_ids - subtract;
-#endif
-
-}
-#endif
-
-static void __init xen_smp_prepare_boot_cpu(void)
-{
-	BUG_ON(smp_processor_id() != 0);
-	native_smp_prepare_boot_cpu();
-
-	if (xen_pv_domain()) {
-#ifdef CONFIG_XEN_PV
-		if (!xen_feature(XENFEAT_writable_page_tables))
-			/* We've switched to the "real" per-cpu gdt, so make
-			 * sure the old memory can be recycled. */
-			make_lowmem_page_readwrite(xen_initial_gdt);
-
-#ifdef CONFIG_X86_32
-		/*
-		 * Xen starts us with XEN_FLAT_RING1_DS, but linux code
-		 * expects __USER_DS
-		 */
-		loadsegment(ds, __USER_DS);
-		loadsegment(es, __USER_DS);
-#endif
-
-		xen_filter_cpu_maps();
-		xen_setup_vcpu_info_placement();
-#endif
-	}
-
-	/*
-	 * Setup vcpu_info for boot CPU.
-	 */
-	if (xen_hvm_domain())
-		xen_vcpu_setup(0);
-
-	/*
-	 * The alternative logic (which patches the unlock/lock) runs before
-	 * the smp bootup up code is activated. Hence we need to set this up
-	 * the core kernel is being patched. Otherwise we will have only
-	 * modules patched but not core code.
-	 */
-	xen_init_spinlocks();
-}
-
-#ifdef CONFIG_XEN_PV
-static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
-{
-	unsigned cpu;
-	unsigned int i;
-
-	if (skip_ioapic_setup) {
-		char *m = (max_cpus == 0) ?
-			"The nosmp parameter is incompatible with Xen; " \
-			"use Xen dom0_max_vcpus=1 parameter" :
-			"The noapic parameter is incompatible with Xen";
-
-		xen_raw_printk(m);
-		panic(m);
-	}
-	xen_init_lock_cpu(0);
-
-	smp_store_boot_cpu_info();
-	cpu_data(0).x86_max_cores = 1;
-
-	for_each_possible_cpu(i) {
-		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
-		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
-	}
-	set_cpu_sibling_map(0);
-
-	xen_pmu_init(0);
-
-	if (xen_smp_intr_init(0))
-		BUG();
-
-	if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
-		panic("could not allocate xen_cpu_initialized_map\n");
-
-	cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
-
-	/* Restrict the possible_map according to max_cpus. */
-	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
-		for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
-			continue;
-		set_cpu_possible(cpu, false);
-	}
-
-	for_each_possible_cpu(cpu)
-		set_cpu_present(cpu, true);
-}
-
-static int
-cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
-{
-	struct vcpu_guest_context *ctxt;
-	struct desc_struct *gdt;
-	unsigned long gdt_mfn;
-
-	/* used to tell cpu_init() that it can proceed with initialization */
-	cpumask_set_cpu(cpu, cpu_callout_mask);
-	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
-		return 0;
-
-	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
-	if (ctxt == NULL)
-		return -ENOMEM;
-
-	gdt = get_cpu_gdt_table(cpu);
-
-#ifdef CONFIG_X86_32
-	ctxt->user_regs.fs = __KERNEL_PERCPU;
-	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
-#endif
-	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
-
-	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-	ctxt->flags = VGCF_IN_KERNEL;
-	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
-	ctxt->user_regs.ds = __USER_DS;
-	ctxt->user_regs.es = __USER_DS;
-	ctxt->user_regs.ss = __KERNEL_DS;
-
-	xen_copy_trap_info(ctxt->trap_ctxt);
-
-	ctxt->ldt_ents = 0;
-
-	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
-
-	gdt_mfn = arbitrary_virt_to_mfn(gdt);
-	make_lowmem_page_readonly(gdt);
-	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
-
-	ctxt->gdt_frames[0] = gdt_mfn;
-	ctxt->gdt_ents      = GDT_ENTRIES;
-
-	ctxt->kernel_ss = __KERNEL_DS;
-	ctxt->kernel_sp = idle->thread.sp0;
-
-#ifdef CONFIG_X86_32
-	ctxt->event_callback_cs     = __KERNEL_CS;
-	ctxt->failsafe_callback_cs  = __KERNEL_CS;
-#else
-	ctxt->gs_base_kernel = per_cpu_offset(cpu);
-#endif
-	ctxt->event_callback_eip    =
-		(unsigned long)xen_hypervisor_callback;
-	ctxt->failsafe_callback_eip =
-		(unsigned long)xen_failsafe_callback;
-	ctxt->user_regs.cs = __KERNEL_CS;
-	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-
-	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
-	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
-	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
-		BUG();
-
-	kfree(ctxt);
-	return 0;
-}
-
-static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
-{
-	int rc;
-
-	common_cpu_up(cpu, idle);
-
-	xen_setup_runstate_info(cpu);
-
-	/*
-	 * PV VCPUs are always successfully taken down (see 'while' loop
-	 * in xen_cpu_die()), so -EBUSY is an error.
-	 */
-	rc = cpu_check_up_prepare(cpu);
-	if (rc)
-		return rc;
-
-	/* make sure interrupts start blocked */
-	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
-
-	rc = cpu_initialize_context(cpu, idle);
-	if (rc)
-		return rc;
-
-	xen_pmu_init(cpu);
-
-	rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
-	BUG_ON(rc);
-
-	while (cpu_report_state(cpu) != CPU_ONLINE)
-		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
-
-	return 0;
-}
-
-static void xen_smp_cpus_done(unsigned int max_cpus)
-{
-}
-
-#ifdef CONFIG_HOTPLUG_CPU
-static int xen_cpu_disable(void)
-{
-	unsigned int cpu = smp_processor_id();
-	if (cpu == 0)
-		return -EBUSY;
-
-	cpu_disable_common();
-
-	load_cr3(swapper_pg_dir);
-	return 0;
-}
-
-static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
-{
-	play_dead_common();
-	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
-	cpu_bringup();
-	/*
-	 * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
-	 * clears certain data that the cpu_idle loop (which called us
-	 * and that we return from) expects. The only way to get that
-	 * data back is to call:
-	 */
-	tick_nohz_idle_enter();
-
-	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-}
-
-#else /* !CONFIG_HOTPLUG_CPU */
-static int xen_cpu_disable(void)
-{
-	return -ENOSYS;
-}
-
-static void xen_cpu_die(unsigned int cpu)
-{
-	BUG();
-}
-
-static void xen_play_dead(void)
-{
-	BUG();
-}
-
-#endif
-static void stop_self(void *v)
-{
-	int cpu = smp_processor_id();
-
-	/* make sure we're not pinning something down */
-	load_cr3(swapper_pg_dir);
-	/* should set up a minimal gdt */
-
-	set_cpu_online(cpu, false);
-
-	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL);
-	BUG();
-}
-
-static void xen_stop_other_cpus(int wait)
-{
-	smp_call_function(stop_self, NULL, wait);
-}
-#endif /* CONFIG_XEN_PV */
-
-static void xen_cpu_die(unsigned int cpu)
-{
-	while (xen_pv_domain() && HYPERVISOR_vcpu_op(VCPUOP_is_up,
-						     xen_vcpu_nr(cpu), NULL)) {
-		__set_current_state(TASK_UNINTERRUPTIBLE);
-		schedule_timeout(HZ/10);
-	}
-
-	if (common_cpu_die(cpu) == 0) {
-		xen_smp_intr_free(cpu);
-		xen_uninit_lock_cpu(cpu);
-		xen_teardown_timer(cpu);
-		xen_pmu_finish(cpu);
-	}
-}
-
-static void xen_smp_send_reschedule(int cpu)
+void xen_smp_send_reschedule(int cpu)
 {
 	xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
 }
 
 static void __xen_send_IPI_mask(const struct cpumask *mask,
-			      int vector)
+				int vector)
 {
 	unsigned cpu;
 
@@ -583,7 +128,7 @@ static void __xen_send_IPI_mask(const struct cpumask *mask,
 		xen_send_IPI_one(cpu, vector);
 }
 
-static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
+void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 {
 	int cpu;
 
@@ -598,10 +143,10 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 	}
 }
 
-static void xen_smp_send_call_function_single_ipi(int cpu)
+void xen_smp_send_call_function_single_ipi(int cpu)
 {
 	__xen_send_IPI_mask(cpumask_of(cpu),
-			  XEN_CALL_FUNCTION_SINGLE_VECTOR);
+			    XEN_CALL_FUNCTION_SINGLE_VECTOR);
 }
 
 static inline int xen_map_vector(int vector)
@@ -636,8 +181,7 @@ static inline int xen_map_vector(int vector)
 	return xen_vector;
 }
 
-void xen_send_IPI_mask(const struct cpumask *mask,
-			      int vector)
+void xen_send_IPI_mask(const struct cpumask *mask, int vector)
 {
 	int xen_vector = xen_map_vector(vector);
 
@@ -703,56 +247,3 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
 
 	return IRQ_HANDLED;
 }
-
-static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
-{
-	irq_enter();
-	irq_work_run();
-	inc_irq_stat(apic_irq_work_irqs);
-	irq_exit();
-
-	return IRQ_HANDLED;
-}
-
-#ifdef CONFIG_XEN_PV
-static const struct smp_ops xen_smp_ops __initconst = {
-	.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
-	.smp_prepare_cpus = xen_smp_prepare_cpus,
-	.smp_cpus_done = xen_smp_cpus_done,
-
-	.cpu_up = xen_cpu_up,
-	.cpu_die = xen_cpu_die,
-	.cpu_disable = xen_cpu_disable,
-	.play_dead = xen_play_dead,
-
-	.stop_other_cpus = xen_stop_other_cpus,
-	.smp_send_reschedule = xen_smp_send_reschedule,
-
-	.send_call_func_ipi = xen_smp_send_call_function_ipi,
-	.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
-};
-
-void __init xen_smp_init(void)
-{
-	smp_ops = xen_smp_ops;
-	xen_fill_possible_map();
-}
-#endif
-
-static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
-{
-	native_smp_prepare_cpus(max_cpus);
-	WARN_ON(xen_smp_intr_init(0));
-
-	xen_init_lock_cpu(0);
-}
-
-void __init xen_hvm_smp_init(void)
-{
-	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
-	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
-	smp_ops.cpu_die = xen_cpu_die;
-	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
-	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
-	smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
-}
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index 9beef33..b31d43b 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -8,9 +8,22 @@ extern void xen_send_IPI_mask_allbutself(const struct cpumask *mask,
 extern void xen_send_IPI_allbutself(int vector);
 extern void xen_send_IPI_all(int vector);
 extern void xen_send_IPI_self(int vector);
+extern void xen_send_IPI_mask(const struct cpumask *mask, int vector);
 
 extern int xen_smp_intr_init(unsigned int cpu);
 extern void xen_smp_intr_free(unsigned int cpu);
+#ifdef CONFIG_XEN_PV
+extern int xen_smp_intr_init_pv(unsigned int cpu);
+extern void xen_smp_intr_free_pv(unsigned int cpu);
+#endif
+extern void xen_smp_send_reschedule(int cpu);
+extern void xen_smp_send_call_function_ipi(const struct cpumask *mask);
+extern void xen_smp_send_call_function_single_ipi(int cpu);
+
+struct xen_common_irq {
+	int irq;
+	char *name;
+};
 
 #else /* CONFIG_SMP */
 
@@ -18,6 +31,16 @@ static inline int xen_smp_intr_init(unsigned int cpu)
 {
 	return 0;
 }
+
+#ifdef CONFIG_XEN_PV
+static inline int xen_smp_intr_init_pv(unsigned int cpu)
+{
+	return 0;
+}
+
+static inline void xen_smp_intr_free_pv(unsigned int cpu) {}
+#endif
+
 static inline void xen_smp_intr_free(unsigned int cpu) {}
 #endif /* CONFIG_SMP */
 
diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c
new file mode 100644
index 0000000..8bed434
--- /dev/null
+++ b/arch/x86/xen/smp_hvm.c
@@ -0,0 +1,58 @@
+#include <asm/smp.h>
+
+#include "xen-ops.h"
+#include "smp.h"
+
+
+static void __init xen_hvm_smp_prepare_boot_cpu(void)
+{
+	BUG_ON(smp_processor_id() != 0);
+	native_smp_prepare_boot_cpu();
+
+	/*
+	 * Set up vcpu_info for the boot CPU.
+	 */
+	xen_vcpu_setup(0);
+
+	/*
+	 * The alternative logic (which patches the unlock/lock) runs before
+	 * the smp bootup code is activated. Hence we need to set this up
+	 * before the core kernel is patched. Otherwise we will have only
+	 * modules patched but not core code.
+	 */
+	xen_init_spinlocks();
+}
+
+static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
+{
+	native_smp_prepare_cpus(max_cpus);
+	WARN_ON(xen_smp_intr_init(0));
+
+	xen_init_lock_cpu(0);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+	if (common_cpu_die(cpu) == 0) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+		xen_teardown_timer(cpu);
+	}
+}
+#else
+static void xen_hvm_cpu_die(unsigned int cpu)
+{
+	BUG();
+}
+#endif
+
+void __init xen_hvm_smp_init(void)
+{
+	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
+	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
+	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
+	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
+	smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu;
+}
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
new file mode 100644
index 0000000..fc89907
--- /dev/null
+++ b/arch/x86/xen/smp_pv.c
@@ -0,0 +1,499 @@
+/*
+ * Xen SMP support
+ *
+ * This file implements the Xen versions of smp_ops.  SMP under Xen is
+ * very straightforward.  Bringing a CPU up is simply a matter of
+ * loading its initial context and setting it running.
+ *
+ * IPIs are handled through the Xen event mechanism.
+ *
+ * Because virtual CPUs can be scheduled onto any real CPU, there's no
+ * useful topology information for the kernel to make use of.  As a
+ * result, all CPUs are treated as if they're single-core and
+ * single-threaded.
+ */
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/irq_work.h>
+#include <linux/tick.h>
+
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/cpu.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/vcpu.h>
+#include <xen/interface/xenpmu.h>
+
+#include <asm/xen/interface.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/events.h>
+
+#include <xen/hvc-console.h>
+#include "xen-ops.h"
+#include "mmu.h"
+#include "smp.h"
+#include "pmu.h"
+
+cpumask_var_t xen_cpu_initialized_map;
+
+static DEFINE_PER_CPU(struct xen_common_irq, xen_irq_work) = { .irq = -1 };
+static DEFINE_PER_CPU(struct xen_common_irq, xen_pmu_irq) = { .irq = -1 };
+
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id);
+
+static void cpu_bringup(void)
+{
+	int cpu;
+
+	cpu_init();
+	touch_softlockup_watchdog();
+	preempt_disable();
+
+	/* PVH runs in ring 0 and allows us to do native syscalls. Yay! */
+	if (!xen_feature(XENFEAT_supervisor_mode_kernel)) {
+		xen_enable_sysenter();
+		xen_enable_syscall();
+	}
+	cpu = smp_processor_id();
+	smp_store_cpu_info(cpu);
+	cpu_data(cpu).x86_max_cores = 1;
+	set_cpu_sibling_map(cpu);
+
+	xen_setup_cpu_clockevents();
+
+	notify_cpu_starting(cpu);
+
+	set_cpu_online(cpu, true);
+
+	cpu_set_state_online(cpu);  /* Implies full memory barrier. */
+
+	/* We can take interrupts now: we're officially "up". */
+	local_irq_enable();
+}
+
+asmlinkage __visible void cpu_bringup_and_idle(void)
+{
+	cpu_bringup();
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+}
+
+void xen_smp_intr_free_pv(unsigned int cpu)
+{
+	if (per_cpu(xen_irq_work, cpu).irq >= 0) {
+		unbind_from_irqhandler(per_cpu(xen_irq_work, cpu).irq, NULL);
+		per_cpu(xen_irq_work, cpu).irq = -1;
+		kfree(per_cpu(xen_irq_work, cpu).name);
+		per_cpu(xen_irq_work, cpu).name = NULL;
+	}
+
+	if (per_cpu(xen_pmu_irq, cpu).irq >= 0) {
+		unbind_from_irqhandler(per_cpu(xen_pmu_irq, cpu).irq, NULL);
+		per_cpu(xen_pmu_irq, cpu).irq = -1;
+		kfree(per_cpu(xen_pmu_irq, cpu).name);
+		per_cpu(xen_pmu_irq, cpu).name = NULL;
+	}
+}
+
+int xen_smp_intr_init_pv(unsigned int cpu)
+{
+	int rc;
+	char *callfunc_name, *pmu_name;
+
+	callfunc_name = kasprintf(GFP_KERNEL, "irqwork%d", cpu);
+	rc = bind_ipi_to_irqhandler(XEN_IRQ_WORK_VECTOR,
+				    cpu,
+				    xen_irq_work_interrupt,
+				    IRQF_PERCPU|IRQF_NOBALANCING,
+				    callfunc_name,
+				    NULL);
+	if (rc < 0)
+		goto fail;
+	per_cpu(xen_irq_work, cpu).irq = rc;
+	per_cpu(xen_irq_work, cpu).name = callfunc_name;
+
+	if (is_xen_pmu(cpu)) {
+		pmu_name = kasprintf(GFP_KERNEL, "pmu%d", cpu);
+		rc = bind_virq_to_irqhandler(VIRQ_XENPMU, cpu,
+					     xen_pmu_irq_handler,
+					     IRQF_PERCPU|IRQF_NOBALANCING,
+					     pmu_name, NULL);
+		if (rc < 0)
+			goto fail;
+		per_cpu(xen_pmu_irq, cpu).irq = rc;
+		per_cpu(xen_pmu_irq, cpu).name = pmu_name;
+	}
+
+	return 0;
+
+ fail:
+	xen_smp_intr_free_pv(cpu);
+	return rc;
+}
+
+static void __init xen_fill_possible_map(void)
+{
+	int i, rc;
+
+	if (xen_initial_domain())
+		return;
+
+	for (i = 0; i < nr_cpu_ids; i++) {
+		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+		if (rc >= 0) {
+			num_processors++;
+			set_cpu_possible(i, true);
+		}
+	}
+}
+
+static void __init xen_filter_cpu_maps(void)
+{
+	int i, rc;
+	unsigned int subtract = 0;
+
+	if (!xen_initial_domain())
+		return;
+
+	num_processors = 0;
+	disabled_cpus = 0;
+	for (i = 0; i < nr_cpu_ids; i++) {
+		rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
+		if (rc >= 0) {
+			num_processors++;
+			set_cpu_possible(i, true);
+		} else {
+			set_cpu_possible(i, false);
+			set_cpu_present(i, false);
+			subtract++;
+		}
+	}
+#ifdef CONFIG_HOTPLUG_CPU
+	/* This is akin to using 'nr_cpus' on the Linux command line.
+	 * Which is OK as when we use 'dom0_max_vcpus=X' we can only
+	 * have up to X, while nr_cpu_ids is greater than X. This
+	 * normally is not a problem, except when CPU hotplugging
+	 * is involved and then there might be more than X CPUs
+	 * in the guest - which will not work as there is no
+	 * hypercall to expand the max number of VCPUs an already
+	 * running guest has. So cap it up to X. */
+	if (subtract)
+		nr_cpu_ids = nr_cpu_ids - subtract;
+#endif
+
+}
+
+static void __init xen_pv_smp_prepare_boot_cpu(void)
+{
+	BUG_ON(smp_processor_id() != 0);
+	native_smp_prepare_boot_cpu();
+
+	if (xen_pv_domain()) {
+#ifdef CONFIG_XEN_PV
+		if (!xen_feature(XENFEAT_writable_page_tables))
+			/* We've switched to the "real" per-cpu gdt, so make
+			 * sure the old memory can be recycled. */
+			make_lowmem_page_readwrite(xen_initial_gdt);
+
+#ifdef CONFIG_X86_32
+		/*
+		 * Xen starts us with XEN_FLAT_RING1_DS, but Linux code
+		 * expects __USER_DS.
+		 */
+		loadsegment(ds, __USER_DS);
+		loadsegment(es, __USER_DS);
+#endif
+
+		xen_filter_cpu_maps();
+		xen_setup_vcpu_info_placement();
+#endif
+	}
+
+	/*
+	 * Set up vcpu_info for the boot CPU when running as an HVM domain.
+	 */
+	if (xen_hvm_domain())
+		xen_vcpu_setup(0);
+
+	/*
+	 * The alternative logic (which patches the unlock/lock) runs before
+	 * the smp bootup code is activated. Hence we need to set this up
+	 * before the core kernel is patched. Otherwise we will have only
+	 * modules patched but not core code.
+	 */
+	xen_init_spinlocks();
+}
+
+static void __init xen_pv_smp_prepare_cpus(unsigned int max_cpus)
+{
+	unsigned cpu;
+	unsigned int i;
+
+	if (skip_ioapic_setup) {
+		char *m = (max_cpus == 0) ?
+			"The nosmp parameter is incompatible with Xen; " \
+			"use Xen dom0_max_vcpus=1 parameter" :
+			"The noapic parameter is incompatible with Xen";
+
+		xen_raw_printk(m);
+		panic(m);
+	}
+	xen_init_lock_cpu(0);
+
+	smp_store_boot_cpu_info();
+	cpu_data(0).x86_max_cores = 1;
+
+	for_each_possible_cpu(i) {
+		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+	}
+	set_cpu_sibling_map(0);
+
+	xen_pmu_init(0);
+
+	if (xen_smp_intr_init(0))
+		BUG();
+
+	if (!alloc_cpumask_var(&xen_cpu_initialized_map, GFP_KERNEL))
+		panic("could not allocate xen_cpu_initialized_map\n");
+
+	cpumask_copy(xen_cpu_initialized_map, cpumask_of(0));
+
+	/* Restrict the possible_map according to max_cpus. */
+	while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) {
+		for (cpu = nr_cpu_ids - 1; !cpu_possible(cpu); cpu--)
+			continue;
+		set_cpu_possible(cpu, false);
+	}
+
+	for_each_possible_cpu(cpu)
+		set_cpu_present(cpu, true);
+}
+
+static int
+cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
+{
+	struct vcpu_guest_context *ctxt;
+	struct desc_struct *gdt;
+	unsigned long gdt_mfn;
+
+	/* used to tell cpu_init() that it can proceed with initialization */
+	cpumask_set_cpu(cpu, cpu_callout_mask);
+	if (cpumask_test_and_set_cpu(cpu, xen_cpu_initialized_map))
+		return 0;
+
+	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
+	if (ctxt == NULL)
+		return -ENOMEM;
+
+	gdt = get_cpu_gdt_table(cpu);
+
+#ifdef CONFIG_X86_32
+	ctxt->user_regs.fs = __KERNEL_PERCPU;
+	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
+#endif
+	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
+
+	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
+	ctxt->flags = VGCF_IN_KERNEL;
+	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
+	ctxt->user_regs.ds = __USER_DS;
+	ctxt->user_regs.es = __USER_DS;
+	ctxt->user_regs.ss = __KERNEL_DS;
+
+	xen_copy_trap_info(ctxt->trap_ctxt);
+
+	ctxt->ldt_ents = 0;
+
+	BUG_ON((unsigned long)gdt & ~PAGE_MASK);
+
+	gdt_mfn = arbitrary_virt_to_mfn(gdt);
+	make_lowmem_page_readonly(gdt);
+	make_lowmem_page_readonly(mfn_to_virt(gdt_mfn));
+
+	ctxt->gdt_frames[0] = gdt_mfn;
+	ctxt->gdt_ents      = GDT_ENTRIES;
+
+	ctxt->kernel_ss = __KERNEL_DS;
+	ctxt->kernel_sp = idle->thread.sp0;
+
+#ifdef CONFIG_X86_32
+	ctxt->event_callback_cs     = __KERNEL_CS;
+	ctxt->failsafe_callback_cs  = __KERNEL_CS;
+#else
+	ctxt->gs_base_kernel = per_cpu_offset(cpu);
+#endif
+	ctxt->event_callback_eip    =
+		(unsigned long)xen_hypervisor_callback;
+	ctxt->failsafe_callback_eip =
+		(unsigned long)xen_failsafe_callback;
+	ctxt->user_regs.cs = __KERNEL_CS;
+	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
+
+	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
+	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
+	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
+		BUG();
+
+	kfree(ctxt);
+	return 0;
+}
+
+static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+	int rc;
+
+	common_cpu_up(cpu, idle);
+
+	xen_setup_runstate_info(cpu);
+
+	/*
+	 * PV VCPUs are always successfully taken down (see 'while' loop
+	 * in xen_pv_cpu_die()), so -EBUSY is an error.
+	 */
+	rc = cpu_check_up_prepare(cpu);
+	if (rc)
+		return rc;
+
+	/* Make sure interrupts start blocked. */
+	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
+
+	rc = cpu_initialize_context(cpu, idle);
+	if (rc)
+		return rc;
+
+	xen_pmu_init(cpu);
+
+	rc = HYPERVISOR_vcpu_op(VCPUOP_up, xen_vcpu_nr(cpu), NULL);
+	BUG_ON(rc);
+
+	while (cpu_report_state(cpu) != CPU_ONLINE)
+		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
+
+	return 0;
+}
+
+static void xen_pv_smp_cpus_done(unsigned int max_cpus)
+{
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static int xen_pv_cpu_disable(void)
+{
+	unsigned int cpu = smp_processor_id();
+	if (cpu == 0)
+		return -EBUSY;
+
+	cpu_disable_common();
+
+	load_cr3(swapper_pg_dir);
+	return 0;
+}
+
+static void xen_pv_play_dead(void) /* used only with HOTPLUG_CPU */
+{
+	play_dead_common();
+	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(smp_processor_id()), NULL);
+	cpu_bringup();
+	/*
+	 * commit 4b0c0f294 (tick: Cleanup NOHZ per cpu data on cpu down)
+	 * clears certain data that the cpu_idle loop (which called us
+	 * and that we return from) expects. The only way to get that
+	 * data back is to call:
+	 */
+	tick_nohz_idle_enter();
+
+	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+static int xen_pv_cpu_disable(void)
+{
+	return -ENOSYS;
+}
+
+static void xen_pv_cpu_die(unsigned int cpu)
+{
+	BUG();
+}
+
+static void xen_pv_play_dead(void)
+{
+	BUG();
+}
+
+#endif
+static void stop_self(void *v)
+{
+	int cpu = smp_processor_id();
+
+	/* make sure we're not pinning something down */
+	load_cr3(swapper_pg_dir);
+	/* should set up a minimal gdt */
+
+	set_cpu_online(cpu, false);
+
+	HYPERVISOR_vcpu_op(VCPUOP_down, xen_vcpu_nr(cpu), NULL);
+	BUG();
+}
+
+static void xen_pv_stop_other_cpus(int wait)
+{
+	smp_call_function(stop_self, NULL, wait);
+}
+
+static void xen_pv_cpu_die(unsigned int cpu)
+{
+	while (HYPERVISOR_vcpu_op(VCPUOP_is_up,
+				  xen_vcpu_nr(cpu), NULL)) {
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule_timeout(HZ/10);
+	}
+
+	if (common_cpu_die(cpu) == 0) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+		xen_teardown_timer(cpu);
+		xen_pmu_finish(cpu);
+	}
+}
+
+static irqreturn_t xen_irq_work_interrupt(int irq, void *dev_id)
+{
+	irq_enter();
+	irq_work_run();
+	inc_irq_stat(apic_irq_work_irqs);
+	irq_exit();
+
+	return IRQ_HANDLED;
+}
+
+static const struct smp_ops xen_smp_ops __initconst = {
+	.smp_prepare_boot_cpu = xen_pv_smp_prepare_boot_cpu,
+	.smp_prepare_cpus = xen_pv_smp_prepare_cpus,
+	.smp_cpus_done = xen_pv_smp_cpus_done,
+
+	.cpu_up = xen_pv_cpu_up,
+	.cpu_die = xen_pv_cpu_die,
+	.cpu_disable = xen_pv_cpu_disable,
+	.play_dead = xen_pv_play_dead,
+
+	.stop_other_cpus = xen_pv_stop_other_cpus,
+	.smp_send_reschedule = xen_smp_send_reschedule,
+
+	.send_call_func_ipi = xen_smp_send_call_function_ipi,
+	.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
+};
+
+void __init xen_smp_init(void)
+{
+	smp_ops = xen_smp_ops;
+	xen_fill_possible_map();
+}
-- 
2.9.3