linux-kernel - [RFC PATCH 08/16] x86/xen: irq/upcall handling with multiple xenhosts

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20190509172540.12398-9-ankur.a.arora@oracle.com>
Date:   Thu,  9 May 2019 10:25:32 -0700
From:   Ankur Arora <ankur.a.arora@...cle.com>
To:     linux-kernel@...r.kernel.org, xen-devel@...ts.xenproject.org
Cc:     jgross@...e.com, pbonzini@...hat.com, boris.ostrovsky@...cle.com,
        konrad.wilk@...cle.com, sstabellini@...nel.org,
        joao.m.martins@...cle.com, ankur.a.arora@...cle.com
Subject: [RFC PATCH 08/16] x86/xen: irq/upcall handling with multiple xenhosts

For configurations with multiple xenhosts, we need to handle events
generated from multiple xenhosts.

Having more than one upcall handler might be quite hairy, and it would
be simpler if the callback from L0-Xen could be bounced via L1-Xen.
This will also mean simpler pv_irq_ops code because now the IF flag
maps onto the xh_default->vcpu_info->evtchn_upcall_mask.

However, we still update the xh_remote->vcpu_info->evtchn_upcall_mask
on a best-effort basis to minimize unnecessary work in the remote xenhost.

TODO:
  - The direct pv_ops.irq implementations (xen_*_direct) are disabled.

Signed-off-by: Ankur Arora <ankur.a.arora@...cle.com>
---
 arch/x86/xen/Makefile       |  2 +-
 arch/x86/xen/enlighten_pv.c |  4 ++-
 arch/x86/xen/irq.c          | 69 +++++++++++++++++++++++++++++--------
 arch/x86/xen/smp_pv.c       | 11 ++++++
 4 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 564b4dddbc15..3c7056ad3520 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -34,7 +34,7 @@ obj-$(CONFIG_XEN_PV)		+= enlighten_pv.o
 obj-$(CONFIG_XEN_PV)		+= mmu_pv.o
 obj-$(CONFIG_XEN_PV)		+= irq.o
 obj-$(CONFIG_XEN_PV)		+= multicalls.o
-obj-$(CONFIG_XEN_PV)		+= xen-asm.o
+obj-n		+= xen-asm.o
 obj-$(CONFIG_XEN_PV)		+= xen-asm_$(BITS).o
 
 obj-$(CONFIG_XEN_PVH)		+= enlighten_pvh.o
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 5f6a1475ec0c..77b1a0d4aef2 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -996,8 +996,9 @@ void __init xen_setup_vcpu_info_placement(void)
 	 * xen_vcpu_setup managed to place the vcpu_info within the
 	 * percpu area for all cpus, so make use of it.
 	 */
+#if 0
+	/* Disable direct access for now. */
 	if (xen_have_vcpu_info_placement && false) {
-		/* Disable direct access until we have proper pcpu data structures. */
 		pv_ops.irq.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
 		pv_ops.irq.restore_fl =
 			__PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
@@ -1007,6 +1008,7 @@ void __init xen_setup_vcpu_info_placement(void)
 			__PV_IS_CALLEE_SAVE(xen_irq_enable_direct);
 		pv_ops.mmu.read_cr2 = xen_read_cr2_direct;
 	}
+#endif
 }
 
 static const struct pv_info xen_info __initconst = {
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 38ad1a1c4763..f760a6abfb1e 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -19,9 +19,9 @@
  * callback mask. We do this in a very simple manner, by making a call
  * down into Xen. The pending flag will be checked by Xen on return.
  */
-void xen_force_evtchn_callback(void)
+void xen_force_evtchn_callback(xenhost_t *xh)
 {
-	(void)HYPERVISOR_xen_version(0, NULL);
+	(void)hypervisor_xen_version(xh, 0, NULL);
 }
 
 asmlinkage __visible unsigned long xen_save_fl(void)
@@ -29,6 +29,21 @@ asmlinkage __visible unsigned long xen_save_fl(void)
 	struct vcpu_info *vcpu;
 	unsigned long flags;
 
+	/*
+	 * In scenarios with more than one xenhost, the primary xenhost
+	 * is responsible for all the upcalls, with the remote xenhost
+	 * bouncing its upcalls through it (see comment in
+	 * cpu_initialize_context().)
+	 *
+	 * To minimize unnecessary upcalls, the remote xenhost still looks at
+	 * the value of vcpu_info->evtchn_upcall_mask, so we still set and reset
+	 * that.
+	 *
+	 * The fact that the upcall itself is gated by the default xenhost,
+	 * also helps in simplifying the logic here because we don't have to
+	 * worry about guaranteeing atomicity with updates to
+	 * xh_remote->vcpu_info->evtchn_upcall_mask.
+	 */
 	vcpu = xh_default->xen_vcpu[smp_processor_id()];
 
 	/* flag has opposite sense of mask */
@@ -38,26 +53,34 @@ asmlinkage __visible unsigned long xen_save_fl(void)
 	   -0 -> 0x00000000
 	   -1 -> 0xffffffff
 	*/
-	return (-flags) & X86_EFLAGS_IF;
+	return ((-flags) & X86_EFLAGS_IF);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_save_fl);
 
 __visible void xen_restore_fl(unsigned long flags)
 {
 	struct vcpu_info *vcpu;
+	xenhost_t **xh;
 
 	/* convert from IF type flag */
 	flags = !(flags & X86_EFLAGS_IF);
 
 	/* See xen_irq_enable() for why preemption must be disabled. */
 	preempt_disable();
-	vcpu = xh_default->xen_vcpu[smp_processor_id()];
-	vcpu->evtchn_upcall_mask = flags;
+	for_each_xenhost(xh) {
+		vcpu = (*xh)->xen_vcpu[smp_processor_id()];
+		vcpu->evtchn_upcall_mask = flags;
+	}
 
 	if (flags == 0) {
 		barrier(); /* unmask then check (avoid races) */
-		if (unlikely(vcpu->evtchn_upcall_pending))
-			xen_force_evtchn_callback();
+		for_each_xenhost(xh) {
+			/* Preemption is disabled so we should not have
+			 * gotten moved to a different VCPU. */
+			vcpu = (*xh)->xen_vcpu[smp_processor_id()];
+			if (unlikely(vcpu->evtchn_upcall_pending))
+				xen_force_evtchn_callback(*xh);
+		}
 		preempt_enable();
 	} else
 		preempt_enable_no_resched();
@@ -66,11 +89,19 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_restore_fl);
 
 asmlinkage __visible void xen_irq_disable(void)
 {
+	xenhost_t **xh;
+
 	/* There's a one instruction preempt window here.  We need to
 	   make sure we're don't switch CPUs between getting the vcpu
 	   pointer and updating the mask. */
 	preempt_disable();
-	xh_default->xen_vcpu[smp_processor_id()]->evtchn_upcall_mask = 1;
+	for_each_xenhost(xh)
+		/*
+		 * Mask events on this CPU for both the xenhosts.  As the
+		 * comment above mentions, disabling preemption means we
+		 * can safely do that.
+		 */
+		(*xh)->xen_vcpu[smp_processor_id()]->evtchn_upcall_mask = 1;
 	preempt_enable_no_resched();
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
@@ -78,6 +109,7 @@ PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable);
 asmlinkage __visible void xen_irq_enable(void)
 {
 	struct vcpu_info *vcpu;
+	xenhost_t **xh;
 
 	/*
 	 * We may be preempted as soon as vcpu->evtchn_upcall_mask is
@@ -86,16 +118,25 @@ asmlinkage __visible void xen_irq_enable(void)
 	 */
 	preempt_disable();
 
-	vcpu = xh_default->xen_vcpu[smp_processor_id()];
-	vcpu->evtchn_upcall_mask = 0;
+	/* Given that the interrupts are generated from the default xenhost,
+	 * we should do this in reverse order.
+	 */
+	for_each_xenhost(xh) {
+		vcpu = (*xh)->xen_vcpu[smp_processor_id()];
+		vcpu->evtchn_upcall_mask = 0;
 
-	/* Doesn't matter if we get preempted here, because any
-	   pending event will get dealt with anyway. */
+		/* We could get preempted by an incoming interrupt here with a
+		 * half enabled irq (for the first xenhost.)
+		 */
+	}
 
 	barrier(); /* unmask then check (avoid races) */
-	if (unlikely(vcpu->evtchn_upcall_pending))
-		xen_force_evtchn_callback();
 
+	for_each_xenhost(xh) {
+		vcpu = (*xh)->xen_vcpu[smp_processor_id()];
+		if (unlikely(vcpu->evtchn_upcall_pending))
+			xen_force_evtchn_callback(*xh);
+	}
 	preempt_enable();
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_irq_enable);
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 6d9c3e6611ef..f4ea9eac8b6a 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -343,6 +343,17 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 #else
 	ctxt->gs_base_kernel = per_cpu_offset(cpu);
 #endif
+	/*
+	 * We set up an upcall handler only for the default xenhost. The remote
+	 * xenhost will generate evtchn events, but an additional callback would be
+	 * quite hairy, since we would have VCPU state initialised in multiple
+	 * hypervisors and issues like re-entrancy of upcalls.
+	 *
+	 * It would be simpler if the callback from L0-Xen could be bounced
+	 * via L1-Xen. This also simplifies the pv_irq_ops code
+	 * because now the CPU's IF processing only needs to happen on
+	 * xh_default->vcpu_info.
+	 */
 	ctxt->event_callback_eip    =
 		(unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_eip =
-- 
2.20.1