Message-Id: <1510533206-9821-3-git-send-email-wanpeng.li@hotmail.com>
Date:   Sun, 12 Nov 2017 16:33:24 -0800
From:   Wanpeng Li <kernellwp@...il.com>
To:     linux-kernel@...r.kernel.org, kvm@...r.kernel.org
Cc:     Paolo Bonzini <pbonzini@...hat.com>,
        Radim Krčmář <rkrcmar@...hat.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Wanpeng Li <wanpeng.li@...mail.com>
Subject: [PATCH v4 2/4] KVM: X86: Add paravirt remote TLB flush

From: Wanpeng Li <wanpeng.li@...mail.com>

The remote TLB flush APIs busy-wait, which is fine in a bare-metal
scenario. Inside a guest, however, the vCPUs might have been preempted
or blocked, and in that case the initiating vCPU can end up
busy-waiting for a long time.

This patch set implements paravirtual TLB flushing that avoids waiting
for vCPUs that are sleeping; instead, all sleeping vCPUs flush their
TLB on the next guest entry.
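
For reference, the host side of the scheme (added by a later patch in
this series, not shown here) boils down to consuming the flag that the
guest set in steal_time.preempted before re-entering the vCPU. A
minimal sketch, with a hypothetical helper name and call site:

	/*
	 * Illustrative only -- not part of this patch.  The helper name is
	 * made up; the real host-side handling lives elsewhere in the series.
	 */
	static void pv_tlb_flush_on_guest_enter(struct kvm_vcpu *vcpu,
						struct kvm_steal_time *st)
	{
		/*
		 * Atomically consume the flags that the guest's
		 * kvm_flush_tlb_others() set while this vCPU was scheduled
		 * out, and flush its TLB before the next guest entry if a
		 * remote flush was deferred to us.
		 */
		if (xchg(&st->preempted, 0) & KVM_VCPU_SHOULD_FLUSH)
			kvm_vcpu_flush_tlb(vcpu);
	}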

The best results are achieved when the host is overcommitted, i.e.
running multiple vCPUs on each pCPU. In that case PV TLB flush avoids
touching vCPUs which are not scheduled and avoids the wait on the
initiating CPU.

Tested on a Haswell i7 desktop with 4 cores (2 HT), i.e. 8 pCPUs,
running ebizzy in one Linux guest.

ebizzy -M 
              vanilla    optimized     boost
 8 vCPUs       10152       10083       -0.68% 
16 vCPUs        1224        4866       297.5% 
24 vCPUs        1109        3871       249%
32 vCPUs        1025        3375       229.3% 

Cc: Paolo Bonzini <pbonzini@...hat.com>
Cc: Radim Krčmář <rkrcmar@...hat.com>
Cc: Peter Zijlstra <peterz@...radead.org>
Signed-off-by: Wanpeng Li <wanpeng.li@...mail.com>
---
 Documentation/virtual/kvm/cpuid.txt  |  4 ++++
 arch/x86/include/uapi/asm/kvm_para.h |  2 ++
 arch/x86/kernel/kvm.c                | 42 +++++++++++++++++++++++++++++++++++-
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt
index 117066a..9693fcc 100644
--- a/Documentation/virtual/kvm/cpuid.txt
+++ b/Documentation/virtual/kvm/cpuid.txt
@@ -60,6 +60,10 @@ KVM_FEATURE_PV_DEDICATED           ||     8 || guest checks this feature bit
                                    ||       || mizations such as usage of
                                    ||       || qspinlocks.
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_TLB_FLUSH           ||     9 || guest checks this feature bit
+                                   ||       || before enabling paravirtualized
+                                   ||       || tlb flush.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 6d66556..e267d83 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -26,6 +26,7 @@
 #define KVM_FEATURE_PV_EOI		6
 #define KVM_FEATURE_PV_UNHALT		7
 #define KVM_FEATURE_PV_DEDICATED	8
+#define KVM_FEATURE_PV_TLB_FLUSH	9
 
 /* The last 8 bits are used to indicate how to interpret the flags field
  * in pvclock structure. If no bits are set, all flags are ignored.
@@ -54,6 +55,7 @@ struct kvm_steal_time {
 
 #define KVM_VCPU_NOT_PREEMPTED      (0 << 0)
 #define KVM_VCPU_PREEMPTED          (1 << 0)
+#define KVM_VCPU_SHOULD_FLUSH       (1 << 1)
 
 #define KVM_CLOCK_PAIRING_WALLCLOCK 0
 struct kvm_clock_pairing {
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 66ed3bc..78794c1 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -465,9 +465,40 @@ static void __init kvm_apf_trap_init(void)
 	update_intr_gate(X86_TRAP_PF, async_page_fault);
 }
 
+static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);
+
+static void kvm_flush_tlb_others(const struct cpumask *cpumask,
+			const struct flush_tlb_info *info)
+{
+	u8 state;
+	int cpu;
+	struct kvm_steal_time *src;
+	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);
+
+	if (unlikely(!flushmask))
+		return;
+
+	cpumask_copy(flushmask, cpumask);
+	/*
+	 * We have to call flush only on online vCPUs. And
+	 * queue flush_on_enter for pre-empted vCPUs
+	 */
+	for_each_cpu(cpu, cpumask) {
+		src = &per_cpu(steal_time, cpu);
+		state = READ_ONCE(src->preempted);
+		if ((state & KVM_VCPU_PREEMPTED)) {
+			if (try_cmpxchg(&src->preempted, &state,
+				state | KVM_VCPU_SHOULD_FLUSH))
+				__cpumask_clear_cpu(cpu, flushmask);
+		}
+	}
+
+	native_flush_tlb_others(flushmask, info);
+}
+
 void __init kvm_guest_init(void)
 {
-	int i;
+	int i, cpu;
 
 	if (!kvm_para_available())
 		return;
@@ -484,6 +515,15 @@ void __init kvm_guest_init(void)
 		pv_time_ops.steal_clock = kvm_steal_clock;
 	}
 
+	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
+		!kvm_para_has_feature(KVM_FEATURE_PV_DEDICATED)) {
+		for_each_possible_cpu(cpu) {
+			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
+					GFP_KERNEL, cpu_to_node(cpu));
+		}
+		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+	}
+
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
 
-- 
2.7.4
