lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1399474907-22206-19-git-send-email-Waiman.Long@hp.com>
Date:	Wed,  7 May 2014 11:01:46 -0400
From:	Waiman Long <Waiman.Long@...com>
To:	Thomas Gleixner <tglx@...utronix.de>,
	Ingo Molnar <mingo@...hat.com>,
	"H. Peter Anvin" <hpa@...or.com>,
	Peter Zijlstra <peterz@...radead.org>
Cc:	linux-arch@...r.kernel.org, x86@...nel.org,
	linux-kernel@...r.kernel.org,
	virtualization@...ts.linux-foundation.org,
	xen-devel@...ts.xenproject.org, kvm@...r.kernel.org,
	Paolo Bonzini <paolo.bonzini@...il.com>,
	Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
	Boris Ostrovsky <boris.ostrovsky@...cle.com>,
	"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
	Rik van Riel <riel@...hat.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Raghavendra K T <raghavendra.kt@...ux.vnet.ibm.com>,
	David Vrabel <david.vrabel@...rix.com>,
	Oleg Nesterov <oleg@...hat.com>,
	Gleb Natapov <gleb@...hat.com>,
	Scott J Norton <scott.norton@...com>,
	Chegu Vinod <chegu_vinod@...com>,
	Waiman Long <Waiman.Long@...com>
Subject: [PATCH v10 18/19] pvqspinlock, x86: Enable PV qspinlock PV for KVM

This patch adds the necessary KVM specific code to allow KVM to
support the CPU halting and kicking operations needed by the queue
spinlock PV code.

Two KVM guests of 20 CPU cores (2 nodes) were created for performance
testing in one of the following three configurations:
 1) Only 1 VM is active
 2) Both VMs are active and they share the same 20 physical CPUs
   (200% overcommit)
 3) Both VMs are active and they shares 30 physical CPUs (10 delicated
    and 10 shared - 133% overcommit)

The tests run included the disk workload of the AIM7 benchmark on both
ext4 and xfs RAM disks at 3000 users on a 3.15-rc1 based kernel. The
"ebizzy -m" test was was also run and its performance data were
recorded.  With two VMs running, the "idle=poll" kernel option was
added to simulate a busy guest. The entry "unfair + PV qspinlock"
below means that both the unfair lock and PV spinlock configuration
options were turned on.

                AIM7 XFS Disk Test (no overcommit)
  kernel                 JPM    Real Time   Sys Time    Usr Time
  -----                  ---    ---------   --------    --------
  PV ticketlock         2489626    7.23      101.08       5.30
  qspinlock             2531646    7.11      100.75       5.43
  PV qspinlock          2500000    7.20      101.94       5.40
  unfair qspinlock      2549575    7.06       99.81       5.35
  unfair + PV qspinlock 2486188    7.24      101.55       5.51

                AIM7 XFS Disk Test (133% overcommit)
  kernel                 JPM    Real Time   Sys Time    Usr Time
  -----                  ---    ---------   --------    --------
  PV ticketlock         1114551   16.15      220.17      10.75
  qspinlock             1159047   15.53      216.60      10.24
  PV qspinlock          1170351   15.38      216.16      11.03
  unfair qspinlock      1188119   15.15      209.37      10.82
  unfair + PV qspinlock 1178782   15.27      211.37      11.25

                AIM7 XFS Disk Test (200% overcommit)
  kernel                 JPM    Real Time   Sys Time    Usr Time
  -----                  ---    ---------   --------    --------
  PV ticketlock         587467    30.64      444.95      11.92
  qspinlock             593276    30.34      439.39      14.59
  PV qspinlock          601403    29.93      426.04      14.49
  unfair qspinlock      654070    27.52      400.82      10.86
  unfair + PV qspinlock 614334    29.30      393.38      28.56

                AIM7 EXT4 Disk Test (no overcommit)
  kernel                 JPM    Real Time   Sys Time    Usr Time
  -----                  ---    ---------   --------    --------
  PV ticketlock         2002225    9.07      105.62       5.43
  qspinlock             2006689    8.97      105.65       5.26
  PV qspinlock          2002225    8.99      103.19       5.19
  unfair qspinlock      1988950    9.05      103.81       5.03
  unfair + PV qspinlock 1993355    9.03      107.99       5.68

                AIM7 EXT4 Disk Test (133% overcommit)
  kernel                 JPM    Real Time   Sys Time    Usr Time
  -----                  ---    ---------   --------    --------
  PV ticketlock          987383   18.23      221.63       8.89
  qspinlock             1050788   17.13      206.87       8.35
  PV qspinlock          1058823   17.00      205.22       9.18
  unfair qspinlock      1161290   15.50      184.22       8.84
  unfair + PV qspinlock 1122894   16.03      195.86       9.34

                AIM7 EXT4 Disk Test (200% overcommit)
  kernel                 JPM    Real Time   Sys Time    Usr Time
  -----                  ---    ---------   --------    --------
  PV ticketlock         420757    42.78      565.96       5.84
  qspinlock             427452    42.11      543.08      11.12
  PV qspinlock          420659    42.79      548.30      10.56
  unfair qspinlock      504909    35.65      466.71       5.38
  unfair + PV qspinlock 500974    35.93      469.02       6.77

                EBIZZY-M Test (no overcommit)
  kernel                Rec/s   Real Time   Sys Time    Usr Time
  -----                 -----   ---------   --------    --------
  PV ticketlock         1230      10.00      88.34        1.42
  qspinlock             1212      10.00      68.25        1.47
  PV qspinlock          1265      10.00      91.50        1.41
  unfair qspinlock      1304      10.00      77.94        1.49
  unfair + PV qspinlock 1445      10.00      75.45        1.68

                EBIZZY-M Test (133% overcommit)
  kernel                Rec/s   Real Time   Sys Time    Usr Time
  -----                 -----   ---------   --------    --------
  PV ticketlock          467      10.00      88.16        0.73
  qspinlock              463      10.00      89.44        0.78
  PV qspinlock           441      10.00      95.10        0.74
  unfair qspinlock      1233      10.00      35.76        1.76
  unfair + PV qspinlock 1555      10.00      32.12        1.96

                EBIZZY-M Test (200% overcommit)
  kernel                Rec/s   Real Time   Sys Time    Usr Time
  -----                 -----   ---------   --------    --------
  PV ticketlock          263      10.00      84.48        4.27
  qspinlock              226      10.00      87.74        2.02
  PV qspinlock           253      10.00      98.28        2.63
  unfair qspinlock       338      10.00      61.15        1.68
  unfair + PV qspinlock  346      10.00      60.47        3.31

Raghavendra KT had done some performance testing on this patch with
the following results:

Overall we are seeing good improvement for pv-unfair version.

System: 32 cpu sandybridge with HT on (4 node with 32 GB each)
Guest : 8GB with 16 vcpu/VM.
Average was taken over 8-10 data points.

Base = 3.15-rc2  with PRAVIRT_SPINLOCK = y

A = 3.15-rc2 + qspinlock v9 patch with QUEUE_SPINLOCK = y
PRAVIRT_SPINLOCK = y PARAVIRT_UNFAIR_LOCKS = y (unfair lock)

B =  3.15-rc2 + qspinlock v9 patch with QUEUE_SPINLOCK = y
PRAVIRT_SPINLOCK = n PARAVIRT_UNFAIR_LOCKS = n
(queue spinlock without paravirt)

C = 3.15-rc2 + qspinlock v9 patch with  QUEUE_SPINLOCK = y
PRAVIRT_SPINLOCK = y  PARAVIRT_UNFAIR_LOCKS = n
(queue spinlock with paravirt)

Ebizzy %improvements
====================
overcommit  A            B            C
0.5x     4.4265        2.0611       1.5824
1.0x     0.9015       -7.7828       4.5443
1.5x    46.1162       -2.9845      -3.5046
2.0x    99.8150       -2.7116       4.7461

Dbench %improvements
====================
overcommit  A            B            C
0.5x     3.2617        3.5436        2.5676
1.0x     0.6302        2.2342        5.2201
1.5x     5.0027        4.8275        3.8375
2.0x    23.8242        4.5782        12.6067

Absolute values of base results: (overcommit, value, stdev)
Ebizzy ( records / sec with 120 sec run)
0.5x 20941.8750 (2%)
1.0x 17623.8750 (5%)
1.5x  5874.7778 (15%)
2.0x  3581.8750 (7%)

Dbench (throughput in MB/sec)
0.5x 10009.6610 (5%)
1.0x  6583.0538 (1%)
1.5x  3991.9622 (4%)
2.0x  2527.0613 (2.5%)

Signed-off-by: Waiman Long <Waiman.Long@...com>
Tested-by: Raghavendra K T <raghavendra.kt@...ux.vnet.ibm.com>
---
 arch/x86/kernel/kvm.c |  135 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/Kconfig.locks  |    2 +-
 2 files changed, 136 insertions(+), 1 deletions(-)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 7ab8ab3..eef427b 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -567,6 +567,7 @@ static void kvm_kick_cpu(int cpu)
 	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
 }
 
+#ifndef CONFIG_QUEUE_SPINLOCK
 enum kvm_contention_stat {
 	TAKEN_SLOW,
 	TAKEN_SLOW_PICKUP,
@@ -794,6 +795,134 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket)
 		}
 	}
 }
+#else /* !CONFIG_QUEUE_SPINLOCK */
+
+#ifdef CONFIG_KVM_DEBUG_FS
+static struct dentry *d_spin_debug;
+static struct dentry *d_kvm_debug;
+static u32 kick_nohlt_stats;	/* Kick but not halt count	*/
+static u32 halt_qhead_stats;	/* Queue head halting count	*/
+static u32 halt_qnode_stats;	/* Queue node halting count	*/
+static u32 halt_abort_stats;	/* Halting abort count		*/
+static u32 wake_kick_stats;	/* Wakeup by kicking count	*/
+static u32 wake_spur_stats;	/* Spurious wakeup count	*/
+static u64 time_blocked;	/* Total blocking time		*/
+
+static int __init kvm_spinlock_debugfs(void)
+{
+	d_kvm_debug = debugfs_create_dir("kvm-guest", NULL);
+	if (!d_kvm_debug) {
+		printk(KERN_WARNING
+		       "Could not create 'kvm' debugfs directory\n");
+		return -ENOMEM;
+	}
+	d_spin_debug = debugfs_create_dir("spinlocks", d_kvm_debug);
+
+	debugfs_create_u32("kick_nohlt_stats",
+			   0644, d_spin_debug, &kick_nohlt_stats);
+	debugfs_create_u32("halt_qhead_stats",
+			   0644, d_spin_debug, &halt_qhead_stats);
+	debugfs_create_u32("halt_qnode_stats",
+			   0644, d_spin_debug, &halt_qnode_stats);
+	debugfs_create_u32("halt_abort_stats",
+			   0644, d_spin_debug, &halt_abort_stats);
+	debugfs_create_u32("wake_kick_stats",
+			   0644, d_spin_debug, &wake_kick_stats);
+	debugfs_create_u32("wake_spur_stats",
+			   0644, d_spin_debug, &wake_spur_stats);
+	debugfs_create_u64("time_blocked",
+			   0644, d_spin_debug, &time_blocked);
+	return 0;
+}
+
+static inline void kvm_halt_stats(enum pv_lock_stats type)
+{
+	if (type == PV_HALT_QHEAD)
+		add_smp(&halt_qhead_stats, 1);
+	else if (type == PV_HALT_QNODE)
+		add_smp(&halt_qnode_stats, 1);
+	else /* type == PV_HALT_ABORT */
+		add_smp(&halt_abort_stats, 1);
+}
+
+static inline void kvm_lock_stats(enum pv_lock_stats type)
+{
+	if (type == PV_WAKE_KICKED)
+		add_smp(&wake_kick_stats, 1);
+	else if (type == PV_WAKE_SPURIOUS)
+		add_smp(&wake_spur_stats, 1);
+	else /* type == PV_KICK_NOHALT */
+		add_smp(&kick_nohlt_stats, 1);
+}
+
+static inline u64 spin_time_start(void)
+{
+	return sched_clock();
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+	u64 delta;
+
+	delta = sched_clock() - start;
+	add_smp(&time_blocked, delta);
+}
+
+fs_initcall(kvm_spinlock_debugfs);
+
+#else /* CONFIG_KVM_DEBUG_FS */
+static inline void kvm_halt_stats(enum pv_lock_stats type)
+{
+}
+
+static inline void kvm_lock_stats(enum pv_lock_stats type)
+{
+}
+
+static inline u64 spin_time_start(void)
+{
+	return 0;
+}
+
+static inline void spin_time_accum_blocked(u64 start)
+{
+}
+#endif /* CONFIG_KVM_DEBUG_FS */
+
+/*
+ * Halt the current CPU & release it back to the host
+ */
+static void kvm_halt_cpu(enum pv_lock_stats type, s8 *state, s8 sval)
+{
+	unsigned long flags;
+	u64 start;
+
+	if (in_nmi())
+		return;
+
+	/*
+	 * Make sure an interrupt handler can't upset things in a
+	 * partially setup state.
+	 */
+	local_irq_save(flags);
+	/*
+	 * Don't halt if the CPU state has been changed.
+	 */
+	if (ACCESS_ONCE(*state) != sval) {
+		kvm_halt_stats(PV_HALT_ABORT);
+		goto out;
+	}
+	start = spin_time_start();
+	kvm_halt_stats(type);
+	if (arch_irqs_disabled_flags(flags))
+		halt();
+	else
+		safe_halt();
+	spin_time_accum_blocked(start);
+out:
+	local_irq_restore(flags);
+}
+#endif /* !CONFIG_QUEUE_SPINLOCK */
 
 /*
  * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present.
@@ -806,8 +935,14 @@ void __init kvm_spinlock_init(void)
 	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
 		return;
 
+#ifdef CONFIG_QUEUE_SPINLOCK
+	pv_lock_ops.kick_cpu = kvm_kick_cpu;
+	pv_lock_ops.halt_cpu = kvm_halt_cpu;
+	pv_lock_ops.lockstat = kvm_lock_stats;
+#else
 	pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning);
 	pv_lock_ops.unlock_kick = kvm_unlock_kick;
+#endif
 }
 
 static __init int kvm_spinlock_init_jump(void)
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index f185584..a70fdeb 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -229,4 +229,4 @@ config ARCH_USE_QUEUE_SPINLOCK
 
 config QUEUE_SPINLOCK
 	def_bool y if ARCH_USE_QUEUE_SPINLOCK
-	depends on SMP && !PARAVIRT_SPINLOCKS
+	depends on SMP && (!PARAVIRT_SPINLOCKS || !XEN)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ