lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <4C2A200F02000078000089E4@vpn.id2.novell.com>
Date:	Tue, 29 Jun 2010 15:32:15 +0100
From:	"Jan Beulich" <JBeulich@...ell.com>
To:	<mingo@...e.hu>, <tglx@...utronix.de>, <hpa@...or.com>
Cc:	<jeremy.fitzhardinge@...rix.com>,
	"Ky Srinivasan" <KSrinivasan@...ell.com>,
	<linux-kernel@...r.kernel.org>
Subject: [PATCH 2/4, v2] x86: enlightenment for ticket spin locks - Xen
	 implementation

Use the (alternative instructions based) callout hooks to the ticket
spinlock code to enlighten ticket locks when running fully virtualized
on Xen. Ultimately, this code might also be a candidate to be used
when running para-virtualized.

Signed-off-by: Jan Beulich <jbeulich@...ell.com>
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@...rix.com>
Cc: KY Srinivasan <ksrinivasan@...ell.com>

---
 arch/x86/include/asm/hypervisor.h     |    1 
 arch/x86/include/asm/spinlock_types.h |   17 +-
 arch/x86/include/asm/xen/cpuid.h      |   68 ++++++++
 arch/x86/kernel/cpu/Makefile          |    2 
 arch/x86/kernel/cpu/hypervisor.c      |    1 
 arch/x86/kernel/cpu/xen.c             |  269 ++++++++++++++++++++++++++++++++++
 6 files changed, 355 insertions(+), 3 deletions(-)

--- 2.6.35-rc3-virt-spinlocks.orig/arch/x86/include/asm/hypervisor.h
+++ 2.6.35-rc3-virt-spinlocks/arch/x86/include/asm/hypervisor.h
@@ -45,5 +45,6 @@ extern const struct hypervisor_x86 *x86_
 /* Recognized hypervisors */
 extern const struct hypervisor_x86 x86_hyper_vmware;
 extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
+extern const struct hypervisor_x86 x86_hyper_xen;
 
 #endif
--- 2.6.35-rc3-virt-spinlocks.orig/arch/x86/include/asm/spinlock_types.h
+++ 2.6.35-rc3-virt-spinlocks/arch/x86/include/asm/spinlock_types.h
@@ -5,11 +5,24 @@
 # error "please don't include this file directly"
 #endif
 
+#include <asm/types.h>
+
 typedef struct arch_spinlock {
-	unsigned int slock;
+	union {
+		unsigned int slock;
+#ifdef CONFIG_ENLIGHTEN_SPINLOCKS
+		struct {
+# if CONFIG_NR_CPUS < 256
+			u8 cur, seq;
+# else
+			u16 cur, seq;
+# endif
+		};
+#endif
+	};
 } arch_spinlock_t;
 
-#define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#define __ARCH_SPIN_LOCK_UNLOCKED	{ { 0 } }
 
 typedef struct {
 	unsigned int lock;
--- /dev/null
+++ 2.6.35-rc3-virt-spinlocks/arch/x86/include/asm/xen/cpuid.h
@@ -0,0 +1,68 @@
+/******************************************************************************
+ * arch-x86/cpuid.h
+ *
+ * CPUID interface to Xen.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Copyright (c) 2007 Citrix Systems, Inc.
+ *
+ * Authors:
+ *    Keir Fraser <keir.fraser@...rix.com>
+ */
+
+#ifndef __XEN_PUBLIC_ARCH_X86_CPUID_H__
+#define __XEN_PUBLIC_ARCH_X86_CPUID_H__
+
+/* Xen identification leaves start at 0x40000000. */
+#define XEN_CPUID_FIRST_LEAF 0x40000000
+#define XEN_CPUID_LEAF(i)    (XEN_CPUID_FIRST_LEAF + (i))
+
+/*
+ * Leaf 1 (0x40000000)
+ * EAX: Largest Xen-information leaf. All leaves up to and including @EAX
+ *      are supported by the Xen host.
+ * EBX-EDX: "XenVMMXenVMM" signature, allowing positive identification
+ *      of a Xen host.
+ */
+#define XEN_CPUID_SIGNATURE_EBX 0x566e6558 /* "XenV" */
+#define XEN_CPUID_SIGNATURE_ECX 0x65584d4d /* "MMXe" */
+#define XEN_CPUID_SIGNATURE_EDX 0x4d4d566e /* "nVMM" */
+
+/*
+ * Leaf 2 (0x40000001)
+ * EAX[31:16]: Xen major version.
+ * EAX[15: 0]: Xen minor version.
+ * EBX-EDX: Reserved (currently all zeroes).
+ */
+
+/*
+ * Leaf 3 (0x40000002)
+ * EAX: Number of hypercall transfer pages. This register is always guaranteed
+ *      to specify one hypercall page.
+ * EBX: Base address of Xen-specific MSRs.
+ * ECX: Features 1. Unused bits are set to zero.
+ * EDX: Features 2. Unused bits are set to zero.
+ */
+
+/* Does the host support MMU_PT_UPDATE_PRESERVE_AD for this guest? */
+#define _XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD 0
+#define XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD  (1u<<0)
+
+#endif /* __XEN_PUBLIC_ARCH_X86_CPUID_H__ */
--- 2.6.35-rc3-virt-spinlocks.orig/arch/x86/kernel/cpu/Makefile
+++ 2.6.35-rc3-virt-spinlocks/arch/x86/kernel/cpu/Makefile
@@ -14,7 +14,7 @@ CFLAGS_common.o		:= $(nostackp)
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
 obj-y			+= proc.o capflags.o powerflags.o common.o
-obj-y			+= vmware.o hypervisor.o sched.o mshyperv.o
+obj-y			+= vmware.o xen.o hypervisor.o sched.o mshyperv.o
 
 obj-$(CONFIG_X86_32)	+= bugs.o cmpxchg.o
 obj-$(CONFIG_X86_64)	+= bugs_64.o
--- 2.6.35-rc3-virt-spinlocks.orig/arch/x86/kernel/cpu/hypervisor.c
+++ 2.6.35-rc3-virt-spinlocks/arch/x86/kernel/cpu/hypervisor.c
@@ -43,6 +43,7 @@ static const __initconst struct hypervis
 {
 	&x86_hyper_vmware,
 	&x86_hyper_ms_hyperv,
+	&x86_hyper_xen,
 };
 
 const struct hypervisor_x86 *x86_hyper;
--- /dev/null
+++ 2.6.35-rc3-virt-spinlocks/arch/x86/kernel/cpu/xen.c
@@ -0,0 +1,269 @@
+#define __XEN_INTERFACE_VERSION__ 0x00030207
+#include <linux/bootmem.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/spinlock.h>
+#include <linux/stringify.h>
+#include <asm/sync_bitops.h>
+#include <asm/hypervisor.h>
+#include <asm/xen/cpuid.h>
+#include <asm/xen/hypercall.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/memory.h>
+#include <xen/interface/vcpu.h>
+
+#ifdef CONFIG_ENLIGHTEN_SPINLOCKS
+struct spinning {
+	struct arch_spinlock *lock;
+	unsigned int ticket;
+	struct spinning *prev;
+};
+
+static struct shared_info *__read_mostly xen_shared_info;
+EXPORT_SYMBOL_GPL(xen_shared_info);
+
+static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
+static DEFINE_PER_CPU(evtchn_port_t, poll_evtchn);
+static DEFINE_PER_CPU(struct spinning *, _spinning);
+/*
+ * Protect removal of objects: Insertion can be done lockless, and even
+ * removal itself doesn't need protection - what needs to be prevented is
+ * removed objects going out of scope (as they're living on the stack).
+ */
+static DEFINE_PER_CPU(arch_rwlock_t, spinning_rm_lock) = __ARCH_RW_LOCK_UNLOCKED;
+
+static unsigned int __read_mostly spin_count = 1000;
+static int __init setup_spin_count(char *s)
+{
+	if (!s)
+		return -EINVAL;
+	spin_count = simple_strtoul(s, &s, 0);
+	return !*s ? 0 : -EINVAL;
+}
+early_param("spin_count", setup_spin_count);
+
+#ifndef CONFIG_XEN
+__asm__(".pushsection .text, \"ax\", @progbits\n"
+	".p2align " __stringify(PAGE_SHIFT) "\n"
+	"hypercall_page:\n"
+	".skip 1 << " __stringify(PAGE_SHIFT) "\n"
+	".popsection");
+#endif
+
+static void xen_set_cpu_features(struct cpuinfo_x86 *);
+
+static void xen_spin_lock(struct arch_spinlock *lock, unsigned int token)
+{
+	arch_rwlock_t *rm_lock;
+	unsigned long flags;
+	unsigned int count;
+	struct spinning spinning;
+
+	if (unlikely(percpu_read(runstate.state) != RUNSTATE_running))
+		xen_set_cpu_features(&__get_cpu_var(cpu_info));
+
+#if TICKET_SHIFT == 8
+	token >>= TICKET_SHIFT;
+#endif
+	spinning.ticket = token;
+	spinning.lock = lock;
+	spinning.prev = percpu_read(_spinning);
+	smp_wmb();
+	percpu_write(_spinning, &spinning);
+
+	sync_clear_bit(percpu_read(poll_evtchn),
+		       xen_shared_info->evtchn_pending);
+
+	for (count = spin_count; ({ barrier(); lock->cur != token; }); )
+		if (likely(cpu_online(raw_smp_processor_id()))
+		    && unlikely(!--count)) {
+			struct sched_poll sched_poll;
+
+			set_xen_guest_handle(sched_poll.ports,
+					     &__get_cpu_var(poll_evtchn));
+			sched_poll.nr_ports = 1;
+			sched_poll.timeout = 0;
+			HYPERVISOR_sched_op(SCHEDOP_poll, &sched_poll);
+			count = spin_count;
+		} else
+			cpu_relax();
+
+	/*
+	 * If we interrupted another spinlock while it was blocking, make
+	 * sure it doesn't block (again) without re-checking the lock.
+	 */
+	if (spinning.prev)
+		sync_set_bit(percpu_read(poll_evtchn),
+			     xen_shared_info->evtchn_pending);
+
+	percpu_write(_spinning, spinning.prev);
+	rm_lock = &__get_cpu_var(spinning_rm_lock);
+	raw_local_irq_save(flags);
+	arch_write_lock(rm_lock);
+	arch_write_unlock(rm_lock);
+	raw_local_irq_restore(flags);
+}
+
+static void xen_spin_unlock(struct arch_spinlock *lock, unsigned int token)
+{
+	unsigned int cpu;
+
+	token &= (1U << TICKET_SHIFT) - 1;
+	for_each_online_cpu(cpu) {
+		arch_rwlock_t *rm_lock;
+		unsigned long flags;
+		struct spinning *spinning;
+
+		if (cpu == raw_smp_processor_id())
+			continue;
+
+		rm_lock = &per_cpu(spinning_rm_lock, cpu);
+		raw_local_irq_save(flags);
+		arch_read_lock(rm_lock);
+
+		spinning = per_cpu(_spinning, cpu);
+		smp_rmb();
+		if (spinning
+		    && (spinning->lock != lock || spinning->ticket != token))
+			spinning = NULL;
+
+		arch_read_unlock(rm_lock);
+		raw_local_irq_restore(flags);
+
+		if (unlikely(spinning)) {
+			struct evtchn_send send;
+
+			send.port = per_cpu(poll_evtchn, cpu);
+			HYPERVISOR_event_channel_op(EVTCHNOP_send, &send);
+			return;
+		}
+	}
+}
+
+static void __init _prepare_shared_info_page(void)
+{
+	struct xen_add_to_physmap xatp;
+
+	xen_shared_info = slab_is_available()
+			  ? (void *)get_zeroed_page(GFP_KERNEL)
+			  : alloc_bootmem_pages(PAGE_SIZE);
+
+	xatp.domid = DOMID_SELF;
+	xatp.idx = 0;
+	xatp.space = XENMAPSPACE_shared_info;
+	xatp.gpfn = __pa(xen_shared_info) >> PAGE_SHIFT;
+	if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
+		BUG();
+}
+
+static void __ref prepare_shared_info_page(void)
+{
+	_prepare_shared_info_page();
+}
+#endif
+
+static bool __cpuinit xen_platform(void)
+{
+	unsigned int first = XEN_CPUID_FIRST_LEAF;
+
+#if 0 /* So far, Xen sets this only for PV guests. */
+	if (!cpu_has_hypervisor)
+		return false;
+#endif
+
+	while (first < XEN_CPUID_LEAF(0x10000)) {
+		unsigned int eax, ebx, ecx, edx;
+
+		cpuid(first, &eax, &ebx, &ecx, &edx);
+		if (ebx == XEN_CPUID_SIGNATURE_EBX
+		    && ecx == XEN_CPUID_SIGNATURE_ECX
+		    && edx == XEN_CPUID_SIGNATURE_EDX) {
+			if (!smp_processor_id()) {
+				cpuid(first + 1, &eax, &ebx, &ecx, &edx);
+				printk(KERN_INFO "Running on Xen %u.%u\n",
+				       eax >> 16, eax & 0xffff);
+			}
+			return true;
+		}
+		first += 0x100;
+	}
+
+	return false;
+}
+
+static void xen_set_cpu_features(struct cpuinfo_x86 *c)
+{
+#ifdef CONFIG_ENLIGHTEN_SPINLOCKS
+	unsigned int msr, eax, ebx, ecx, edx;
+	unsigned int first = XEN_CPUID_FIRST_LEAF;
+	int ret;
+	struct vcpu_register_runstate_memory_area vrrma;
+
+	if (num_possible_cpus() <= 1
+	    || !spin_count
+	    || (c != &boot_cpu_data
+		&& !boot_cpu_has(X86_FEATURE_SPINLOCK_YIELD)))
+		return;
+
+	while (first < XEN_CPUID_LEAF(0x10000)) {
+		cpuid(first, &eax, &ebx, &ecx, &edx);
+		if (ebx == XEN_CPUID_SIGNATURE_EBX
+		    && ecx == XEN_CPUID_SIGNATURE_ECX
+		    && edx == XEN_CPUID_SIGNATURE_EDX)
+			break;
+		first += 0x100;
+	}
+	BUG_ON(first >= XEN_CPUID_LEAF(0x10000));
+
+	cpuid(first + 2, &eax, &msr, &ecx, &edx);
+	BUG_ON(!eax);
+	wrmsrl(msr, __pa_symbol(hypercall_page));
+
+	if (!xen_shared_info)
+		prepare_shared_info_page();
+
+	memset(&vrrma, 0, sizeof(vrrma));
+	set_xen_guest_handle(vrrma.addr.h, &__get_cpu_var(runstate));
+	ret = HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
+				 c->cpu_index, &vrrma);
+	if (ret) {
+		printk(KERN_WARNING
+		       "Could not register runstate area for CPU%u: %d\n",
+		       c->cpu_index, ret);
+		BUG_ON(boot_cpu_has(X86_FEATURE_SPINLOCK_YIELD));
+		return;
+	}
+
+	if (c != &boot_cpu_data || !percpu_read(poll_evtchn)) {
+		struct evtchn_bind_ipi bind_ipi;
+
+		bind_ipi.vcpu = c->cpu_index;
+		ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi,
+						  &bind_ipi);
+		if (ret) {
+			printk(KERN_WARNING
+			       "Could not bind event channel for CPU%u: %d\n",
+			       c->cpu_index, ret);
+			BUG_ON(boot_cpu_has(X86_FEATURE_SPINLOCK_YIELD));
+			return;
+		}
+		sync_set_bit(bind_ipi.port, xen_shared_info->evtchn_mask);
+		percpu_write(poll_evtchn, bind_ipi.port);
+		printk(KERN_INFO "CPU%u spinlock poll event channel: %u\n",
+		       c->cpu_index, bind_ipi.port);
+	}
+
+	virt_spin_lock = xen_spin_lock;
+	virt_spin_unlock = xen_spin_unlock;
+	set_cpu_cap(c, X86_FEATURE_SPINLOCK_YIELD);
+#endif
+}
+
+const __refconst struct hypervisor_x86 x86_hyper_xen = {
+	.name			= "Xen",
+	.detect			= xen_platform,
+	.set_cpu_features	= xen_set_cpu_features
+};


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ