Message-ID: <20160114101615.GA16926@gmail.com>
Date:	Thu, 14 Jan 2016 11:16:15 +0100
From:	Ingo Molnar <mingo@...nel.org>
To:	Linus Torvalds <torvalds@...ux-foundation.org>
Cc:	linux-kernel@...r.kernel.org, Thomas Gleixner <tglx@...utronix.de>,
	"H. Peter Anvin" <hpa@...or.com>,
	Peter Zijlstra <a.p.zijlstra@...llo.nl>,
	Andrew Morton <akpm@...ux-foundation.org>
Subject: [GIT PULL] x86 fixes

Linus,

Please pull the latest x86-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86-urgent-for-linus

   # HEAD: 7030a7e9321166eef44c811fe4af4d460360d424 x86/cpu/amd: Remove an unneeded condition in srat_detect_node()

Misc changes:

 - fix an lguest bug
 - fix /proc/meminfo output on certain configs
 - fix a pvclock bug
 - fix reboot on certain iMacs by adding a new reboot quirk
 - fix a bootup crash
 - fix FPU boot command-line option parsing
 - add more x86 self-tests
 - small cleanups, documentation improvements, etc.

  out-of-topic modifications in x86-urgent-for-linus:
  -----------------------------------------------------
  drivers/lguest/core.c                        # e27d90e8be08: lguest: Map switcher text R/O
  tools/testing/selftests/x86/Makefile         # 65cacec1ba90: selftests/x86: Test __kernel_sigreturn and __kernel_rt_sigreturn
                                               # 0f672809f91a: selftests/x86: Disable the ldt_gdt_64 test for now
  tools/testing/selftests/x86/vdso_restorer.c  # 65cacec1ba90: selftests/x86: Test __kernel_sigreturn and __kernel_rt_sigreturn

 Thanks,

	Ingo

------------------>
Andy Lutomirski (5):
      x86/mm: Add barriers and document switch_mm()-vs-flush synchronization
      selftests/x86: Disable the ldt_gdt_64 test for now
      selftests/x86: Test __kernel_sigreturn and __kernel_rt_sigreturn
      x86/mm: Improve switch_mm() barrier comments
      x86/vdso/pvclock: Protect STABLE check with the seqcount

Borislav Petkov (1):
      x86/boot: Hide local labels in verify_cpu()

Dan Carpenter (1):
      x86/cpu/amd: Remove an unneeded condition in srat_detect_node()

Dave Jones (1):
      x86/mm/pat: Make split_page_count() check for empty levels to fix /proc/meminfo output

H.J. Lu (1):
      x86/boot: Double BOOT_HEAP_SIZE to 64KB

Kefeng Wang (1):
      x86/mm: Use PAGE_ALIGNED instead of IS_ALIGNED

Mario Kleiner (1):
      x86/reboot/quirks: Add iMac10,1 to pci_reboot_dmi_table[]

Rusty Russell (1):
      lguest: Map switcher text R/O

yu-cheng yu (4):
      x86/fpu: Fix early FPU command-line parsing
      x86/fpu: Disable XGETBV1 when no XSAVE
      x86/fpu: Disable MPX when eagerfpu is off
      x86/fpu: Disable AVX when eagerfpu is off


 arch/x86/entry/vdso/vclock_gettime.c        |  12 +--
 arch/x86/include/asm/boot.h                 |   2 +-
 arch/x86/include/asm/fpu/internal.h         |   1 +
 arch/x86/include/asm/fpu/xstate.h           |  11 +-
 arch/x86/include/asm/lguest.h               |   4 +-
 arch/x86/include/asm/mmu_context.h          |  34 +++++-
 arch/x86/kernel/cpu/amd.c                   |   3 +-
 arch/x86/kernel/fpu/init.c                  | 161 ++++++++++++++--------------
 arch/x86/kernel/fpu/xstate.c                |   4 +-
 arch/x86/kernel/reboot.c                    |   8 ++
 arch/x86/kernel/verify_cpu.S                |  50 ++++-----
 arch/x86/mm/init_64.c                       |   3 +-
 arch/x86/mm/pageattr.c                      |   3 +
 arch/x86/mm/tlb.c                           |  29 ++++-
 drivers/lguest/core.c                       |  74 +++++++++----
 tools/testing/selftests/x86/Makefile        |   6 +-
 tools/testing/selftests/x86/vdso_restorer.c |  88 +++++++++++++++
 17 files changed, 342 insertions(+), 151 deletions(-)
 create mode 100644 tools/testing/selftests/x86/vdso_restorer.c

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 8602f06c759f..1a50e09c945b 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode)
 	 *
 	 * On Xen, we don't appear to have that guarantee, but Xen still
 	 * supplies a valid seqlock using the version field.
-
+	 *
 	 * We only do pvclock vdso timing at all if
 	 * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
 	 * mean that all vCPUs have matching pvti and that the TSC is
 	 * synced, so we can just look at vCPU 0's pvti.
 	 */
 
-	if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
-		*mode = VCLOCK_NONE;
-		return 0;
-	}
-
 	do {
 		version = pvti->version;
 
 		smp_rmb();
 
+		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+			*mode = VCLOCK_NONE;
+			return 0;
+		}
+
 		tsc = rdtsc_ordered();
 		pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
 		pvti_tsc_shift = pvti->tsc_shift;
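
For context, the pvclock fix above moves the PVCLOCK_TSC_STABLE_BIT check inside the version loop, so the flag is sampled as part of the same consistent snapshot as the rest of the pvti fields. Below is a minimal userspace sketch of that seqcount-style read-retry idiom, using made-up field names rather than the kernel's pvclock structures:

#include <stdatomic.h>
#include <stdint.h>

/* Illustrative stand-in for the pvclock time info; not the kernel layout. */
struct sample {
        _Atomic uint32_t version;       /* odd while the writer is mid-update */
        uint32_t flags;                 /* e.g. a "TSC stable" bit            */
        uint64_t value;
};

static int read_sample(struct sample *s, uint64_t *out)
{
        uint32_t v, flags;
        uint64_t value;

        do {
                v = atomic_load_explicit(&s->version, memory_order_acquire);
                flags = s->flags;       /* checked *inside* the loop, as in the fix */
                value = s->value;
                atomic_thread_fence(memory_order_acquire);
        } while ((v & 1) ||
                 atomic_load_explicit(&s->version, memory_order_relaxed) != v);

        if (!(flags & 1))               /* "stable" bit clear: caller must fall back */
                return -1;
        *out = value;
        return 0;
}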
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4fa687a47a62..6b8d6e8cd449 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -27,7 +27,7 @@
 #define BOOT_HEAP_SIZE             0x400000
 #else /* !CONFIG_KERNEL_BZIP2 */
 
-#define BOOT_HEAP_SIZE	0x8000
+#define BOOT_HEAP_SIZE	0x10000
 
 #endif /* !CONFIG_KERNEL_BZIP2 */
 
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index eadcdd5bb946..0fd440df63f1 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
 extern void fpu__init_system(struct cpuinfo_x86 *c);
 extern void fpu__init_check_bugs(void);
 extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);
 
 /*
  * Debugging facility:
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 3a6c89b70307..af30fdeb140d 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -20,15 +20,16 @@
 
 /* Supported features which support lazy state saving */
 #define XFEATURE_MASK_LAZY	(XFEATURE_MASK_FP | \
-				 XFEATURE_MASK_SSE | \
+				 XFEATURE_MASK_SSE)
+
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | \
+				 XFEATURE_MASK_BNDCSR | \
 				 XFEATURE_MASK_YMM | \
-				 XFEATURE_MASK_OPMASK |	\
+				 XFEATURE_MASK_OPMASK | \
 				 XFEATURE_MASK_ZMM_Hi256 | \
 				 XFEATURE_MASK_Hi16_ZMM)
 
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
-
 /* All currently supported features */
 #define XCNTXT_MASK	(XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
 
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 3bbc07a57a31..73d0c9b92087 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -12,7 +12,9 @@
 #define GUEST_PL 1
 
 /* Page for Switcher text itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+#define SWITCHER_TEXT_PAGES (1)
+#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
+#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
 
 /* Where we map the Switcher, in both Host and Guest. */
 extern unsigned long switcher_addr;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd3658799..bfd9b2a35a0b 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,36 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #endif
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		/* Re-load page tables */
+		/*
+		 * Re-load page tables.
+		 *
+		 * This logic has an ordering constraint:
+		 *
+		 *  CPU 0: Write to a PTE for 'next'
+		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+		 *  CPU 1: set bit 1 in next's mm_cpumask
+		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+		 *
+		 * We need to prevent an outcome in which CPU 1 observes
+		 * the new PTE value and CPU 0 observes bit 1 clear in
+		 * mm_cpumask.  (If that occurs, then the IPI will never
+		 * be sent, and CPU 0's TLB will contain a stale entry.)
+		 *
+		 * The bad outcome can occur if either CPU's load is
+		 * reordered before that CPU's store, so both CPUs must
+		 * execute full barriers to prevent this from happening.
+		 *
+		 * Thus, switch_mm needs a full barrier between the
+		 * store to mm_cpumask and any operation that could load
+		 * from next->pgd.  TLB fills are special and can happen
+		 * due to instruction fetches or for no reason at all,
+		 * and neither LOCK nor MFENCE orders them.
+		 * Fortunately, load_cr3() is serializing and gives the
+		 * ordering guarantee we need.
+		 *
+		 */
 		load_cr3(next->pgd);
+
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
 		/* Stop flush ipis for the previous mm */
@@ -156,10 +184,14 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			 * schedule, protecting us from simultaneous changes.
 			 */
 			cpumask_set_cpu(cpu, mm_cpumask(next));
+
 			/*
 			 * We were in lazy tlb mode and leave_mm disabled
 			 * tlb flush IPI delivery. We must reload CR3
 			 * to make sure to use no freed page tables.
+			 *
+			 * As above, load_cr3() is serializing and orders TLB
+			 * fills with respect to the mm_cpumask write.
 			 */
 			load_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
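
As a side note, the hazard the new switch_mm() comment describes is a classic store-then-load reordering problem: each side needs a full barrier between its store and its subsequent load, or the flush IPI can be skipped while a stale TLB entry survives. A rough litmus-style sketch in C11 atomics (illustrative only; in the kernel, the barrier on the switch_mm() side is the serializing CR3 write itself):

#include <stdatomic.h>

static _Atomic int pte;                 /* stands in for the PTE write         */
static _Atomic int cpumask_bit;         /* stands in for the bit in mm_cpumask */
static int ipi_sent;

static void cpu0(void)                  /* the CPU modifying the page tables   */
{
        atomic_store(&pte, 1);
        atomic_thread_fence(memory_order_seq_cst);      /* full barrier required */
        if (atomic_load(&cpumask_bit))
                ipi_sent = 1;                           /* send the flush IPI    */
}

static void cpu1(void)                  /* the CPU switching to the mm         */
{
        atomic_store(&cpumask_bit, 1);
        atomic_thread_fence(memory_order_seq_cst);      /* load_cr3() provides this */
        (void)atomic_load(&pte);                        /* stands in for a TLB fill */
}

Without the two fences, both loads may observe the old values, in which case cpu0 never sends the IPI even though cpu1 is now using the mm.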
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e678ddeed030..a07956a08936 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 		 */
 		int ht_nodeid = c->initial_apicid;
 
-		if (ht_nodeid >= 0 &&
-		    __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+		if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
 			node = __apicid_to_node[ht_nodeid];
 		/* Pick a nearby node */
 		if (!node_online(node))
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7b2978ab30df..6d9f0a7ef4c8 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -3,8 +3,11 @@
  */
 #include <asm/fpu/internal.h>
 #include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
 
 #include <linux/sched.h>
+#include <linux/init.h>
 
 /*
  * Initialize the TS bit in CR0 according to the style of context-switches
@@ -270,20 +273,52 @@ static void __init fpu__init_system_xstate_size_legacy(void)
  */
 static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
 
-static int __init eager_fpu_setup(char *s)
+/*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
 {
-	if (!strcmp(s, "on"))
-		eagerfpu = ENABLE;
-	else if (!strcmp(s, "off"))
-		eagerfpu = DISABLE;
-	else if (!strcmp(s, "auto"))
-		eagerfpu = AUTO;
-	return 1;
+	/* Support all xfeatures known to us */
+	if (eagerfpu != DISABLE)
+		return XCNTXT_MASK;
+
+	/* Warning of xfeatures being disabled for no eagerfpu mode */
+	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
+		pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+			xfeatures_mask & XFEATURE_MASK_EAGER);
+	}
+
+	/* Return a mask that masks out all features requiring eagerfpu mode */
+	return ~XFEATURE_MASK_EAGER;
+}
+
+/*
+ * Disable features dependent on eagerfpu.
+ */
+static void __init fpu__clear_eager_fpu_features(void)
+{
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
 }
-__setup("eagerfpu=", eager_fpu_setup);
 
 /*
  * Pick the FPU context switching strategy:
+ *
+ * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
+ * the following is true:
+ *
+ * (1) the cpu has xsaveopt, as it has the optimization and doing eager
+ *     FPU switching has a relatively low cost compared to a plain xsave;
+ * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
+ *     switching. Should the kernel boot with noxsaveopt, we support MPX
+ *     with eager FPU switching at a higher cost.
  */
 static void __init fpu__init_system_ctx_switch(void)
 {
@@ -295,19 +330,11 @@ static void __init fpu__init_system_ctx_switch(void)
 	WARN_ON_FPU(current->thread.fpu.fpstate_active);
 	current_thread_info()->status = 0;
 
-	/* Auto enable eagerfpu for xsaveopt */
 	if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
 		eagerfpu = ENABLE;
 
-	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
-		if (eagerfpu == DISABLE) {
-			pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
-			       xfeatures_mask & XFEATURE_MASK_EAGER);
-			xfeatures_mask &= ~XFEATURE_MASK_EAGER;
-		} else {
-			eagerfpu = ENABLE;
-		}
-	}
+	if (xfeatures_mask & XFEATURE_MASK_EAGER)
+		eagerfpu = ENABLE;
 
 	if (eagerfpu == ENABLE)
 		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@@ -316,11 +343,48 @@ static void __init fpu__init_system_ctx_switch(void)
 }
 
 /*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+	/*
+	 * No need to check "eagerfpu=auto" again, since it is the
+	 * initial default.
+	 */
+	if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
+		eagerfpu = DISABLE;
+		fpu__clear_eager_fpu_features();
+	} else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
+		eagerfpu = ENABLE;
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "no387"))
+		setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+	if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+		setup_clear_cpu_cap(X86_FEATURE_FXSR);
+		setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+		setup_clear_cpu_cap(X86_FEATURE_XMM);
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+		fpu__xstate_clear_all_cpu_caps();
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+		setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+		setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
  * Called on the boot CPU once per system bootup, to set up the initial
  * FPU state that is later cloned into all processes:
  */
 void __init fpu__init_system(struct cpuinfo_x86 *c)
 {
+	fpu__init_parse_early_param();
 	fpu__init_system_early_generic(c);
 
 	/*
@@ -344,62 +408,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
 
 	fpu__init_system_ctx_switch();
 }
-
-/*
- * Boot parameter to turn off FPU support and fall back to math-emu:
- */
-static int __init no_387(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FPU);
-	return 1;
-}
-__setup("no387", no_387);
-
-/*
- * Disable all xstate CPU features:
- */
-static int __init x86_noxsave_setup(char *s)
-{
-	if (strlen(s))
-		return 0;
-
-	fpu__xstate_clear_all_cpu_caps();
-
-	return 1;
-}
-__setup("noxsave", x86_noxsave_setup);
-
-/*
- * Disable the XSAVEOPT instruction specifically:
- */
-static int __init x86_noxsaveopt_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
-	return 1;
-}
-__setup("noxsaveopt", x86_noxsaveopt_setup);
-
-/*
- * Disable the XSAVES instruction:
- */
-static int __init x86_noxsaves_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-
-	return 1;
-}
-__setup("noxsaves", x86_noxsaves_setup);
-
-/*
- * Disable FX save/restore and SSE support:
- */
-static int __init x86_nofxsr_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FXSR);
-	setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
-	setup_clear_cpu_cap(X86_FEATURE_XMM);
-
-	return 1;
-}
-__setup("nofxsr", x86_nofxsr_setup);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 40f100285984..d425cda5ae6d 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
 	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
 	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
 }
 
 /*
@@ -632,8 +633,7 @@ void __init fpu__init_system_xstate(void)
 		BUG();
 	}
 
-	/* Support only the state known to the OS: */
-	xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
 
 	/* Enable xstate instructions to be able to continue with initialization: */
 	fpu__init_cpu_xstate();
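
To illustrate the interaction: with the xstate.h change earlier in this series, everything except plain FP/SSE now sits in XFEATURE_MASK_EAGER, so fpu__get_supported_xfeatures_mask() can strip all of it with a single mask operation when eagerfpu is disabled, as applied above. A toy sketch of that mask arithmetic (made-up bit values, not the real XFEATURE_MASK_* constants):

#include <stdio.h>
#include <stdint.h>

/* Made-up bit values, NOT the kernel's XFEATURE_MASK_* constants. */
#define FP              (1u << 0)
#define SSE             (1u << 1)
#define YMM             (1u << 2)
#define BNDREGS         (1u << 3)
#define BNDCSR          (1u << 4)

#define MASK_LAZY       (FP | SSE)
#define MASK_EAGER      (BNDREGS | BNDCSR | YMM)        /* AVX etc. now live here */
#define XCNTXT_MASK     (MASK_LAZY | MASK_EAGER)

/* Mirrors the shape of fpu__get_supported_xfeatures_mask(). */
static uint32_t supported_xfeatures_mask(int eagerfpu_disabled)
{
        if (!eagerfpu_disabled)
                return XCNTXT_MASK;
        return ~MASK_EAGER;     /* mask out everything that needs eager switching */
}

int main(void)
{
        uint32_t xfeatures_mask = XCNTXT_MASK;  /* as enumerated from CPUID */

        xfeatures_mask &= supported_xfeatures_mask(1 /* "eagerfpu=off" */);
        printf("remaining xfeatures: %#x\n", xfeatures_mask);  /* FP | SSE */
        return 0;
}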
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d64889aa2d46..ab0adc0fa5db 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
 		},
 	},
+	{	/* Handle problems with rebooting on the iMac10,1. */
+		.callback = set_pci_reboot,
+		.ident = "Apple iMac10,1",
+		.matches = {
+		    DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+		    DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+		},
+	},
 
 	/* ASRock */
 	{	/* Handle problems with rebooting on ASRock Q1900DC-ITX */
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 4cf401f581e7..07efb35ee4bc 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -48,31 +48,31 @@
 	pushfl
 	popl	%eax
 	cmpl	%eax,%ebx
-	jz	verify_cpu_no_longmode	# cpu has no cpuid
+	jz	.Lverify_cpu_no_longmode	# cpu has no cpuid
 #endif
 
 	movl	$0x0,%eax		# See if cpuid 1 is implemented
 	cpuid
 	cmpl	$0x1,%eax
-	jb	verify_cpu_no_longmode	# no cpuid 1
+	jb	.Lverify_cpu_no_longmode	# no cpuid 1
 
 	xor	%di,%di
 	cmpl	$0x68747541,%ebx	# AuthenticAMD
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	cmpl	$0x69746e65,%edx
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	cmpl	$0x444d4163,%ecx
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	mov	$1,%di			# cpu is from AMD
-	jmp	verify_cpu_check
+	jmp	.Lverify_cpu_check
 
-verify_cpu_noamd:
+.Lverify_cpu_noamd:
 	cmpl	$0x756e6547,%ebx        # GenuineIntel?
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 	cmpl	$0x49656e69,%edx
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 	cmpl	$0x6c65746e,%ecx
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 
 	# only call IA32_MISC_ENABLE when:
 	# family > 6 || (family == 6 && model >= 0xd)
@@ -83,59 +83,59 @@
 	andl	$0x0ff00f00, %eax	# mask family and extended family
 	shrl	$8, %eax
 	cmpl	$6, %eax
-	ja	verify_cpu_clear_xd	# family > 6, ok
-	jb	verify_cpu_check	# family < 6, skip
+	ja	.Lverify_cpu_clear_xd	# family > 6, ok
+	jb	.Lverify_cpu_check	# family < 6, skip
 
 	andl	$0x000f00f0, %ecx	# mask model and extended model
 	shrl	$4, %ecx
 	cmpl	$0xd, %ecx
-	jb	verify_cpu_check	# family == 6, model < 0xd, skip
+	jb	.Lverify_cpu_check	# family == 6, model < 0xd, skip
 
-verify_cpu_clear_xd:
+.Lverify_cpu_clear_xd:
 	movl	$MSR_IA32_MISC_ENABLE, %ecx
 	rdmsr
 	btrl	$2, %edx		# clear MSR_IA32_MISC_ENABLE_XD_DISABLE
-	jnc	verify_cpu_check	# only write MSR if bit was changed
+	jnc	.Lverify_cpu_check	# only write MSR if bit was changed
 	wrmsr
 
-verify_cpu_check:
+.Lverify_cpu_check:
 	movl    $0x1,%eax		# Does the cpu have what it takes
 	cpuid
 	andl	$REQUIRED_MASK0,%edx
 	xorl	$REQUIRED_MASK0,%edx
-	jnz	verify_cpu_no_longmode
+	jnz	.Lverify_cpu_no_longmode
 
 	movl    $0x80000000,%eax	# See if extended cpuid is implemented
 	cpuid
 	cmpl    $0x80000001,%eax
-	jb      verify_cpu_no_longmode	# no extended cpuid
+	jb      .Lverify_cpu_no_longmode	# no extended cpuid
 
 	movl    $0x80000001,%eax	# Does the cpu have what it takes
 	cpuid
 	andl    $REQUIRED_MASK1,%edx
 	xorl    $REQUIRED_MASK1,%edx
-	jnz     verify_cpu_no_longmode
+	jnz     .Lverify_cpu_no_longmode
 
-verify_cpu_sse_test:
+.Lverify_cpu_sse_test:
 	movl	$1,%eax
 	cpuid
 	andl	$SSE_MASK,%edx
 	cmpl	$SSE_MASK,%edx
-	je	verify_cpu_sse_ok
+	je	.Lverify_cpu_sse_ok
 	test	%di,%di
-	jz	verify_cpu_no_longmode	# only try to force SSE on AMD
+	jz	.Lverify_cpu_no_longmode	# only try to force SSE on AMD
 	movl	$MSR_K7_HWCR,%ecx
 	rdmsr
 	btr	$15,%eax		# enable SSE
 	wrmsr
 	xor	%di,%di			# don't loop
-	jmp	verify_cpu_sse_test	# try again
+	jmp	.Lverify_cpu_sse_test	# try again
 
-verify_cpu_no_longmode:
+.Lverify_cpu_no_longmode:
 	popf				# Restore caller passed flags
 	movl $1,%eax
 	ret
-verify_cpu_sse_ok:
+.Lverify_cpu_sse_ok:
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	ret
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec081fe0ce2c..8829482d69ec 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 		if (phys_addr < (phys_addr_t)0x40000000)
 			return;
 
-		if (IS_ALIGNED(addr, PAGE_SIZE) &&
-		    IS_ALIGNED(next, PAGE_SIZE)) {
+		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
 			/*
 			 * Do not free direct mapping pages since they were
 			 * freed when offlining, or simplely not in use.
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6000ad7f560c..fc6a4c8f6e2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)
 
 static void split_page_count(int level)
 {
+	if (direct_pages_count[level] == 0)
+		return;
+
 	direct_pages_count[level]--;
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0d66fb..8f4cc3dfac32 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
 	preempt_disable();
 
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+	/* This is an implicit full barrier that synchronizes with switch_mm. */
 	local_flush_tlb();
+
 	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
 
 	preempt_disable();
-	if (current->active_mm != mm)
+	if (current->active_mm != mm) {
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
+	}
 
 	if (!current->mm) {
 		leave_mm(smp_processor_id());
+
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
 	}
 
 	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
 		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
 
+	/*
+	 * Both branches below are implicit full barriers (MOV to CR or
+	 * INVLPG) that synchronize with switch_mm.
+	 */
 	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
 		base_pages_to_flush = TLB_FLUSH_ALL;
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 	preempt_disable();
 
 	if (current->active_mm == mm) {
-		if (current->mm)
+		if (current->mm) {
+			/*
+			 * Implicit full barrier (INVLPG) that synchronizes
+			 * with switch_mm.
+			 */
 			__flush_tlb_one(start);
-		else
+		} else {
 			leave_mm(smp_processor_id());
+
+			/* Synchronize with switch_mm. */
+			smp_mb();
+		}
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 312ffd3d0017..9e385b38debf 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -22,7 +22,8 @@
 
 unsigned long switcher_addr;
 struct page **lg_switcher_pages;
-static struct vm_struct *switcher_vma;
+static struct vm_struct *switcher_text_vma;
+static struct vm_struct *switcher_stacks_vma;
 
 /* This One Big lock protects all inter-guest data structures. */
 DEFINE_MUTEX(lguest_lock);
@@ -83,54 +84,80 @@ static __init int map_switcher(void)
 	}
 
 	/*
+	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
+	 * It goes in the first page, which we map in momentarily.
+	 */
+	memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
+	       end_switcher_text - start_switcher_text);
+	kunmap(lg_switcher_pages[0]);
+
+	/*
 	 * We place the Switcher underneath the fixmap area, which is the
 	 * highest virtual address we can get.  This is important, since we
 	 * tell the Guest it can't access this memory, so we want its ceiling
 	 * as high as possible.
 	 */
-	switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
+	switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
 
 	/*
-	 * Now we reserve the "virtual memory area" we want.  We might
-	 * not get it in theory, but in practice it's worked so far.
-	 * The end address needs +1 because __get_vm_area allocates an
-	 * extra guard page, so we need space for that.
+	 * Now we reserve the "virtual memory area"s we want.  We might
+	 * not get them in theory, but in practice it's worked so far.
+	 *
+	 * We want the switcher text to be read-only and executable, and
+	 * the stacks to be read-write and non-executable.
 	 */
-	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
-				     VM_ALLOC, switcher_addr, switcher_addr
-				     + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
-	if (!switcher_vma) {
+	switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
+					  switcher_addr,
+					  switcher_addr + PAGE_SIZE);
+
+	if (!switcher_text_vma) {
 		err = -ENOMEM;
 		printk("lguest: could not map switcher pages high\n");
 		goto free_pages;
 	}
 
+	switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
+					    VM_ALLOC|VM_NO_GUARD,
+					    switcher_addr + PAGE_SIZE,
+					    switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
+	if (!switcher_stacks_vma) {
+		err = -ENOMEM;
+		printk("lguest: could not map switcher pages high\n");
+		goto free_text_vma;
+	}
+
 	/*
 	 * This code actually sets up the pages we've allocated to appear at
 	 * switcher_addr.  map_vm_area() takes the vma we allocated above, the
-	 * kind of pages we're mapping (kernel pages), and a pointer to our
-	 * array of struct pages.
+	 * kind of pages we're mapping (kernel text pages and kernel writable
+	 * pages respectively), and a pointer to our array of struct pages.
 	 */
-	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages);
+	err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
+	if (err) {
+		printk("lguest: text map_vm_area failed: %i\n", err);
+		goto free_vmas;
+	}
+
+	err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
+			  lg_switcher_pages + SWITCHER_TEXT_PAGES);
 	if (err) {
-		printk("lguest: map_vm_area failed: %i\n", err);
-		goto free_vma;
+		printk("lguest: stacks map_vm_area failed: %i\n", err);
+		goto free_vmas;
 	}
 
 	/*
 	 * Now the Switcher is mapped at the right address, we can't fail!
-	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
 	 */
-	memcpy(switcher_vma->addr, start_switcher_text,
-	       end_switcher_text - start_switcher_text);
-
 	printk(KERN_INFO "lguest: mapped switcher at %p\n",
-	       switcher_vma->addr);
+	       switcher_text_vma->addr);
 	/* And we succeeded... */
 	return 0;
 
-free_vma:
-	vunmap(switcher_vma->addr);
+free_vmas:
+	/* Undoes map_vm_area and __get_vm_area */
+	vunmap(switcher_stacks_vma->addr);
+free_text_vma:
+	vunmap(switcher_text_vma->addr);
 free_pages:
 	i = TOTAL_SWITCHER_PAGES;
 free_some_pages:
@@ -148,7 +175,8 @@ static void unmap_switcher(void)
 	unsigned int i;
 
 	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-	vunmap(switcher_vma->addr);
+	vunmap(switcher_text_vma->addr);
+	vunmap(switcher_stacks_vma->addr);
 	/* Now we just need to free the pages we copied the switcher into */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
 		__free_pages(lg_switcher_pages[i], 0);
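
For context, the lguest change follows the usual W^X discipline: the Switcher text is copied into its page while it is still reachable read-write (via kmap()), and only afterwards mapped read-only and executable, with the per-CPU stacks mapped writable but non-executable. A userspace sketch of the same discipline using mmap()/mprotect() (illustrative only; the single machine-code byte is x86-specific):

#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);

        /* Start with a writable mapping so the "text" can be copied in... */
        unsigned char *text = mmap(NULL, page, PROT_READ | PROT_WRITE,
                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (text == MAP_FAILED)
                return 1;

        static const unsigned char code[] = { 0xc3 };   /* x86 'ret' */
        memcpy(text, code, sizeof(code));

        /* ...then drop write permission before the code is ever executed. */
        if (mprotect(text, page, PROT_READ | PROT_EXEC) != 0)
                return 1;

        ((void (*)(void))text)();       /* call into the now R/X page */
        return 0;
}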
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index eabcff411984..d0c473f65850 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,9 +4,11 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
-			test_FCMOV test_FCOMI test_FISTTP
+			test_FCMOV test_FCOMI test_FISTTP \
+			ldt_gdt \
+			vdso_restorer
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c
new file mode 100644
index 000000000000..cb038424a403
--- /dev/null
+++ b/tools/testing/selftests/x86/vdso_restorer.c
@@ -0,0 +1,88 @@
+/*
+ * vdso_restorer.c - tests vDSO-based signal restore
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This makes sure that sa_restorer == NULL keeps working on 32-bit
+ * configurations.  Modern glibc doesn't use it under any circumstances,
+ * so it's easy to overlook breakage.
+ *
+ * 64-bit userspace has never supported sa_restorer == NULL, so this is
+ * 32-bit only.
+ */
+
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/syscall.h>
+
+/* Open-code this -- the headers are too messy to easily use them. */
+struct real_sigaction {
+	void *handler;
+	unsigned long flags;
+	void *restorer;
+	unsigned int mask[2];
+};
+
+static volatile sig_atomic_t handler_called;
+
+static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void)
+{
+	handler_called = 1;
+}
+
+static void handler_without_siginfo(int sig)
+{
+	handler_called = 1;
+}
+
+int main()
+{
+	int nerrs = 0;
+	struct real_sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.handler = handler_with_siginfo;
+	sa.flags = SA_SIGINFO;
+	sa.restorer = NULL;	/* request kernel-provided restorer */
+
+	if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0)
+		err(1, "raw rt_sigaction syscall");
+
+	raise(SIGUSR1);
+
+	if (handler_called) {
+		printf("[OK]\tSA_SIGINFO handler returned successfully\n");
+	} else {
+		printf("[FAIL]\tSA_SIGINFO handler was not called\n");
+		nerrs++;
+	}
+
+	sa.flags = 0;
+	sa.handler = handler_without_siginfo;
+	if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0)
+		err(1, "raw sigaction syscall");
+	handler_called = 0;
+
+	raise(SIGUSR1);
+
+	if (handler_called) {
+		printf("[OK]\t!SA_SIGINFO handler returned successfully\n");
+	} else {
+		printf("[FAIL]\t!SA_SIGINFO handler was not called\n");
+		nerrs++;
+	}
+}
