lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Thu,  9 Jul 2015 19:17:33 -0700
From:	Andy Lutomirski <luto@...nel.org>
To:	x86@...nel.org, linux-kernel@...r.kernel.org
Cc:	Frédéric Weisbecker <fweisbec@...il.com>,
	Rik van Riel <riel@...hat.com>,
	Oleg Nesterov <oleg@...hat.com>,
	Denys Vlasenko <vda.linux@...glemail.com>,
	Borislav Petkov <bp@...en8.de>,
	Kees Cook <keescook@...omium.org>,
	Brian Gerst <brgerst@...il.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Andy Lutomirski <luto@...nel.org>
Subject: [RFC/PATCH v2 v2 5/6] x86/entry/32: Migrate to C exit path and rework vm86 exit hack

This removes the hybrid asm-and-C implementation of exit work.

This patch modifies a giant hack.  vm86 used to fiddle with
TIF_NOTIFY_RESUME and fix itself up in the exit asm.  The hack was
messy and completely incorrect: it broke vm86 if the syscall slow
path was being used.

Rework the hack.  We now forcibly exit vm86 mode on return to
userspace if we're delivering a signal (this is needed to deliver
the signal correctly) or if a new TIF_EXIT_VM86 flag is set.  The
TIF_NOTIFY_RESUME hack is changed to use TIF_EXIT_VM86 instead.

This makes prepare_exit_to_usermode a bit slower on CONFIG_VM86=y
kernels.  People shouldn't use such kernels if they care about
sanity, security, or performance.

Brian Gerst is planning to further rework vm86 mode to leave pt_regs
where it belongs.  That will allow us to revert the
pt_regs_to_thread_info slowdown the stack switching parts of this
code; instead we can just exit normally, as vm86 won't have a
special stack layout any more.

Before this change, the entry_from_vm86 test failed under strace.
Now it passes.

Signed-off-by: Andy Lutomirski <luto@...nel.org>
---
 arch/x86/entry/common.c            | 56 ++++++++++++++++++++++++++-
 arch/x86/entry/entry_32.S          | 79 ++++++--------------------------------
 arch/x86/include/asm/thread_info.h |  2 +
 arch/x86/kernel/vm86_32.c          |  6 +--
 4 files changed, 69 insertions(+), 74 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index febc53086a69..aeaf7d64be0f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -240,10 +240,51 @@ void syscall_trace_leave(struct pt_regs *regs)
 
 static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
 {
+#ifdef CONFIG_VM86
+	/*
+	 * In VM86 mode, pt_regs isn't in a well-defined place on the
+	 * stack.  Skip the optimization entirely.
+	 */
+	return current_thread_info();
+#else
 	unsigned long top_of_stack =
 		(unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
 	return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+#endif
+}
+
+#ifdef CONFIG_VM86
+static void __noreturn exit_vm86_immediately(struct pt_regs *regs)
+{
+	/*
+	 * VM86 sometimes needs to exit back to normal user mode
+	 * (unsurprisingly) and its hack of resetting the stack and
+	 * jumping into the exit asm isn't always usable (also
+	 * unsurprisingly).  Instead, we land in this abomination.
+	 *
+	 * While I can't defend this code as being anything other
+	 * than awful, at least it's more or less self-contained,
+	 * and it's less awful and much less buggy than the even
+	 * worse hack it replaces.  --Andy
+	 */
+	struct pt_regs *regs32;
+
+	clear_tsk_thread_flag(current, TIF_EXIT_VM86);
+	regs32 = save_v86_state((struct kernel_vm86_regs *)regs);
+	local_irq_disable();
+	__asm__ __volatile__(
+		"movl %0,%%esp\n\t"
+		"movl %1,%%ebp\n\t"
+		"jmp resume_userspace"
+		: : "r" (regs32), "r" (current_thread_info()));
+
+	/*
+	 * We don't get here.  Instead we restart
+	 * prepare_exit_to_usermode via resume_userspace.
+	 */
+	unreachable();
 }
+#endif
 
 /* Called with IRQs disabled. */
 __visible void prepare_exit_to_usermode(struct pt_regs *regs)
@@ -264,12 +305,18 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
 			READ_ONCE(pt_regs_to_thread_info(regs)->flags);
 
 		if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
-				      _TIF_UPROBE | _TIF_NEED_RESCHED)))
+				      _TIF_UPROBE | _TIF_NEED_RESCHED |
+				      _TIF_EXIT_VM86)))
 			break;
 
 		/* We have work to do. */
 		local_irq_enable();
 
+#ifdef CONFIG_VM86
+		if (cached_flags & _TIF_EXIT_VM86)
+			exit_vm86_immediately(regs);
+#endif
+
 		if (cached_flags & _TIF_NEED_RESCHED)
 			schedule();
 
@@ -277,8 +324,13 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
 			uprobe_notify_resume(regs);
 
 		/* deal with pending signal delivery */
-		if (cached_flags & _TIF_SIGPENDING)
+		if (cached_flags & _TIF_SIGPENDING) {
+#ifdef CONFIG_VM86
+			if (v8086_mode(regs))
+				exit_vm86_immediately(regs);
+#endif
 			do_signal(regs);
+		}
 
 		if (cached_flags & _TIF_NOTIFY_RESUME) {
 			clear_thread_flag(TIF_NOTIFY_RESUME);
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 66ff9c4055d7..b2909bf8cf70 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -256,14 +256,10 @@ ret_from_intr:
 
 ENTRY(resume_userspace)
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done on
-						# int/exception return?
-	jne	work_pending
+	movl	%esp, %eax
+	call	prepare_exit_to_usermode
 	jmp	restore_all
 END(ret_from_exception)
 
@@ -341,7 +337,7 @@ sysenter_after_call:
 	TRACE_IRQS_OFF
 	movl	TI_flags(%ebp), %ecx
 	testl	$_TIF_ALLWORK_MASK, %ecx
-	jnz	syscall_exit_work
+	jnz	syscall_exit_work_irqs_off
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
 	movl	PT_EIP(%esp), %edx
@@ -377,13 +373,7 @@ syscall_after_call:
 	movl	%eax, PT_EAX(%esp)		# store the return value
 syscall_exit:
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	testl	$_TIF_ALLWORK_MASK, %ecx	# current->work
-	jnz	syscall_exit_work
+	jmp	syscall_exit_work
 
 restore_all:
 	TRACE_IRQS_IRET
@@ -460,52 +450,6 @@ ldt_ss:
 #endif
 ENDPROC(entry_INT80_32)
 
-	# perform work that needs to be done immediately before resumption
-	ALIGN
-work_pending:
-	testb	$_TIF_NEED_RESCHED, %cl
-	jz	work_notifysig
-work_resched:
-	call	schedule
-	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done other
-						# than syscall tracing?
-	jz	restore_all
-	testb	$_TIF_NEED_RESCHED, %cl
-	jnz	work_resched
-
-work_notifysig:					# deal with pending signals and
-						# notify-resume requests
-#ifdef CONFIG_VM86
-	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
-	movl	%esp, %eax
-	jnz	work_notifysig_v86		# special case for v86
-1:
-#else
-	movl	%esp, %eax
-#endif
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	xorl	%edx, %edx
-	call	do_notify_resume
-	jmp	resume_userspace
-
-#ifdef CONFIG_VM86
-	ALIGN
-work_notifysig_v86:
-	pushl	%ecx				# save ti_flags for do_notify_resume
-	call	save_v86_state			# %eax contains pt_regs pointer
-	popl	%ecx
-	movl	%eax, %esp
-	jmp	1b
-#endif
-END(work_pending)
-
 	# perform syscall exit tracing
 	ALIGN
 syscall_trace_entry:
@@ -520,15 +464,14 @@ END(syscall_trace_entry)
 
 	# perform syscall exit tracing
 	ALIGN
-syscall_exit_work:
-	testl	$_TIF_WORK_SYSCALL_EXIT, %ecx
-	jz	work_pending
+syscall_exit_work_irqs_off:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)		# could let syscall_trace_leave() call
-						# schedule() instead
+	ENABLE_INTERRUPTS(CLBR_ANY)
+
+syscall_exit_work:
 	movl	%esp, %eax
-	call	syscall_trace_leave
-	jmp	resume_userspace
+	call	syscall_return_slowpath
+	jmp	restore_all
 END(syscall_exit_work)
 
 syscall_fault:
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 225ee545e1a0..5a60392ce70e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -95,6 +95,7 @@ struct thread_info {
 #define TIF_SYSCALL_EMU		6	/* syscall emulation active */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
+#define TIF_EXIT_VM86		9	/* deferred vm86 exit */
 #define TIF_USER_RETURN_NOTIFY	11	/* notify kernel of userspace return */
 #define TIF_UPROBE		12	/* breakpointed or singlestepping */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
@@ -119,6 +120,7 @@ struct thread_info {
 #define _TIF_SYSCALL_EMU	(1 << TIF_SYSCALL_EMU)
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
+#define _TIF_EXIT_VM86		(1 << TIF_EXIT_VM86)
 #define _TIF_USER_RETURN_NOTIFY	(1 << TIF_USER_RETURN_NOTIFY)
 #define _TIF_UPROBE		(1 << TIF_UPROBE)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index fc9db6ef2a95..46dcef7046b6 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -549,11 +549,9 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
 	if (VMPI.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
+			/* Queue up a return to normal userspace. */
 			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
-			/* setting this flag forces the code in entry_32.S to
-			   the path where we call save_v86_state() and change
-			   the stack pointer to KVM86->regs32 */
-			set_thread_flag(TIF_NOTIFY_RESUME);
+			set_thread_flag(TIF_EXIT_VM86);
 			return 0;
 		}
 		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
-- 
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ