Date:	Sun, 10 Aug 2014 20:47:17 +0200
From:	Denys Vlasenko <dvlasenk@...hat.com>
To:	linux-kernel@...r.kernel.org
Cc:	Denys Vlasenko <dvlasenk@...hat.com>,
	Linus Torvalds <torvalds@...ux-foundation.org>,
	Oleg Nesterov <oleg@...hat.com>,
	"H. Peter Anvin" <hpa@...or.com>,
	Andy Lutomirski <luto@...capital.net>,
	Frederic Weisbecker <fweisbec@...il.com>,
	X86 ML <x86@...nel.org>, Alexei Starovoitov <ast@...mgrid.com>,
	Will Drewry <wad@...omium.org>,
	Kees Cook <keescook@...omium.org>
Subject: [PATCH 17/17 v2] x86: simplify iret stack handling on SYSCALL64 fastpath

Before this patch, rcx and r11 were saved in pt_regs->rcx
and pt_regs->r11. This looks natural, but it requires messy
shuffling to/from the iret frame whenever ptrace or e.g. sys_iopl
wants to modify the return address or flags - because that is
how these registers are used by SYSCALL/SYSRET.
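
For reference (not part of the patch), the relevant 64-bit hardware
semantics, slightly simplified:

	/*
	 * SYSCALL:  rcx    := rip            (return address)
	 *           r11    := rflags
	 *           rip    := MSR_LSTAR      (kernel entry point)
	 *           rflags &= ~MSR_SYSCALL_MASK
	 *
	 * SYSRET64: rip    := rcx
	 *           rflags := r11             (RF and VM forced to 0)
	 */

This is why rcx and r11 cannot carry live user values across the
instruction pair, and why they are the natural home for the saved
rip and rflags.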

This patch saves rcx and r11 in pt_regs->rip and pt_regs->flags
instead, and uses those values for the SYSRET64 insn. The shuffling
is eliminated.

On the slow path, the values are saved in both locations - thus,
ptrace can modify rcx, and on syscall exit rcx will differ from the
return address. I don't see why that would be useful, but anyway,
it works.
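
A hedged sketch of what that means for a tracer (hypothetical
snippet, not from this patch; pid is assumed to be a tracee stopped
at a syscall-exit stop, and error handling is omitted):

	#include <stddef.h>
	#include <sys/ptrace.h>
	#include <sys/types.h>
	#include <sys/user.h>

	/* Overwrite the tracee's rcx at syscall exit.  On the slow
	 * path pt_regs->rcx is now filled in, so this value reaches
	 * userspace rcx after the syscall returns; the return
	 * address, taken from pt_regs->rip, is unaffected. */
	static void poke_rcx(pid_t pid, unsigned long val)
	{
		ptrace(PTRACE_POKEUSER, pid,
		       (void *)offsetof(struct user_regs_struct, rcx),
		       (void *)val);
	}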

The lazy store of pt_regs->cs and pt_regs->ss is retained -
tests have shown that these two stores do cost ~2 cycles on the
fast path.

The FIXUP_TOP_OF_STACK and RESTORE_TOP_OF_STACK macros are replaced
by a single, simpler macro, STORE_IRET_FRAME_CS_SS.

stub_iopl is no longer needed: pt_regs->flags needs no fixing up.

PER_CPU(old_rsp) usage is simplified - it is now used only as
temporary storage: the userspace stack pointer is stored in
pt_regs->sp immediately on syscall entry, instead of being read
later, on syscall exit. This allows us to get rid of
thread_struct::usersp.

Testing shows that the syscall fast path takes ~54.3 ns both before
and after the patch (on a 2.7 GHz Sandy Bridge CPU).

Changes since v1:
* added code to fill pt_regs->rcx and ->r11 on the syscall exit slowpath.

Signed-off-by: Denys Vlasenko <dvlasenk@...hat.com>
CC: Linus Torvalds <torvalds@...ux-foundation.org>
CC: Oleg Nesterov <oleg@...hat.com>
CC: "H. Peter Anvin" <hpa@...or.com>
CC: Andy Lutomirski <luto@...capital.net>
CC: Frederic Weisbecker <fweisbec@...il.com>
CC: X86 ML <x86@...nel.org>
CC: Alexei Starovoitov <ast@...mgrid.com>
CC: Will Drewry <wad@...omium.org>
CC: Kees Cook <keescook@...omium.org>
CC: linux-kernel@...r.kernel.org
---
 arch/x86/include/asm/calling.h   |  18 +++--
 arch/x86/include/asm/compat.h    |   2 +-
 arch/x86/include/asm/processor.h |   1 -
 arch/x86/include/asm/ptrace.h    |   8 +--
 arch/x86/kernel/entry_64.S       | 139 ++++++++++++++++-----------------------
 arch/x86/kernel/process_64.c     |   8 +--
 arch/x86/syscalls/syscall_64.tbl |   2 +-
 arch/x86/um/sys_call_table_64.c  |   2 +-
 8 files changed, 76 insertions(+), 104 deletions(-)

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index aa9113e..7afbcea 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -95,7 +95,7 @@ For 32-bit we have the following conventions - kernel is built with
 	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
 	.endm
 
-	.macro SAVE_C_REGS_HELPER offset=0 rcx=1 r8plus=1
+	.macro SAVE_C_REGS_HELPER offset=0 rcx=1 r8910=1 r11=1
 	movq_cfi rdi, 14*8+\offset
 	movq_cfi rsi, 13*8+\offset
 	movq_cfi rdx, 12*8+\offset
@@ -103,21 +103,26 @@ For 32-bit we have the following conventions - kernel is built with
 	movq_cfi rcx, 11*8+\offset
 	.endif
 	movq_cfi rax, 10*8+\offset
-	.if \r8plus
+	.if \r8910
 	movq_cfi r8,  9*8+\offset
 	movq_cfi r9,  8*8+\offset
 	movq_cfi r10, 7*8+\offset
+	.endif
+	.if \r11
 	movq_cfi r11, 6*8+\offset
 	.endif
 	.endm
 	.macro SAVE_C_REGS offset=0
-	SAVE_C_REGS_HELPER \offset, 1, 1
+	SAVE_C_REGS_HELPER \offset, 1, 1, 1
 	.endm
 	.macro SAVE_C_REGS_EXCEPT_R891011
-	SAVE_C_REGS_HELPER 0, 1, 0
+	SAVE_C_REGS_HELPER 0, 1, 0, 0
 	.endm
 	.macro SAVE_C_REGS_EXCEPT_RCX_R891011
-	SAVE_C_REGS_HELPER 0, 0, 0
+	SAVE_C_REGS_HELPER 0, 0, 0, 0
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_RCX_R11
+	SAVE_C_REGS_HELPER 0, 0, 1, 0
 	.endm
 
 	.macro SAVE_EXTRA_REGS offset=0
@@ -171,6 +176,9 @@ For 32-bit we have the following conventions - kernel is built with
 	.macro RESTORE_C_REGS_EXCEPT_RCX
 	RESTORE_C_REGS_HELPER 1,0,1,1,1
 	.endm
+	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
+	RESTORE_C_REGS_HELPER 1,0,0,1,1
+	.endm
 	.macro RESTORE_RSI_RDI
 	RESTORE_C_REGS_HELPER 0,0,0,0,0
 	.endm
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c40..acdee09 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 		sp = task_pt_regs(current)->sp;
 	} else {
 		/* -128 for the x32 ABI redzone */
-		sp = this_cpu_read(old_rsp) - 128;
+		sp = task_pt_regs(current)->sp - 128;
 	}
 
 	return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a4ea023..33c86c6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -474,7 +474,6 @@ struct thread_struct {
 #ifdef CONFIG_X86_32
 	unsigned long		sysenter_cs;
 #else
-	unsigned long		usersp;	/* Copy from PDA */
 	unsigned short		es;
 	unsigned short		ds;
 	unsigned short		fsindex;
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index c822b35..271c779 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -140,12 +140,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
 #endif
 }
 
-#define current_user_stack_pointer()	this_cpu_read(old_rsp)
-/* ia32 vs. x32 difference */
-#define compat_user_stack_pointer()	\
-	(test_thread_flag(TIF_IA32) 	\
-	 ? current_pt_regs()->sp 	\
-	 : this_cpu_read(old_rsp))
+#define current_user_stack_pointer()	current_pt_regs()->sp
+#define compat_user_stack_pointer()	current_pt_regs()->sp
 #endif
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5d639a6..45e0797 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -24,8 +24,6 @@
  * - CFI macros are used to generate dwarf2 unwind information for better
  * backtraces. They don't change any code.
  * - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
  * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
  * - idtentry - Define exception entry points.
  */
@@ -121,29 +119,13 @@ ENDPROC(native_usergs_sysret64)
 #endif
 
 /*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
+ * struct pt_regs is not fully populated on the fast path
+ * (rcx, r11, cs and ss are not filled in).
+ * This macro populates segment registers in iret frame.
  */
-
-	/* %rsp:at FRAMEEND */
-	.macro FIXUP_TOP_OF_STACK tmp offset=0
-	movq PER_CPU_VAR(old_rsp),\tmp
-	movq \tmp,RSP+\offset(%rsp)
-	movq $__USER_DS,SS+\offset(%rsp)
-	movq $__USER_CS,CS+\offset(%rsp)
-	movq $-1,RCX+\offset(%rsp)
-	movq R11+\offset(%rsp),\tmp  /* get eflags */
-	movq \tmp,EFLAGS+\offset(%rsp)
-	.endm
-
-	.macro RESTORE_TOP_OF_STACK tmp offset=0
-	movq RSP+\offset(%rsp),\tmp
-	movq \tmp,PER_CPU_VAR(old_rsp)
-	movq EFLAGS+\offset(%rsp),\tmp
-	movq \tmp,R11+\offset(%rsp)
+	.macro STORE_IRET_FRAME_CS_SS offset=0
+	movq	$__USER_CS,CS+\offset(%rsp)
+	movq	$__USER_DS,SS+\offset(%rsp)
 	.endm
 
 /*
@@ -226,9 +208,6 @@ ENDPROC(native_usergs_sysret64)
  * Interrupts are off on entry.
  * Only called from user space.
  *
- * XXX	if we had a free scratch register we could save the RSP into the stack frame
- *      and report it properly in ps. Unfortunately we haven't.
- *
  * When user can change the frames always force IRET. That is because
  * it deals with uncanonical addresses better. SYSRET has trouble
  * with them due to bugs in both AMD and Intel CPUs.
@@ -255,11 +234,18 @@ GLOBAL(system_call_after_swapgs)
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	ALLOC_PTREGS_ON_STACK 6*8	/* space for orig_ax and iret frame */
-	SAVE_C_REGS
-	movq  %rax,ORIG_RAX(%rsp)
-	movq  %rcx,RIP(%rsp)
+	ALLOC_PTREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
+	SAVE_C_REGS_EXCEPT_RCX_R11
+	/* Fill orig_ax and, partially, iret frame (rip,flags,rsp) */
+	movq	%rax,ORIG_RAX(%rsp)
+	movq	%rcx,RIP(%rsp)
 	CFI_REL_OFFSET rip,RIP
+	movq	PER_CPU_VAR(old_rsp),%rcx
+	/*movq	$__USER_CS,CS(%rsp) - moved out of fast path */
+	movq	%r11,EFLAGS(%rsp)
+	movq	%rcx,RSP(%rsp)
+	/*movq	$__USER_DS,SS(%rsp) - moved out of fast path */
+
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz tracesys
 system_call_fastpath:
@@ -276,7 +262,7 @@ from_badsys:
 	movq %rax,RAX(%rsp)
 /*
  * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
+ * Has incompletely filled pt_regs and iret frame's ss and cs.
  */
 ret_from_sys_call:
 	movl $_TIF_ALLWORK_MASK,%edi
@@ -293,11 +279,12 @@ sysret_check:
 	 * sysretq will re-enable interrupts:
 	 */
 	TRACE_IRQS_ON
-	RESTORE_C_REGS_EXCEPT_RCX
-	movq RIP(%rsp),%rcx
+	RESTORE_C_REGS_EXCEPT_RCX_R11
+	movq	RIP(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
+	movq	EFLAGS(%rsp),%r11
 	/*CFI_REGISTER	rflags,r11*/
-	movq	PER_CPU_VAR(old_rsp), %rsp
+	movq	RSP(%rsp),%rsp
 	/*
 	 * 64bit SYSRET restores rip from rcx,
 	 * rflags from r11 (but RF and VM bits are forced to 0),
@@ -309,6 +296,11 @@ sysret_check:
 	/* Handle reschedules */
 	/* edx:	work, edi: workmask */
 sysret_careful:
+	movq	RIP(%rsp),%rcx
+	movq	EFLAGS(%rsp),%r11
+	STORE_IRET_FRAME_CS_SS /* fill the rest of iret frame */
+	movq	%rcx,RCX(%rsp) /* fill the rest of pt_regs */
+	movq	%r11,R11(%rsp) /* ditto */
 	bt $TIF_NEED_RESCHED,%edx
 	jnc sysret_signal
 	TRACE_IRQS_ON
@@ -331,7 +323,6 @@ sysret_signal:
 	 * These all wind up with the iret return path anyway,
 	 * so just join that path right now.
 	 */
-	FIXUP_TOP_OF_STACK %r11
 	jmp int_check_syscall_exit_work
 
 badsys:
@@ -370,15 +361,18 @@ sysret_audit:
 	jmp sysret_check
 #endif	/* CONFIG_AUDITSYSCALL */
 
-	/* Do syscall tracing */
+	/* Do syscall entry tracing */
 tracesys:
+	movq	RIP(%rsp),%rcx
+	STORE_IRET_FRAME_CS_SS /* fill the rest of iret frame */
+	movq	%r11,R11(%rsp) /* fill the rest of pt_regs */
+	movq	%rcx,RCX(%rsp) /* ditto */
 #ifdef CONFIG_AUDITSYSCALL
 	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jz auditsys
 #endif
 	SAVE_EXTRA_REGS
 	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
-	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp,%rdi
 	call syscall_trace_enter
 	/*
@@ -402,7 +396,7 @@ tracesys:
 
 /*
  * Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
+ * Has correct top of stack, but possibly only partially filled pt_regs.
  */
 GLOBAL(int_ret_from_sys_call)
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -469,43 +463,15 @@ ENTRY(stub_\func)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8		/* offset 8: return address */
 	SAVE_EXTRA_REGS 8
-	FIXUP_TOP_OF_STACK %r11, 8
-	call sys_\func
-	RESTORE_TOP_OF_STACK %r11, 8
-	ret
+	STORE_IRET_FRAME_CS_SS 8	/* not sure we need this. paranoia */
+	jmp sys_\func
 	CFI_ENDPROC
 END(stub_\func)
 	.endm
 
-	.macro FIXED_FRAME label,func
-ENTRY(\label)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8		/* offset 8: return address */
-	FIXUP_TOP_OF_STACK %r11, 8
-	call \func
-	RESTORE_TOP_OF_STACK %r11, 8
-	ret
-	CFI_ENDPROC
-END(\label)
-	.endm
-
 	FORK_LIKE  clone
 	FORK_LIKE  fork
 	FORK_LIKE  vfork
-	FIXED_FRAME stub_iopl, sys_iopl
-
-ENTRY(stub_execve)
-	CFI_STARTPROC
-	addq $8, %rsp
-	DEFAULT_FRAME 0
-	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
-	call sys_execve
-	movq %rax,RAX(%rsp)
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
-END(stub_execve)
 
 /*
  * sigreturn is special because it needs to restore all registers on return.
@@ -516,25 +482,35 @@ ENTRY(stub_rt_sigreturn)
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
+	STORE_IRET_FRAME_CS_SS
 	call sys_rt_sigreturn
-	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+stub_return:
+	movq %rax,RAX(%rsp)
 	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_rt_sigreturn)
 
+ENTRY(stub_execve)
+	CFI_STARTPROC
+	addq $8, %rsp
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
+	STORE_IRET_FRAME_CS_SS
+	call sys_execve
+	jmp stub_return
+	CFI_ENDPROC
+END(stub_execve)
+
 #ifdef CONFIG_X86_X32_ABI
 ENTRY(stub_x32_rt_sigreturn)
 	CFI_STARTPROC
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
+	STORE_IRET_FRAME_CS_SS
 	call sys32_x32_rt_sigreturn
-	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
+	jmp stub_return
 	CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
 
@@ -543,15 +519,11 @@ ENTRY(stub_x32_execve)
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
+	STORE_IRET_FRAME_CS_SS
 	call compat_sys_execve
-	RESTORE_TOP_OF_STACK %r11
-	movq %rax,RAX(%rsp)
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
+	jmp stub_return
 	CFI_ENDPROC
 END(stub_x32_execve)
-
 #endif
 
 /*
@@ -576,10 +548,13 @@ ENTRY(ret_from_fork)
 	testl $3, CS(%rsp)			# from kernel_thread?
 	jz   1f
 
+	/*
+	 * TODO: can we instead check CS(%rsp) == __USER32_CS below?
+	 * We won't need GET_THREAD_INFO(%rcx) then...
+	 */
 	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
 	jnz  int_ret_from_sys_call
 
-	RESTORE_TOP_OF_STACK %rdi
 	jmp ret_from_sys_call			# go to the SYSRET fastpath
 
 1:
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 2105487e..5a7ba74 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 	childregs = task_pt_regs(p);
 	p->thread.sp = (unsigned long) childregs;
-	p->thread.usersp = me->thread.usersp;
 	set_tsk_thread_flag(p, TIF_FORK);
 	p->thread.fpu_counter = 0;
 	p->thread.io_bitmap_ptr = NULL;
@@ -238,10 +237,8 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	loadsegment(es, _ds);
 	loadsegment(ds, _ds);
 	load_gs_index(0);
-	current->thread.usersp	= new_sp;
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
-	this_cpu_write(old_rsp, new_sp);
 	regs->cs		= _cs;
 	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
@@ -359,8 +356,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/*
 	 * Switch the PDA and FPU contexts.
 	 */
-	prev->usersp = this_cpu_read(old_rsp);
-	this_cpu_write(old_rsp, next->usersp);
 	this_cpu_write(current_task, next_p);
 
 	/*
@@ -559,6 +554,5 @@ long sys_arch_prctl(int code, unsigned long addr)
 
 unsigned long KSTK_ESP(struct task_struct *task)
 {
-	return (test_tsk_thread_flag(task, TIF_IA32)) ?
-			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
+	return task_pt_regs(task)->sp;
 }
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index ec255a1..8b745ae 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -178,7 +178,7 @@
 169	common	reboot			sys_reboot
 170	common	sethostname		sys_sethostname
 171	common	setdomainname		sys_setdomainname
-172	common	iopl			stub_iopl
+172	common	iopl			sys_iopl
 173	common	ioperm			sys_ioperm
 174	64	create_module
 175	common	init_module		sys_init_module
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index f2f0723..5c81ed2 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -16,7 +16,7 @@
  */
 
 /* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
 #define sys_ioperm sys_ni_syscall
 
 /*
-- 
1.8.1.4
