lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Thu, 2 May 2019 18:21:33 +0200
From:   Peter Zijlstra <peterz@...radead.org>
To:     Steven Rostedt <rostedt@...dmis.org>
Cc:     linux-kernel@...r.kernel.org,
        Linus Torvalds <torvalds@...ux-foundation.org>,
        Ingo Molnar <mingo@...nel.org>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Andy Lutomirski <luto@...nel.org>,
        Nicolai Stange <nstange@...e.de>,
        Thomas Gleixner <tglx@...utronix.de>,
        Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
        "H. Peter Anvin" <hpa@...or.com>,
        the arch/x86 maintainers <x86@...nel.org>,
        Josh Poimboeuf <jpoimboe@...hat.com>,
        Jiri Kosina <jikos@...nel.org>,
        Miroslav Benes <mbenes@...e.cz>,
        Petr Mladek <pmladek@...e.com>,
        Joe Lawrence <joe.lawrence@...hat.com>,
        Shuah Khan <shuah@...nel.org>,
        Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
        Tim Chen <tim.c.chen@...ux.intel.com>,
        Sebastian Andrzej Siewior <bigeasy@...utronix.de>,
        Mimi Zohar <zohar@...ux.ibm.com>,
        Juergen Gross <jgross@...e.com>,
        Nick Desaulniers <ndesaulniers@...gle.com>,
        Nayna Jain <nayna@...ux.ibm.com>,
        Masahiro Yamada <yamada.masahiro@...ionext.com>,
        Joerg Roedel <jroedel@...e.de>,
        "open list:KERNEL SELFTEST FRAMEWORK" 
        <linux-kselftest@...r.kernel.org>, stable@...r.kernel.org
Subject: Re: [RFC][PATCH 1/2] x86: Allow breakpoints to emulate call functions

On Wed, May 01, 2019 at 11:24:12PM -0400, Steven Rostedt wrote:
> On Wed, 01 May 2019 16:28:31 -0400
> Steven Rostedt <rostedt@...dmis.org> wrote:
> 
> > diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
> > index d309f30cf7af..50bbf4035baf 100644
> > --- a/arch/x86/entry/entry_32.S
> > +++ b/arch/x86/entry/entry_32.S
> > @@ -1478,6 +1478,17 @@ ENTRY(int3)
> >  	ASM_CLAC
> >  	pushl	$-1				# mark this as an int
> >  
> > +#ifdef CONFIG_VM86
> > +	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
> > +	jnz	.Lfrom_usermode_no_gap
> > +#endif
> > +	testl	$SEGMENT_RPL_MASK, PT_CS(%esp)
> > +	jnz	.Lfrom_usermode_no_gap
> > +	.rept 6
> > +	pushl	5*4(%esp)
> > +	.endr
> > +.Lfrom_usermode_no_gap:
> > +
> >  	SAVE_ALL switch_stacks=1
> >  	ENCODE_FRAME_POINTER
> >  	TRACE_IRQS_OFF
> 
> This failed to work on 32 bit at all (crashed and burned badly - triple
> fault!). 

Indeed so; find a working version below (albeit with a lot of debug
garbage still in).

It also includes the self-test code that Andy wanted -- it's what I used
to debug this mess.

Much thanks to Joerg Roedel for talking entry_32.S with me.

TL;DR, on x86_32 kernel->kernel IRET frames are only 3 entries and do
not include ESP/SS, so not only wasn't regs->sp setup, if you changed it
it wouldn't be effective and corrupt random stack state.

---
 arch/x86/entry/entry_32.S            |  87 +++++++++++++++++++++++---
 arch/x86/entry/entry_64.S            |  14 ++++-
 arch/x86/include/asm/text-patching.h |  20 ++++++
 arch/x86/kernel/alternative.c        | 116 +++++++++++++++++++++++++++++++++--
 arch/x86/kernel/traps.c              |   1 +
 5 files changed, 225 insertions(+), 13 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 7b23431be5cb..01c5bdbe5f39 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -203,7 +203,7 @@
 .Lend_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 clear_cs=1
 	cld
 	PUSH_GS
 	pushl	%fs
@@ -225,7 +225,7 @@
 
 	/* Switch to kernel stack if necessary */
 .if \switch_stacks > 0
-	SWITCH_TO_KERNEL_STACK
+	SWITCH_TO_KERNEL_STACK \clear_cs
 .endif
 
 .endm
@@ -377,8 +377,9 @@
 
 #define CS_FROM_ENTRY_STACK	(1 << 31)
 #define CS_FROM_USER_CR3	(1 << 30)
+#define CS_FROM_INT3		(1 << 29)
 
-.macro SWITCH_TO_KERNEL_STACK
+.macro SWITCH_TO_KERNEL_STACK clear_cs=1
 
 	ALTERNATIVE     "", "jmp .Lend_\@", X86_FEATURE_XENPV
 
@@ -391,12 +392,13 @@
 	 * that register for the time this macro runs
 	 */
 
+	.if \clear_cs
 	/*
-	 * The high bits of the CS dword (__csh) are used for
-	 * CS_FROM_ENTRY_STACK and CS_FROM_USER_CR3. Clear them in case
-	 * hardware didn't do this for us.
+	 * The high bits of the CS dword (__csh) are used for CS_FROM_*. Clear
+	 * them in case hardware didn't do this for us.
 	 */
 	andl	$(0x0000ffff), PT_CS(%esp)
+	.endif
 
 	/* Are we on the entry stack? Bail out if not! */
 	movl	PER_CPU_VAR(cpu_entry_area), %ecx
@@ -1019,6 +1021,29 @@ ENTRY(entry_INT80_32)
 	/* Restore user state */
 	RESTORE_REGS pop=4			# skip orig_eax/error_code
 .Lirq_return:
+	testl $CS_FROM_INT3, 4(%esp)
+	jz .Lno_iret_fixup
+
+	/*
+	 * Undo the magic from ENTRY(int3), in particular consider the case
+	 * where regs->sp has been modified.
+	 */
+
+	pushl	%eax
+	movl	%esp, %eax
+
+	movl	4*4(%eax), %esp		# restore (modified) regs->sp
+
+	/* rebuild IRET frame */
+	pushl	3*4(%eax)		# flags
+	pushl	2*4(%eax)		# cs
+	pushl	1*4(%eax)		# ip
+
+	andl	$0x0000ffff, 4(%esp)	# clear high CS bits
+
+	movl	(%eax), %eax		# restore eax
+
+.Lno_iret_fixup:
 	/*
 	 * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on IRET core serialization
 	 * when returning from IPI handler and when returning from
@@ -1477,9 +1502,57 @@ END(nmi)
 
 ENTRY(int3)
 	ASM_CLAC
+
+	/*
+	 * The high bits of the CS dword (__csh) are used for CS_FROM_*. Clear
+	 * them in case hardware didn't do this for us.
+	 */
+	andl	$0x0000ffff, 4(%esp)
+
+#ifdef CONFIG_VM86
+	testl	$X86_EFLAGS_VM, 8(%esp)
+	jnz	.Lfrom_usermode_no_gap
+#endif
+	testl	$SEGMENT_RPL_MASK, 4(%esp)
+	jnz	.Lfrom_usermode_no_gap
+
+	/*
+	 * Here from kernel mode; so the (exception) stack looks like:
+	 *
+	 * 12(esp) - <previous context>
+	 *  8(esp) - flags
+	 *  4(esp) - cs
+	 *  0(esp) - ip
+	 *
+	 * Lets build a 5 entry IRET frame after that, such that struct pt_regs
+	 * is complete and in particular regs->sp is correct. This gives us
+	 * the original 3 enties as gap:
+	 *
+	 * 32(esp) - <previous context>
+	 * 28(esp) - orig_flags / gap
+	 * 24(esp) - orig_cs	/ gap
+	 * 20(esp) - orig_ip	/ gap
+	 * 16(esp) - ss
+	 * 12(esp) - sp
+	 *  8(esp) - flags
+	 *  4(esp) - cs
+	 *  0(esp) - ip
+	 */
+	pushl	%ss	  # ss
+	pushl	%esp      # sp (points at ss)
+	pushl	4*4(%esp) # flags
+	pushl	4*4(%esp) # cs
+	pushl	4*4(%esp) # ip
+
+	add	$16, 12(%esp) # point sp back at the previous context
+
+	orl	$CS_FROM_INT3, 4(%esp) # mark magic IRET
+
+.Lfrom_usermode_no_gap:
+
 	pushl	$-1				# mark this as an int
 
-	SAVE_ALL switch_stacks=1
+	SAVE_ALL switch_stacks=1 clear_cs=0
 	ENCODE_FRAME_POINTER
 	TRACE_IRQS_OFF
 	xorl	%edx, %edx			# zero error code
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 20e45d9b4e15..268cd9affe04 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -878,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
  * @paranoid == 2 is special: the stub will never switch stacks.  This is for
  * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
  */
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0
+.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
 ENTRY(\sym)
 	UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
@@ -898,6 +898,16 @@ ENTRY(\sym)
 	jnz	.Lfrom_usermode_switch_stack_\@
 	.endif
 
+	.if \create_gap == 1
+	testb	$3, CS-ORIG_RAX(%rsp)
+	jnz	.Lfrom_usermode_no_gap_\@
+	.rept 6
+	pushq	5*8(%rsp)
+	.endr
+	UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
+	.endif
+
 	.if \paranoid
 	call	paranoid_entry
 	.else
@@ -1129,7 +1139,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \
 #endif /* CONFIG_HYPERV */
 
 idtentry debug			do_debug		has_error_code=0	paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
-idtentry int3			do_int3			has_error_code=0
+idtentry int3			do_int3			has_error_code=0	create_gap=1
 idtentry stack_segment		do_stack_segment	has_error_code=1
 
 #ifdef CONFIG_XEN_PV
diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h
index c90678fd391a..6aac6abf931e 100644
--- a/arch/x86/include/asm/text-patching.h
+++ b/arch/x86/include/asm/text-patching.h
@@ -42,4 +42,24 @@ extern int after_bootmem;
 extern __ro_after_init struct mm_struct *poking_mm;
 extern __ro_after_init unsigned long poking_addr;
 
+static inline void int3_emulate_push(struct pt_regs *regs, unsigned long val)
+{
+	regs->sp -= sizeof(unsigned long);
+	*(unsigned long *)regs->sp = val;
+}
+
+static inline void int3_emulate_jmp(struct pt_regs *regs, unsigned long ip)
+{
+	regs->ip = ip;
+}
+
+#define INT3_INSN_SIZE 1
+#define CALL_INSN_SIZE 5
+
+static inline void int3_emulate_call(struct pt_regs *regs, unsigned long func)
+{
+	int3_emulate_push(regs, regs->ip - INT3_INSN_SIZE + CALL_INSN_SIZE);
+	int3_emulate_jmp(regs, func);
+}
+
 #endif /* _ASM_X86_TEXT_PATCHING_H */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4db9c0d29bc1..1e11076c3a2b 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -613,11 +613,118 @@ extern struct paravirt_patch_site __start_parainstructions[],
 	__stop_parainstructions[];
 #endif	/* CONFIG_PARAVIRT */
 
+static __always_inline void print_stack(struct pt_regs *regs)
+{
+#if 1
+	unsigned long *end = (unsigned long *)current_stack_pointer;
+	unsigned long *frame = (unsigned long *)__builtin_frame_address(0);
+	unsigned long *stack = (unsigned long *)(current_stack_pointer & ~(THREAD_SIZE - 1));
+	int i, j;
+
+	stack += THREAD_SIZE / sizeof(unsigned long);
+
+	printk("stack dump from: %lx\n", stack);
+
+	for (i=0; ; i++) {
+		pr_info("stack[%03d]: ", 16*i);
+		for (j=0; j<16; j++) {
+			if (i==0 && j==0) {
+				pr_cont(" %08lx  ", 0UL);
+				stack--;
+				continue;
+			}
+			if (stack == end)
+				pr_cont(">%08lx< ", *(stack--));
+			else if (stack == frame)
+				pr_cont("*%08lx* ", *(stack--));
+			else if (stack == regs)
+				pr_cont("r%08lxr ", *(stack--));
+			else if (regs && stack == regs->sp)
+				pr_cont("s%08lxs ", *(stack--));
+			else
+				pr_cont(" %08lx  ", *(stack--));
+		}
+		pr_cont("\n");
+
+		if (stack < end)
+			break;
+	}
+#endif
+}
+
+static void __init int3_magic(unsigned int *ptr)
+{
+	printk("*************** %lx\n", (unsigned long)ptr);
+	print_stack(NULL);
+	*ptr = 1;
+}
+
+static __initdata unsigned long int3_ip;
+
+static int __init int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
+{
+	struct die_args *args = data;
+	struct pt_regs *regs = args->regs;
+
+	if (!regs || user_mode(regs))
+		return NOTIFY_DONE;
+
+	if (val != DIE_INT3)
+		return NOTIFY_DONE;
+
+	printk("XXXXXXXXXXXXXXXXXXXXXXXXXX %lx %lx\n", regs->ip, int3_ip);
+	if (regs->ip - INT3_INSN_SIZE != int3_ip)
+		return NOTIFY_DONE;
+
+	print_stack(regs);
+	int3_emulate_call(regs, (unsigned long)&int3_magic);
+	print_stack(regs);
+
+	return NOTIFY_STOP;
+}
+
+static void __init int3_selftest(void)
+{
+	static __initdata struct notifier_block int3_exception_nb = {
+		.notifier_call	= int3_exception_notify,
+		.priority	= INT_MAX-1, /* last */
+	};
+	unsigned int val = 0;
+
+	BUG_ON(register_die_notifier(&int3_exception_nb));
+
+	printk("+++++++++++++++++++ %lx %lx\n", (unsigned long)&val, (unsigned long)&int3_ip);
+
+	print_stack(NULL);
+
+	/*
+	 * Basically: int3_magic(&val); but really complicated :-)
+	 *
+	 * Stick the address of the INT3 instruction into int3_ip, then trigger
+	 * the INT3, padded with NOPs to match a CALL instruction length.
+	 */
+#ifdef CONFIG_X86_32
+	asm volatile ("call 1f; 1: pop (%%edx); add $5, (%%edx);"
+		      "int3; nop; nop; nop; nop" : : "d" (&int3_ip), "a" (&val) : "memory");
+#else /* CONFIG_X86_64 */
+	asm volatile ("call 1f; 1: pop (%%rdx); add $5, (%%rdx);"
+		      "int3; nop; nop; nop; nop" : : "d" (&int3_ip), "D" (&val) : "memory");
+#endif
+
+	BUG_ON(val != 1);
+
+	unregister_die_notifier(&int3_exception_nb);
+}
+
 void __init alternative_instructions(void)
 {
-	/* The patching is not fully atomic, so try to avoid local interruptions
-	   that might execute the to be patched code.
-	   Other CPUs are not running. */
+	int3_selftest();
+
+	/*
+	 * The patching is not fully atomic, so try to avoid local
+	 * interruptions that might execute the to be patched code.
+	 * Other CPUs are not running.
+	 */
 	stop_nmi();
 
 	/*
@@ -642,10 +749,11 @@ void __init alternative_instructions(void)
 					    _text, _etext);
 	}
 
-	if (!uniproc_patched || num_possible_cpus() == 1)
+	if (!uniproc_patched || num_possible_cpus() == 1) {
 		free_init_pages("SMP alternatives",
 				(unsigned long)__smp_locks,
 				(unsigned long)__smp_locks_end);
+	}
 #endif
 
 	apply_paravirt(__parainstructions, __parainstructions_end);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8b6d03e55d2f..e072cdd07284 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -572,6 +572,7 @@ NOKPROBE_SYMBOL(do_general_protection);
 
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
+	printk("int3 frame: %lx\n", __builtin_frame_address(0));
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/*
 	 * ftrace must be first, everything else may cause a recursive crash.

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ