From 1c8e7e2ef295d6325796fcf3ce6f8825ffa7f58b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sat, 28 Feb 2015 17:38:48 +1100
Subject: [PATCH 2/2] powerpc: Use generic_page_fault()

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/fault.h | 165 ++++++++++++++++++++
 arch/powerpc/mm/fault.c          | 328 ++++++---------------------------------
 2 files changed, 215 insertions(+), 278 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fault.h

diff --git a/arch/powerpc/include/asm/fault.h b/arch/powerpc/include/asm/fault.h
new file mode 100644
index 0000000..ebb46b9
--- /dev/null
+++ b/arch/powerpc/include/asm/fault.h
@@ -0,0 +1,165 @@
+#ifndef _ASM_POWERPC_FAULT_H
+#define _ASM_POWERPC_FAULT_H
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+static inline bool fault_is_user(struct pt_regs *regs, unsigned long err_code)
+{
+	return user_mode(regs);
+}
+
+static inline bool fault_is_write(struct pt_regs *regs, unsigned long err_code)
+{
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+	return !!(err_code & DSISR_ISSTORE);
+#else
+	return !!(err_code & ESR_DST);
+#endif /* CONFIG_4xx || CONFIG_BOOKE */
+}
+
+/* We need to pass a couple of flags through the generic page fault
+ * code via "error_code", which contains either the DSISR or the ESR
+ * content depending on the CPU family.
+ *
+ * We hijack bits that are not used in either of them.
+ */
+#define PF_CAN_GROW_STACK	0x00000001ul
+#define PF_EXEC			0x00000002ul
+
+/* Return type for do_page_fault */
+typedef int gpf_ret_t;
+
+#define FAULT_NO_ERR	0
+
+/* Check if the stack is allowed to grow during a user page fault */
+static inline bool stack_can_grow(struct pt_regs *regs, unsigned long err_code,
+				  unsigned long address,
+				  struct vm_area_struct *vma)
+{
+	/*
+	 * N.B. The POWER/Open ABI allows programs to access up to
+	 * 288 bytes below the stack pointer.
+	 * The kernel signal delivery code writes up to about 1.5kB
+	 * below the stack pointer (r1) before decrementing it.
+	 * The exec code can write slightly over 640kB to the stack
+	 * before setting the user r1. Thus we allow the stack to
+	 * expand to 1MB without further checks.
+	 */
+	if (address + 0x100000 < vma->vm_end) {
+		/* get user regs even if this fault is in kernel mode */
+		struct pt_regs *uregs = current->thread.regs;
+		if (uregs == NULL)
+			return false;
+
+		/*
+		 * A user-mode access to an address a long way below
+		 * the stack pointer is only valid if the instruction
+		 * is one which would update the stack pointer to the
+		 * address accessed if the instruction completed,
+		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+		 * (or the byte, halfword, float or double forms).
+		 *
+		 * If we don't check this then any write to the area
+		 * between the last mapped region and the stack will
+		 * expand the stack rather than segfaulting.
+		 */
+		if (address + 2048 < uregs->gpr[1] &&
+		    !(err_code & PF_CAN_GROW_STACK))
+			return false;
+	}
+	return true;
+}
+
+static inline bool access_error(struct pt_regs *regs, unsigned long err_code,
+				struct vm_area_struct *vma)
+{
+#if defined(CONFIG_6xx)
+	/* an error such as lwarx to I/O controller space,
+	   address matching DABR, eciwx, etc. */
+	if (err_code & 0x95700000)
+		return true;
+#endif /* CONFIG_6xx */
+#if defined(CONFIG_8xx)
+	/* The MPC8xx seems to always set 0x80000000, which is
+	 * "undefined". Of those that can be set, this is the only
+	 * one which seems bad.
+	 */
+	if (err_code & 0x10000000)
+		/* Guarded storage error. */
+		return true;
+#endif /* CONFIG_8xx */
+
+	if (err_code & PF_EXEC) {
+		/*
+		 * Allow execution from readable areas if the MMU does not
+		 * provide separate controls over reading and executing.
+		 *
+		 * Note: That code used to not be enabled for 4xx/BookE.
+		 * It is now as I/D cache coherency for these is done at
+		 * set_pte_at() time and I see no reason why the test
+		 * below wouldn't be valid on those processors. This -may-
+		 * break programs compiled with a really old ABI though.
+		 */
+		if (!(vma->vm_flags & VM_EXEC) &&
+		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
+		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
+			return true;
+#ifdef CONFIG_PPC_STD_MMU
+		/*
+		 * protfault should only happen due to us
+		 * mapping a region readonly temporarily. PROT_NONE
+		 * is also covered by the VMA check above.
+		 */
+		WARN_ON_ONCE(err_code & DSISR_PROTFAULT);
+#endif /* CONFIG_PPC_STD_MMU */
+	/* a write */
+	} else if (fault_is_write(regs, err_code)) {
+		if (!(vma->vm_flags & VM_WRITE))
+			return true;
+	/* a read */
+	} else {
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			return true;
+		WARN_ON_ONCE(err_code & DSISR_PROTFAULT);
+	}
+	return false;
+}
+
+/* Error handlers */
+
+gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
+			  unsigned long address, int si_code);
+
+static inline gpf_ret_t handle_kernel_fault(struct pt_regs *regs,
+					    unsigned long error_code,
+					    unsigned long address, int sig,
+					    int si_code)
+{
+	return sig;
+}
+
+gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
+		    unsigned long address, unsigned int fault);
+
+static inline void arch_account_major_fault(void)
+{
+#ifdef CONFIG_PPC_SMLPAR
+	if (firmware_has_feature(FW_FEATURE_CMO)) {
+		u32 page_ins;
+
+		preempt_disable();
+		page_ins = be32_to_cpu(get_lppaca()->page_ins);
+		page_ins += 1 << PAGE_FACTOR;
+		get_lppaca()->page_ins = cpu_to_be32(page_ins);
+		preempt_enable();
+	}
+#endif /* CONFIG_PPC_SMLPAR */
+}
+
+#endif /* _ASM_POWERPC_FAULT_H */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b396868..c51c156 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include <asm/fault.h>
 #include
 #include

@@ -72,15 +73,15 @@ static inline int notify_page_fault(struct pt_regs *regs)
 /*
  * Check whether the instruction at regs->nip is a store using
  * an update addressing form which will update r1.
  */
-static int store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(struct pt_regs *regs)
 {
 	unsigned int inst;

 	if (get_user(inst, (unsigned int __user *)regs->nip))
-		return 0;
+		return false;
 	/* check for 1 in the rA field */
 	if (((inst >> 16) & 0x1f) != 1)
-		return 0;
+		return false;
 	/* check major opcode */
 	switch (inst >> 26) {
 	case 37:	/* stwu */
@@ -88,7 +89,7 @@ static int store_updates_sp(struct pt_regs *regs)
 	case 45:	/* sthu */
 	case 53:	/* stfsu */
 	case 55:	/* stfdu */
-		return 1;
+		return true;
 	case 62:	/* std or stdu */
 		return (inst & 3) == 1;
 	case 31:
@@ -100,10 +101,10 @@ static int store_updates_sp(struct pt_regs *regs)
 		case 439:	/* sthux */
 		case 695:	/* stfsux */
 		case 759:	/* stfdux */
-			return 1;
+			return true;
 		}
 	}
-	return 0;
+	return false;
 }

 /*
  * do_page_fault error handling helpers
  */
@@ -113,16 +114,14 @@ static int store_updates_sp(struct pt_regs *regs)
 #define MM_FAULT_CONTINUE	-1
 #define MM_FAULT_ERR(sig)	(sig)

-static int do_sigbus(struct pt_regs *regs, unsigned long address,
-		     unsigned int fault)
+gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
+		    unsigned long address, unsigned int fault)
 {
 	siginfo_t info;
 	unsigned int lsb = 0;

-	up_read(&current->mm->mmap_sem);
-
 	if (!user_mode(regs))
-		return MM_FAULT_ERR(SIGBUS);
+		return SIGBUS;

 	current->thread.trap_nr = BUS_ADRERR;
 	info.si_signo = SIGBUS;
@@ -143,53 +142,25 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 #endif
 	info.si_addr_lsb = lsb;
 	force_sig_info(SIGBUS, &info, current);
-	return MM_FAULT_RETURN;
+	return 0;
 }

-static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
+			  unsigned long address, int si_code)
 {
-	/*
-	 * Pagefault was interrupted by SIGKILL. We have no reason to
-	 * continue the pagefault.
-	 */
-	if (fatal_signal_pending(current)) {
-		/*
-		 * If we have retry set, the mmap semaphore will have
-		 * alrady been released in __lock_page_or_retry(). Else
-		 * we release it now.
-		 */
-		if (!(fault & VM_FAULT_RETRY))
-			up_read(&current->mm->mmap_sem);
-		/* Coming from kernel, we need to deal with uaccess fixups */
-		if (user_mode(regs))
-			return MM_FAULT_RETURN;
-		return MM_FAULT_ERR(SIGKILL);
-	}
-
-	/* No fault: be happy */
-	if (!(fault & VM_FAULT_ERROR))
-		return MM_FAULT_CONTINUE;
-
-	/* Out of memory */
-	if (fault & VM_FAULT_OOM) {
-		up_read(&current->mm->mmap_sem);
-
-		/*
-		 * We ran out of memory, or some other thing happened to us that
-		 * made us unable to handle the page fault gracefully.
-		 */
-		if (!user_mode(regs))
-			return MM_FAULT_ERR(SIGKILL);
-		pagefault_out_of_memory();
-		return MM_FAULT_RETURN;
+	/* User mode accesses cause a SIGSEGV */
+	if (user_mode(regs)) {
+		_exception(SIGSEGV, regs, si_code, address);
+		return 0;
 	}

-	if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
-		return do_sigbus(regs, addr, fault);
+	if ((error_code & PF_EXEC) && (error_code & DSISR_PROTFAULT))
+		printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
+				   " page (%lx) - exploit attempt? (uid: %d)\n",
+				   address, from_kuid(&init_user_ns, current_uid()));

-	/* We don't understand the fault code, this is fatal */
-	BUG();
-	return MM_FAULT_CONTINUE;
+	return SIGSEGV;
 }

 /*
@@ -205,19 +176,11 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
  * The return value is 0 if the fault was handled, or the signal
  * number if this is a kernel fault that can't be handled here.
  */
-int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
-			    unsigned long error_code)
+static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+			   unsigned long error_code)
 {
-	enum ctx_state prev_state = exception_enter();
-	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
-	int code = SEGV_MAPERR;
-	int is_write = 0;
 	int trap = TRAP(regs);
-	int is_exec = trap == 0x400;
-	int fault;
-	int rc = 0, store_update_sp = 0;

 #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 	/*
@@ -228,10 +191,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	 */
 	if (trap == 0x400)
 		error_code &= 0x48200000;
-	else
-		is_write = error_code & DSISR_ISSTORE;
-#else
-	is_write = error_code & ESR_DST;
 #endif /* CONFIG_4xx || CONFIG_BOOKE */

 #ifdef CONFIG_PPC_ICSWX
@@ -241,30 +200,28 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * look at it
 	 */
 	if (error_code & ICSWX_DSI_UCT) {
-		rc = acop_handle_fault(regs, address, error_code);
+		gpf_ret_t rc = acop_handle_fault(regs, address, error_code);
 		if (rc)
-			goto bail;
+			return rc;
 	}
 #endif /* CONFIG_PPC_ICSWX */

 	if (notify_page_fault(regs))
-		goto bail;
+		return 0;

 	if (unlikely(debugger_fault_handler(regs)))
-		goto bail;
+		return 0;

 	/* On a kernel SLB miss we can only check for a valid exception entry */
-	if (!user_mode(regs) && (address >= TASK_SIZE)) {
-		rc = SIGSEGV;
-		goto bail;
-	}
+	if (!user_mode(regs) && (address >= TASK_SIZE))
+		return SIGSEGV;

 #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
 			     defined(CONFIG_PPC_BOOK3S_64))
 	if (error_code & DSISR_DABRMATCH) {
 		/* breakpoint match */
 		do_break(regs, address, error_code);
-		goto bail;
+		return 0;
 	}
 #endif

@@ -273,10 +230,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		local_irq_enable();

 	if (in_atomic() || mm == NULL) {
-		if (!user_mode(regs)) {
-			rc = SIGSEGV;
-			goto bail;
-		}
+		if (!user_mode(regs))
+			return SIGSEGV;
+
 		/* in_atomic() in user mode is really bad,
 		   as is current->mm == NULL. */
 		printk(KERN_EMERG "Page fault in user mode with "
@@ -286,220 +242,36 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}

-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+	error_code &= ~(PF_CAN_GROW_STACK | PF_EXEC);

 	/*
 	 * We want to do this outside mmap_sem, because reading code around nip
 	 * can result in fault, which will cause a deadlock when called with
 	 * mmap_sem held
 	 */
-	if (user_mode(regs))
-		store_update_sp = store_updates_sp(regs);
-
-	if (user_mode(regs))
-		flags |= FAULT_FLAG_USER;
-
-	/* When running in the kernel we expect faults to occur only to
-	 * addresses in user space. All other faults represent errors in the
-	 * kernel and should generate an OOPS. Unfortunately, in the case of an
-	 * erroneous fault occurring in a code path which already holds mmap_sem
-	 * we will deadlock attempting to validate the fault against the
-	 * address space. Luckily the kernel only validly references user
-	 * space from well defined areas of code, which are listed in the
-	 * exceptions table.
-	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a deadlock.
-	 * Attempt to lock the address space, if we cannot we then validate the
-	 * source. If this is invalid we can skip the address space check,
-	 * thus avoiding the deadlock.
-	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if (!user_mode(regs) && !search_exception_tables(regs->nip))
-			goto bad_area_nosemaphore;
-
-retry:
-		down_read(&mm->mmap_sem);
-	} else {
-		/*
-		 * The above down_read_trylock() might have succeeded in
-		 * which case we'll have missed the might_sleep() from
-		 * down_read():
-		 */
-		might_sleep();
-	}
-
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= address)
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-
-	/*
-	 * N.B. The POWER/Open ABI allows programs to access up to
-	 * 288 bytes below the stack pointer.
-	 * The kernel signal delivery code writes up to about 1.5kB
-	 * below the stack pointer (r1) before decrementing it.
-	 * The exec code can write slightly over 640kB to the stack
-	 * before setting the user r1. Thus we allow the stack to
-	 * expand to 1MB without further checks.
-	 */
-	if (address + 0x100000 < vma->vm_end) {
-		/* get user regs even if this fault is in kernel mode */
-		struct pt_regs *uregs = current->thread.regs;
-		if (uregs == NULL)
-			goto bad_area;
-
-		/*
-		 * A user-mode access to an address a long way below
-		 * the stack pointer is only valid if the instruction
-		 * is one which would update the stack pointer to the
-		 * address accessed if the instruction completed,
-		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
-		 * (or the byte, halfword, float or double forms).
-		 *
-		 * If we don't check this then any write to the area
-		 * between the last mapped region and the stack will
-		 * expand the stack rather than segfaulting.
-		 */
-		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
-			goto bad_area;
-	}
-	if (expand_stack(vma, address))
-		goto bad_area;
-
-good_area:
-	code = SEGV_ACCERR;
-#if defined(CONFIG_6xx)
-	if (error_code & 0x95700000)
-		/* an error such as lwarx to I/O controller space,
-		   address matching DABR, eciwx, etc. */
-		goto bad_area;
-#endif /* CONFIG_6xx */
-#if defined(CONFIG_8xx)
-	/* The MPC8xx seems to always set 0x80000000, which is
-	 * "undefined". Of those that can be set, this is the only
-	 * one which seems bad.
-	 */
-	if (error_code & 0x10000000)
-		/* Guarded storage error. */
-		goto bad_area;
-#endif /* CONFIG_8xx */
-
-	if (is_exec) {
-		/*
-		 * Allow execution from readable areas if the MMU does not
-		 * provide separate controls over reading and executing.
-		 *
-		 * Note: That code used to not be enabled for 4xx/BookE.
-		 * It is now as I/D cache coherency for these is done at
-		 * set_pte_at() time and I see no reason why the test
-		 * below wouldn't be valid on those processors. This -may-
-		 * break programs compiled with a really old ABI though.
-		 */
-		if (!(vma->vm_flags & VM_EXEC) &&
-		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
-		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
-			goto bad_area;
-#ifdef CONFIG_PPC_STD_MMU
-		/*
-		 * protfault should only happen due to us
-		 * mapping a region readonly temporarily. PROT_NONE
-		 * is also covered by the VMA check above.
-		 */
-		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-#endif /* CONFIG_PPC_STD_MMU */
-	/* a write */
-	} else if (is_write) {
-		if (!(vma->vm_flags & VM_WRITE))
-			goto bad_area;
-		flags |= FAULT_FLAG_WRITE;
-	/* a read */
-	} else {
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-			goto bad_area;
-		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-	}
-
-	/*
-	 * If for any reason at all we couldn't handle the fault,
-	 * make sure we exit gracefully rather than endlessly redo
-	 * the fault.
-	 */
-	fault = handle_mm_fault(mm, vma, address, flags);
-	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
-		if (fault & VM_FAULT_SIGSEGV)
-			goto bad_area;
-		rc = mm_fault_error(regs, address, fault);
-		if (rc >= MM_FAULT_RETURN)
-			goto bail;
-		else
-			rc = 0;
-	}
-
-	/*
-	 * Major/minor page fault accounting is only done on the
-	 * initial attempt. If we go through a retry, it is extremely
-	 * likely that the page will be found in page cache at that point.
-	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			current->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
-#ifdef CONFIG_PPC_SMLPAR
-			if (firmware_has_feature(FW_FEATURE_CMO)) {
-				u32 page_ins;
-
-				preempt_disable();
-				page_ins = be32_to_cpu(get_lppaca()->page_ins);
-				page_ins += 1 << PAGE_FACTOR;
-				get_lppaca()->page_ins = cpu_to_be32(page_ins);
-				preempt_enable();
-			}
-#endif /* CONFIG_PPC_SMLPAR */
-		} else {
-			current->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~FAULT_FLAG_ALLOW_RETRY;
-			flags |= FAULT_FLAG_TRIED;
-			goto retry;
-		}
-	}
-
-	up_read(&mm->mmap_sem);
-	goto bail;
+	if (user_mode(regs) && store_updates_sp(regs))
+		error_code |= PF_CAN_GROW_STACK;

-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-	/* User mode accesses cause a SIGSEGV */
-	if (user_mode(regs)) {
-		_exception(SIGSEGV, regs, code, address);
-		goto bail;
-	}
+	/* Set flag if exec fault for use by access_error */
+	if (trap == 0x400)
+		error_code |= PF_EXEC;

-	if (is_exec && (error_code & DSISR_PROTFAULT))
-		printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
-				   " page (%lx) - exploit attempt? (uid: %d)\n",
-				   address, from_kuid(&init_user_ns, current_uid()));
+	/* Generic page fault */
+	return generic_page_fault(regs, current, error_code, address);
+}

-	rc = SIGSEGV;
+int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
+			    unsigned long error_code)
+{
+	enum ctx_state prev_state = exception_enter();
+	int rc;

-bail:
+	rc = __do_page_fault(regs, address, error_code);
 	exception_exit(prev_state);
 	return rc;
-
 }
+
 /*
  * bad_page_fault is called when we have a bad access from the kernel.
  * It is called from the DSI and ISI handlers in head.S and from some
-- 
2.1.0
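
Note for readers: generic_page_fault() itself is introduced by patch 1/2 of
this series, which is not shown here. To make the contract of the asm/fault.h
hooks concrete, below is a rough sketch of what the generic helper presumably
looks like, inferred from the powerpc code this patch deletes. It is an
illustration only, not the code from patch 1/2: the real helper may differ in
details such as retry, OOM and accounting behaviour, and instruction_pointer()
stands in for however the generic code actually obtains the faulting PC.

/*
 * Hypothetical sketch of generic_page_fault() -- NOT part of this patch.
 * It shows how the per-arch hooks (fault_is_user, fault_is_write,
 * stack_can_grow, access_error, handle_bad_area, handle_kernel_fault,
 * do_sigbus, arch_account_major_fault) would be driven around the usual
 * mmap_sem/retry state machine.
 */
gpf_ret_t generic_page_fault(struct pt_regs *regs, struct task_struct *tsk,
			     unsigned long error_code, unsigned long address)
{
	struct mm_struct *mm = tsk->mm;
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	bool is_user = fault_is_user(regs, error_code);
	int fault;

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	if (is_user)
		flags |= FAULT_FLAG_USER;
	if (fault_is_write(regs, error_code))
		flags |= FAULT_FLAG_WRITE;

	/* Same deadlock avoidance the deleted powerpc code performed:
	 * only trust a kernel-mode fault that has an exception fixup. */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!is_user &&
		    !search_exception_tables(instruction_pointer(regs)))
			return handle_bad_area(regs, error_code, address,
					       SEGV_MAPERR);
		down_read(&mm->mmap_sem);
	}
retry:
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start > address) {
		/* Stack growth policy is delegated to the arch hook */
		if (!(vma->vm_flags & VM_GROWSDOWN) ||
		    !stack_can_grow(regs, error_code, address, vma) ||
		    expand_stack(vma, address))
			goto bad_area;
	}
	/* Arch-specific permission check (PF_EXEC, VM_WRITE, ...) */
	if (access_error(regs, error_code, vma)) {
		up_read(&mm->mmap_sem);
		return handle_bad_area(regs, error_code, address, SEGV_ACCERR);
	}

	fault = handle_mm_fault(mm, vma, address, flags);
	if ((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY)) {
		/* mmap_sem was dropped in __lock_page_or_retry() */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		flags |= FAULT_FLAG_TRIED;
		if (!fatal_signal_pending(tsk)) {
			down_read(&mm->mmap_sem);
			goto retry;
		}
		return is_user ? FAULT_NO_ERR
			       : handle_kernel_fault(regs, error_code, address,
						     SIGKILL, SEGV_MAPERR);
	}
	up_read(&mm->mmap_sem);

	if (fault & VM_FAULT_OOM) {
		if (!is_user)
			return handle_kernel_fault(regs, error_code, address,
						   SIGKILL, SEGV_MAPERR);
		pagefault_out_of_memory();
		return FAULT_NO_ERR;
	}
	if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON |
		     VM_FAULT_HWPOISON_LARGE))
		return do_sigbus(regs, error_code, address, fault);
	if (fault & VM_FAULT_SIGSEGV)
		return handle_bad_area(regs, error_code, address, SEGV_MAPERR);

	if (fault & VM_FAULT_MAJOR) {
		tsk->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
		/* e.g. the CMO accounting kept in asm/fault.h above */
		arch_account_major_fault();
	} else {
		tsk->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
	}
	return FAULT_NO_ERR;

bad_area:
	up_read(&mm->mmap_sem);
	return handle_bad_area(regs, error_code, address, SEGV_MAPERR);
}

The point of the split is that the mmap_sem/retry state machine lives in one
shared place, while powerpc only decides policy behind small hooks: what
counts as a write, when the stack may grow, and how bad areas and SIGBUS are
reported. That is exactly what the new asm/fault.h expresses.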