From 1c8e7e2ef295d6325796fcf3ce6f8825ffa7f58b Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Sat, 28 Feb 2015 17:38:48 +1100
Subject: [PATCH 2/2] powerpc: Use generic_page_fault()

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/include/asm/fault.h | 165 ++++++++++++++++++++
 arch/powerpc/mm/fault.c          | 328 ++++++---------------------------------
 2 files changed, 215 insertions(+), 278 deletions(-)
 create mode 100644 arch/powerpc/include/asm/fault.h

diff --git a/arch/powerpc/include/asm/fault.h b/arch/powerpc/include/asm/fault.h
new file mode 100644
index 0000000..ebb46b9
--- /dev/null
+++ b/arch/powerpc/include/asm/fault.h
@@ -0,0 +1,165 @@
+#ifndef _ASM_POWERPC_FAULT_H
+#define _ASM_POWERPC_FAULT_H
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+static inline bool fault_is_user(struct pt_regs *regs, unsigned long err_code)
+{
+	return user_mode(regs);
+}
+
+static inline bool fault_is_write(struct pt_regs *regs, unsigned long err_code)
+{
+#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
+	return !!(err_code & DSISR_ISSTORE);
+#else
+	return !!(err_code & ESR_DST);
+#endif /* CONFIG_4xx || CONFIG_BOOKE */
+}
+
+/* We need to pass a couple of flags through the generic page fault
+ * code via "error_code", which contains either the DSISR or the ESR
+ * content depending on the CPU family.
+ *
+ * We hijack bits that are not used in either of them.
+ */
+#define PF_CAN_GROW_STACK	0x00000001ul
+#define PF_EXEC			0x00000002ul
+
+/* Return type for do_page_fault */
+typedef int gpf_ret_t;
+
+#define FAULT_NO_ERR	0
+
+/* Check if the stack is allowed to grow during a user page fault */
+static inline bool stack_can_grow(struct pt_regs *regs, unsigned long err_code,
+				  unsigned long address,
+				  struct vm_area_struct *vma)
+{
+	/*
+	 * N.B. The POWER/Open ABI allows programs to access up to
+	 * 288 bytes below the stack pointer.
+	 * The kernel signal delivery code writes up to about 1.5kB
+	 * below the stack pointer (r1) before decrementing it.
+	 * The exec code can write slightly over 640kB to the stack
+	 * before setting the user r1. Thus we allow the stack to
+	 * expand to 1MB without further checks.
+	 */
+	if (address + 0x100000 < vma->vm_end) {
+		/* get user regs even if this fault is in kernel mode */
+		struct pt_regs *uregs = current->thread.regs;
+		if (uregs == NULL)
+			return false;
+
+		/*
+		 * A user-mode access to an address a long way below
+		 * the stack pointer is only valid if the instruction
+		 * is one which would update the stack pointer to the
+		 * address accessed if the instruction completed,
+		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
+		 * (or the byte, halfword, float or double forms).
+		 *
+		 * If we don't check this then any write to the area
+		 * between the last mapped region and the stack will
+		 * expand the stack rather than segfaulting.
+		 */
+		if (address + 2048 < uregs->gpr[1] &&
+		    !(err_code & PF_CAN_GROW_STACK))
+			return false;
+	}
+	return true;
+}
+
+static inline bool access_error(struct pt_regs *regs, unsigned long err_code,
+				struct vm_area_struct *vma)
+{
+#if defined(CONFIG_6xx)
+	/* an error such as lwarx to I/O controller space,
+	   address matching DABR, eciwx, etc. */
+	if (err_code & 0x95700000)
+		return true;
+#endif /* CONFIG_6xx */
+#if defined(CONFIG_8xx)
+	/* The MPC8xx seems to always set 0x80000000, which is
+	 * "undefined". Of those that can be set, this is the only
+	 * one which seems bad.
+	 */
+	if (err_code & 0x10000000)
+		/* Guarded storage error. */
+		return true;
+#endif /* CONFIG_8xx */
+
+	if (err_code & PF_EXEC) {
+		/*
+		 * Allow execution from readable areas if the MMU does not
+		 * provide separate controls over reading and executing.
+		 *
+		 * Note: That code used to not be enabled for 4xx/BookE.
+		 * It is now as I/D cache coherency for these is done at
+		 * set_pte_at() time and I see no reason why the test
+		 * below wouldn't be valid on those processors. This -may-
+		 * break programs compiled with a really old ABI though.
+		 */
+		if (!(vma->vm_flags & VM_EXEC) &&
+		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
+		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
+			return true;
+#ifdef CONFIG_PPC_STD_MMU
+		/*
+		 * protfault should only happen due to us
+		 * mapping a region readonly temporarily. PROT_NONE
+		 * is also covered by the VMA check above.
+		 */
+		WARN_ON_ONCE(err_code & DSISR_PROTFAULT);
+#endif /* CONFIG_PPC_STD_MMU */
+	/* a write */
+	} else if (fault_is_write(regs, err_code)) {
+		if (!(vma->vm_flags & VM_WRITE))
+			return true;
+	/* a read */
+	} else {
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			return true;
+		WARN_ON_ONCE(err_code & DSISR_PROTFAULT);
+	}
+	return false;
+}
+
+/* Error handlers */
+
+gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
+			  unsigned long address, int si_code);
+
+static inline gpf_ret_t handle_kernel_fault(struct pt_regs *regs,
+					    unsigned long error_code,
+					    unsigned long address, int sig,
+					    int si_code)
+{
+	return sig;
+}
+
+gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
+		    unsigned long address, unsigned int fault);
+
+static inline void arch_account_major_fault(void)
+{
+#ifdef CONFIG_PPC_SMLPAR
+	if (firmware_has_feature(FW_FEATURE_CMO)) {
+		u32 page_ins;
+
+		preempt_disable();
+		page_ins = be32_to_cpu(get_lppaca()->page_ins);
+		page_ins += 1 << PAGE_FACTOR;
+		get_lppaca()->page_ins = cpu_to_be32(page_ins);
+		preempt_enable();
+	}
+#endif /* CONFIG_PPC_SMLPAR */
+}
+
+#endif /* _ASM_POWERPC_FAULT_H */
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index b396868..c51c156 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include <asm/fault.h>
 #include
 #include

@@ -72,15 +73,15 @@ static inline int notify_page_fault(struct pt_regs *regs)
 /*
  * Check whether the instruction at regs->nip is a store using
  * an update addressing form which will update r1.
  */
-static int store_updates_sp(struct pt_regs *regs)
+static bool store_updates_sp(struct pt_regs *regs)
 {
 	unsigned int inst;

 	if (get_user(inst, (unsigned int __user *)regs->nip))
-		return 0;
+		return false;
 	/* check for 1 in the rA field */
 	if (((inst >> 16) & 0x1f) != 1)
-		return 0;
+		return false;
 	/* check major opcode */
 	switch (inst >> 26) {
 	case 37:	/* stwu */
@@ -88,7 +89,7 @@ static int store_updates_sp(struct pt_regs *regs)
 	case 45:	/* sthu */
 	case 53:	/* stfsu */
 	case 55:	/* stfdu */
-		return 1;
+		return true;
 	case 62:	/* std or stdu */
 		return (inst & 3) == 1;
 	case 31:
@@ -100,10 +101,10 @@ static int store_updates_sp(struct pt_regs *regs)
 		case 439:	/* sthux */
 		case 695:	/* stfsux */
 		case 759:	/* stfdux */
-			return 1;
+			return true;
 		}
 	}
-	return 0;
+	return false;
 }

 /*
  * do_page_fault error handling helpers
  */
@@ -113,16 +114,14 @@ static int store_updates_sp(struct pt_regs *regs)
 #define MM_FAULT_CONTINUE	-1
 #define MM_FAULT_ERR(sig)	(sig)

-static int do_sigbus(struct pt_regs *regs, unsigned long address,
-		     unsigned int fault)
+gpf_ret_t do_sigbus(struct pt_regs *regs, unsigned long error_code,
+		    unsigned long address, unsigned int fault)
 {
 	siginfo_t info;
 	unsigned int lsb = 0;

-	up_read(&current->mm->mmap_sem);
-
 	if (!user_mode(regs))
-		return MM_FAULT_ERR(SIGBUS);
+		return SIGBUS;

 	current->thread.trap_nr = BUS_ADRERR;
 	info.si_signo = SIGBUS;
@@ -143,53 +142,25 @@ static int do_sigbus(struct pt_regs *regs, unsigned long address,
 #endif
 	info.si_addr_lsb = lsb;
 	force_sig_info(SIGBUS, &info, current);
-	return MM_FAULT_RETURN;
+	return 0;
 }

-static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
+gpf_ret_t handle_bad_area(struct pt_regs *regs, unsigned long error_code,
+			  unsigned long address, int si_code)
 {
-	/*
-	 * Pagefault was interrupted by SIGKILL. We have no reason to
-	 * continue the pagefault.
-	 */
-	if (fatal_signal_pending(current)) {
-		/*
-		 * If we have retry set, the mmap semaphore will have
-		 * alrady been released in __lock_page_or_retry(). Else
-		 * we release it now.
-		 */
-		if (!(fault & VM_FAULT_RETRY))
-			up_read(&current->mm->mmap_sem);
-		/* Coming from kernel, we need to deal with uaccess fixups */
-		if (user_mode(regs))
-			return MM_FAULT_RETURN;
-		return MM_FAULT_ERR(SIGKILL);
-	}
-
-	/* No fault: be happy */
-	if (!(fault & VM_FAULT_ERROR))
-		return MM_FAULT_CONTINUE;
-
-	/* Out of memory */
-	if (fault & VM_FAULT_OOM) {
-		up_read(&current->mm->mmap_sem);
-
-		/*
-		 * We ran out of memory, or some other thing happened to us that
-		 * made us unable to handle the page fault gracefully.
-		 */
-		if (!user_mode(regs))
-			return MM_FAULT_ERR(SIGKILL);
-		pagefault_out_of_memory();
-		return MM_FAULT_RETURN;
+	/* User mode accesses cause a SIGSEGV */
+	if (user_mode(regs)) {
+		_exception(SIGSEGV, regs, si_code, address);
+		return 0;
 	}

-	if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE))
-		return do_sigbus(regs, addr, fault);
+	if ((error_code & PF_EXEC) && (error_code & DSISR_PROTFAULT))
+		printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
+				   " page (%lx) - exploit attempt? (uid: %d)\n",
+				   address, from_kuid(&init_user_ns, current_uid()));

-	/* We don't understand the fault code, this is fatal */
-	BUG();
-	return MM_FAULT_CONTINUE;
+	return SIGSEGV;
 }

 /*
@@ -205,19 +176,11 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
  * The return value is 0 if the fault was handled, or the signal
  * number if this is a kernel fault that can't be handled here.
  */
-int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
-			    unsigned long error_code)
+static int __do_page_fault(struct pt_regs *regs, unsigned long address,
+			   unsigned long error_code)
 {
-	enum ctx_state prev_state = exception_enter();
-	struct vm_area_struct * vma;
 	struct mm_struct *mm = current->mm;
-	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
-	int code = SEGV_MAPERR;
-	int is_write = 0;
 	int trap = TRAP(regs);
-	int is_exec = trap == 0x400;
-	int fault;
-	int rc = 0, store_update_sp = 0;

 #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
 	/*
@@ -228,10 +191,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	 */
 	if (trap == 0x400)
 		error_code &= 0x48200000;
-	else
-		is_write = error_code & DSISR_ISSTORE;
-#else
-	is_write = error_code & ESR_DST;
 #endif /* CONFIG_4xx || CONFIG_BOOKE */

 #ifdef CONFIG_PPC_ICSWX
@@ -241,30 +200,28 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	 * look at it
 	 */
 	if (error_code & ICSWX_DSI_UCT) {
-		rc = acop_handle_fault(regs, address, error_code);
+		gpf_ret_t rc = acop_handle_fault(regs, address, error_code);
 		if (rc)
-			goto bail;
+			return rc;
 	}
 #endif /* CONFIG_PPC_ICSWX */

 	if (notify_page_fault(regs))
-		goto bail;
+		return 0;

 	if (unlikely(debugger_fault_handler(regs)))
-		goto bail;
+		return 0;

 	/* On a kernel SLB miss we can only check for a valid exception entry */
-	if (!user_mode(regs) && (address >= TASK_SIZE)) {
-		rc = SIGSEGV;
-		goto bail;
-	}
+	if (!user_mode(regs) && (address >= TASK_SIZE))
+		return SIGSEGV;

 #if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE) || \
 			     defined(CONFIG_PPC_BOOK3S_64))
 	if (error_code & DSISR_DABRMATCH) {
 		/* breakpoint match */
 		do_break(regs, address, error_code);
-		goto bail;
+		return 0;
 	}
 #endif

@@ -273,10 +230,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		local_irq_enable();

 	if (in_atomic() || mm == NULL) {
-		if (!user_mode(regs)) {
-			rc = SIGSEGV;
-			goto bail;
-		}
+		if (!user_mode(regs))
+			return SIGSEGV;
+
 		/* in_atomic() in user mode is really bad,
 		   as is current->mm == NULL. */
 		printk(KERN_EMERG "Page fault in user mode with "
@@ -286,220 +242,36 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}

-	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
+	error_code &= ~(PF_CAN_GROW_STACK | PF_EXEC);

 	/*
 	 * We want to do this outside mmap_sem, because reading code around nip
 	 * can result in fault, which will cause a deadlock when called with
 	 * mmap_sem held
 	 */
-	if (user_mode(regs))
-		store_update_sp = store_updates_sp(regs);
-
-	if (user_mode(regs))
-		flags |= FAULT_FLAG_USER;
-
-	/* When running in the kernel we expect faults to occur only to
-	 * addresses in user space. All other faults represent errors in the
-	 * kernel and should generate an OOPS. Unfortunately, in the case of an
-	 * erroneous fault occurring in a code path which already holds mmap_sem
-	 * we will deadlock attempting to validate the fault against the
-	 * address space. Luckily the kernel only validly references user
-	 * space from well defined areas of code, which are listed in the
-	 * exceptions table.
-	 *
-	 * As the vast majority of faults will be valid we will only perform
-	 * the source reference check when there is a possibility of a deadlock.
-	 * Attempt to lock the address space, if we cannot we then validate the
-	 * source. If this is invalid we can skip the address space check,
-	 * thus avoiding the deadlock.
-	 */
-	if (!down_read_trylock(&mm->mmap_sem)) {
-		if (!user_mode(regs) && !search_exception_tables(regs->nip))
-			goto bad_area_nosemaphore;
-
-retry:
-		down_read(&mm->mmap_sem);
-	} else {
-		/*
-		 * The above down_read_trylock() might have succeeded in
-		 * which case we'll have missed the might_sleep() from
-		 * down_read():
-		 */
-		might_sleep();
-	}
-
-	vma = find_vma(mm, address);
-	if (!vma)
-		goto bad_area;
-	if (vma->vm_start <= address)
-		goto good_area;
-	if (!(vma->vm_flags & VM_GROWSDOWN))
-		goto bad_area;
-
-	/*
-	 * N.B. The POWER/Open ABI allows programs to access up to
-	 * 288 bytes below the stack pointer.
-	 * The kernel signal delivery code writes up to about 1.5kB
-	 * below the stack pointer (r1) before decrementing it.
-	 * The exec code can write slightly over 640kB to the stack
-	 * before setting the user r1. Thus we allow the stack to
-	 * expand to 1MB without further checks.
-	 */
-	if (address + 0x100000 < vma->vm_end) {
-		/* get user regs even if this fault is in kernel mode */
-		struct pt_regs *uregs = current->thread.regs;
-		if (uregs == NULL)
-			goto bad_area;
-
-		/*
-		 * A user-mode access to an address a long way below
-		 * the stack pointer is only valid if the instruction
-		 * is one which would update the stack pointer to the
-		 * address accessed if the instruction completed,
-		 * i.e. either stwu rs,n(r1) or stwux rs,r1,rb
-		 * (or the byte, halfword, float or double forms).
-		 *
-		 * If we don't check this then any write to the area
-		 * between the last mapped region and the stack will
-		 * expand the stack rather than segfaulting.
-		 */
-		if (address + 2048 < uregs->gpr[1] && !store_update_sp)
-			goto bad_area;
-	}
-	if (expand_stack(vma, address))
-		goto bad_area;
-
-good_area:
-	code = SEGV_ACCERR;
-#if defined(CONFIG_6xx)
-	if (error_code & 0x95700000)
-		/* an error such as lwarx to I/O controller space,
-		   address matching DABR, eciwx, etc. */
-		goto bad_area;
-#endif /* CONFIG_6xx */
-#if defined(CONFIG_8xx)
-	/* The MPC8xx seems to always set 0x80000000, which is
-	 * "undefined". Of those that can be set, this is the only
-	 * one which seems bad.
-	 */
-	if (error_code & 0x10000000)
-		/* Guarded storage error. */
-		goto bad_area;
-#endif /* CONFIG_8xx */
-
-	if (is_exec) {
-		/*
-		 * Allow execution from readable areas if the MMU does not
-		 * provide separate controls over reading and executing.
-		 *
-		 * Note: That code used to not be enabled for 4xx/BookE.
-		 * It is now as I/D cache coherency for these is done at
-		 * set_pte_at() time and I see no reason why the test
-		 * below wouldn't be valid on those processors. This -may-
-		 * break programs compiled with a really old ABI though.
-		 */
-		if (!(vma->vm_flags & VM_EXEC) &&
-		    (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
-		     !(vma->vm_flags & (VM_READ | VM_WRITE))))
-			goto bad_area;
-#ifdef CONFIG_PPC_STD_MMU
-		/*
-		 * protfault should only happen due to us
-		 * mapping a region readonly temporarily. PROT_NONE
-		 * is also covered by the VMA check above.
-		 */
-		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-#endif /* CONFIG_PPC_STD_MMU */
-	/* a write */
-	} else if (is_write) {
-		if (!(vma->vm_flags & VM_WRITE))
-			goto bad_area;
-		flags |= FAULT_FLAG_WRITE;
-	/* a read */
-	} else {
-		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
-			goto bad_area;
-		WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
-	}
-
-	/*
-	 * If for any reason at all we couldn't handle the fault,
-	 * make sure we exit gracefully rather than endlessly redo
-	 * the fault.
-	 */
-	fault = handle_mm_fault(mm, vma, address, flags);
-	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
-		if (fault & VM_FAULT_SIGSEGV)
-			goto bad_area;
-		rc = mm_fault_error(regs, address, fault);
-		if (rc >= MM_FAULT_RETURN)
-			goto bail;
-		else
-			rc = 0;
-	}
-
-	/*
-	 * Major/minor page fault accounting is only done on the
-	 * initial attempt. If we go through a retry, it is extremely
-	 * likely that the page will be found in page cache at that point.
-	 */
-	if (flags & FAULT_FLAG_ALLOW_RETRY) {
-		if (fault & VM_FAULT_MAJOR) {
-			current->maj_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
-				      regs, address);
-#ifdef CONFIG_PPC_SMLPAR
-			if (firmware_has_feature(FW_FEATURE_CMO)) {
-				u32 page_ins;
-
-				preempt_disable();
-				page_ins = be32_to_cpu(get_lppaca()->page_ins);
-				page_ins += 1 << PAGE_FACTOR;
-				get_lppaca()->page_ins = cpu_to_be32(page_ins);
-				preempt_enable();
-			}
-#endif /* CONFIG_PPC_SMLPAR */
-		} else {
-			current->min_flt++;
-			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
-				      regs, address);
-		}
-		if (fault & VM_FAULT_RETRY) {
-			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
-			 * of starvation. */
-			flags &= ~FAULT_FLAG_ALLOW_RETRY;
-			flags |= FAULT_FLAG_TRIED;
-			goto retry;
-		}
-	}
-
-	up_read(&mm->mmap_sem);
-	goto bail;
+	if (user_mode(regs) && store_updates_sp(regs))
+		error_code |= PF_CAN_GROW_STACK;

-bad_area:
-	up_read(&mm->mmap_sem);
-
-bad_area_nosemaphore:
-	/* User mode accesses cause a SIGSEGV */
-	if (user_mode(regs)) {
-		_exception(SIGSEGV, regs, code, address);
-		goto bail;
-	}
+	/* Set flag if exec fault for use by access_error */
+	if (trap == 0x400)
+		error_code |= PF_EXEC;

-	if (is_exec && (error_code & DSISR_PROTFAULT))
-		printk_ratelimited(KERN_CRIT "kernel tried to execute NX-protected"
-				   " page (%lx) - exploit attempt? (uid: %d)\n",
-				   address, from_kuid(&init_user_ns, current_uid()));
+	/* Generic page fault */
+	return generic_page_fault(regs, current, error_code, address);
+}

-	rc = SIGSEGV;
+int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
+			    unsigned long error_code)
+{
+	enum ctx_state prev_state = exception_enter();
+	int rc;

-bail:
+	rc = __do_page_fault(regs, address, error_code);
 	exception_exit(prev_state);
 	return rc;
-
 }
+
 /*
  * bad_page_fault is called when we have a bad access from the kernel.
  * It is called from the DSI and ISI handlers in head.S and from some
-- 
2.1.0
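
Note for readers: generic_page_fault() itself is introduced by patch 1/2 of
this series, which is not shown here. To make the contract of the asm/fault.h
hooks concrete, below is a rough sketch of what the generic helper presumably
looks like, inferred from the powerpc code this patch deletes. It is an
illustration only, not the code from patch 1/2: the real helper may differ in
details such as retry, OOM and accounting behaviour, and instruction_pointer()
stands in for however the generic code actually obtains the faulting PC.

/*
 * Hypothetical sketch of generic_page_fault() -- NOT part of this patch.
 * It shows how the per-arch hooks (fault_is_user, fault_is_write,
 * stack_can_grow, access_error, handle_bad_area, handle_kernel_fault,
 * do_sigbus, arch_account_major_fault) would be driven around the usual
 * mmap_sem/retry state machine.
 */
gpf_ret_t generic_page_fault(struct pt_regs *regs, struct task_struct *tsk,
			     unsigned long error_code, unsigned long address)
{
	struct mm_struct *mm = tsk->mm;
	struct vm_area_struct *vma;
	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	bool is_user = fault_is_user(regs, error_code);
	int fault;

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

	if (is_user)
		flags |= FAULT_FLAG_USER;
	if (fault_is_write(regs, error_code))
		flags |= FAULT_FLAG_WRITE;

	/* Same deadlock avoidance the deleted powerpc code performed:
	 * only trust a kernel-mode fault that has an exception fixup. */
	if (!down_read_trylock(&mm->mmap_sem)) {
		if (!is_user &&
		    !search_exception_tables(instruction_pointer(regs)))
			return handle_bad_area(regs, error_code, address,
					       SEGV_MAPERR);
		down_read(&mm->mmap_sem);
	}
retry:
	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start > address) {
		/* Stack growth policy is delegated to the arch hook */
		if (!(vma->vm_flags & VM_GROWSDOWN) ||
		    !stack_can_grow(regs, error_code, address, vma) ||
		    expand_stack(vma, address))
			goto bad_area;
	}
	/* Arch-specific permission check (PF_EXEC, VM_WRITE, ...) */
	if (access_error(regs, error_code, vma)) {
		up_read(&mm->mmap_sem);
		return handle_bad_area(regs, error_code, address, SEGV_ACCERR);
	}

	fault = handle_mm_fault(mm, vma, address, flags);
	if ((fault & VM_FAULT_RETRY) && (flags & FAULT_FLAG_ALLOW_RETRY)) {
		/* mmap_sem was dropped in __lock_page_or_retry() */
		flags &= ~FAULT_FLAG_ALLOW_RETRY;
		flags |= FAULT_FLAG_TRIED;
		if (!fatal_signal_pending(tsk)) {
			down_read(&mm->mmap_sem);
			goto retry;
		}
		return is_user ? FAULT_NO_ERR
			       : handle_kernel_fault(regs, error_code, address,
						     SIGKILL, SEGV_MAPERR);
	}
	up_read(&mm->mmap_sem);

	if (fault & VM_FAULT_OOM) {
		if (!is_user)
			return handle_kernel_fault(regs, error_code, address,
						   SIGKILL, SEGV_MAPERR);
		pagefault_out_of_memory();
		return FAULT_NO_ERR;
	}
	if (fault & (VM_FAULT_SIGBUS | VM_FAULT_HWPOISON |
		     VM_FAULT_HWPOISON_LARGE))
		return do_sigbus(regs, error_code, address, fault);
	if (fault & VM_FAULT_SIGSEGV)
		return handle_bad_area(regs, error_code, address, SEGV_MAPERR);

	if (fault & VM_FAULT_MAJOR) {
		tsk->maj_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
		/* e.g. the CMO accounting kept in asm/fault.h above */
		arch_account_major_fault();
	} else {
		tsk->min_flt++;
		perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
	}
	return FAULT_NO_ERR;

bad_area:
	up_read(&mm->mmap_sem);
	return handle_bad_area(regs, error_code, address, SEGV_MAPERR);
}

The point of the split is that the mmap_sem/retry state machine lives in one
shared place, while powerpc only decides policy behind small hooks: what
counts as a write, when the stack may grow, and how bad areas and SIGBUS are
reported. That is exactly what the new asm/fault.h expresses.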