[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20130726131947.GE17761@dhcp22.suse.cz>
Date: Fri, 26 Jul 2013 15:19:47 +0200
From: Michal Hocko <mhocko@...e.cz>
To: Johannes Weiner <hannes@...xchg.org>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
David Rientjes <rientjes@...gle.com>,
KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>,
azurIt <azurit@...ox.sk>, linux-mm@...ck.org,
cgroups@...r.kernel.org, x86@...nel.org,
linux-arch@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [patch 3/6] arch: mm: pass userspace fault flag to generic fault handler
On Thu 25-07-13 18:25:35, Johannes Weiner wrote:
> Unlike global OOM handling, memory cgroup code will invoke the OOM
> killer in any OOM situation because it has no way of telling faults
> occurring in kernel context - which could be handled more gracefully -
> from user-triggered faults.
>
> Pass a flag that identifies faults originating in user space from the
> architecture-specific fault handlers to generic code so that memcg OOM
> handling can be improved.
>
> Signed-off-by: Johannes Weiner <hannes@...xchg.org>
Looks good to me, but I guess the maintainers of the affected archs should be
CCed.
Reviewed-by: Michal Hocko <mhocko@...e.cz>
> ---
> arch/alpha/mm/fault.c | 7 ++++---
> arch/arc/mm/fault.c | 6 ++++--
> arch/arm/mm/fault.c | 9 ++++++---
> arch/arm64/mm/fault.c | 9 ++++++---
> arch/avr32/mm/fault.c | 2 ++
> arch/cris/mm/fault.c | 6 ++++--
> arch/frv/mm/fault.c | 10 ++++++----
> arch/hexagon/mm/vm_fault.c | 6 ++++--
> arch/ia64/mm/fault.c | 6 ++++--
> arch/m32r/mm/fault.c | 10 ++++++----
> arch/m68k/mm/fault.c | 2 ++
> arch/metag/mm/fault.c | 6 ++++--
> arch/microblaze/mm/fault.c | 7 +++++--
> arch/mips/mm/fault.c | 6 ++++--
> arch/mn10300/mm/fault.c | 2 ++
> arch/openrisc/mm/fault.c | 1 +
> arch/parisc/mm/fault.c | 7 +++++--
> arch/powerpc/mm/fault.c | 7 ++++---
> arch/s390/mm/fault.c | 2 ++
> arch/score/mm/fault.c | 7 ++++++-
> arch/sh/mm/fault.c | 9 ++++++---
> arch/sparc/mm/fault_32.c | 12 +++++++++---
> arch/sparc/mm/fault_64.c | 8 +++++---
> arch/tile/mm/fault.c | 7 +++++--
> arch/um/kernel/trap.c | 20 ++++++++++++--------
> arch/unicore32/mm/fault.c | 8 ++++++--
> arch/x86/mm/fault.c | 8 +++++---
> arch/xtensa/mm/fault.c | 2 ++
> include/linux/mm.h | 1 +
> 29 files changed, 132 insertions(+), 61 deletions(-)
>
> diff --git a/arch/alpha/mm/fault.c b/arch/alpha/mm/fault.c
> index 0c4132d..98838a0 100644
> --- a/arch/alpha/mm/fault.c
> +++ b/arch/alpha/mm/fault.c
> @@ -89,8 +89,7 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
> const struct exception_table_entry *fixup;
> int fault, si_code = SEGV_MAPERR;
> siginfo_t info;
> - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (cause > 0 ? FAULT_FLAG_WRITE : 0));
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> /* As of EV6, a load into $31/$f31 is a prefetch, and never faults
> (or is suppressed by the PALcode). Support that for older CPUs
> @@ -115,7 +114,8 @@ do_page_fault(unsigned long address, unsigned long mmcsr,
> if (address >= TASK_SIZE)
> goto vmalloc_fault;
> #endif
> -
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, address);
> @@ -142,6 +142,7 @@ retry:
> } else {
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> }
>
> /* If for any reason at all we couldn't handle the fault,
> diff --git a/arch/arc/mm/fault.c b/arch/arc/mm/fault.c
> index 6b0bb41..d63f3de 100644
> --- a/arch/arc/mm/fault.c
> +++ b/arch/arc/mm/fault.c
> @@ -60,8 +60,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
> siginfo_t info;
> int fault, ret;
> int write = regs->ecr_cause & ECR_C_PROTV_STORE; /* ST/EX */
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> /*
> * We fault-in kernel-space virtual memory on-demand. The
> @@ -89,6 +88,8 @@ void do_page_fault(struct pt_regs *regs, unsigned long address)
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, address);
> @@ -117,6 +118,7 @@ good_area:
> if (write) {
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> } else {
> if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
> goto bad_area;
> diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
> index 217bcbf..eb8830a 100644
> --- a/arch/arm/mm/fault.c
> +++ b/arch/arm/mm/fault.c
> @@ -261,9 +261,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
> struct task_struct *tsk;
> struct mm_struct *mm;
> int fault, sig, code;
> - int write = fsr & FSR_WRITE;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> if (notify_page_fault(regs, fsr))
> return 0;
> @@ -282,6 +280,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> + if (fsr & FSR_WRITE)
> + flags |= FAULT_FLAG_WRITE;
> +
> /*
> * As per x86, we may deadlock here. However, since the kernel only
> * validly references user space from well defined areas of the code,
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index dab1cfd..12205b4 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -208,9 +208,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
> struct task_struct *tsk;
> struct mm_struct *mm;
> int fault, sig, code;
> - bool write = (esr & ESR_WRITE) && !(esr & ESR_CM);
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> tsk = current;
> mm = tsk->mm;
> @@ -226,6 +224,11 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr,
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> + if ((esr & ESR_WRITE) && !(esr & ESR_CM))
> + flags |= FAULT_FLAG_WRITE;
> +
> /*
> * As per x86, we may deadlock here. However, since the kernel only
> * validly references user space from well defined areas of the code,
> diff --git a/arch/avr32/mm/fault.c b/arch/avr32/mm/fault.c
> index 2ca27b0..0eca933 100644
> --- a/arch/avr32/mm/fault.c
> +++ b/arch/avr32/mm/fault.c
> @@ -86,6 +86,8 @@ asmlinkage void do_page_fault(unsigned long ecr, struct pt_regs *regs)
>
> local_irq_enable();
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
>
> diff --git a/arch/cris/mm/fault.c b/arch/cris/mm/fault.c
> index 73312ab..1790f22 100644
> --- a/arch/cris/mm/fault.c
> +++ b/arch/cris/mm/fault.c
> @@ -58,8 +58,7 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
> struct vm_area_struct * vma;
> siginfo_t info;
> int fault;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - ((writeaccess & 1) ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> D(printk(KERN_DEBUG
> "Page fault for %lX on %X at %lX, prot %d write %d\n",
> @@ -117,6 +116,8 @@ do_page_fault(unsigned long address, struct pt_regs *regs,
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, address);
> @@ -155,6 +156,7 @@ retry:
> } else if (writeaccess == 1) {
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> } else {
> if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
> goto bad_area;
> diff --git a/arch/frv/mm/fault.c b/arch/frv/mm/fault.c
> index 331c1e2..9a66372 100644
> --- a/arch/frv/mm/fault.c
> +++ b/arch/frv/mm/fault.c
> @@ -34,11 +34,11 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
> struct vm_area_struct *vma;
> struct mm_struct *mm;
> unsigned long _pme, lrai, lrad, fixup;
> + unsigned long flags = 0;
> siginfo_t info;
> pgd_t *pge;
> pud_t *pue;
> pte_t *pte;
> - int write;
> int fault;
>
> #if 0
> @@ -81,6 +81,9 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(__frame))
> + flags |= FAULT_FLAG_USER;
> +
> down_read(&mm->mmap_sem);
>
> vma = find_vma(mm, ear0);
> @@ -129,7 +132,6 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
> */
> good_area:
> info.si_code = SEGV_ACCERR;
> - write = 0;
> switch (esr0 & ESR0_ATXC) {
> default:
> /* handle write to write protected page */
> @@ -140,7 +142,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
> #endif
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> - write = 1;
> + flags |= FAULT_FLAG_WRITE;
> break;
>
> /* handle read from protected page */
> @@ -162,7 +164,7 @@ asmlinkage void do_page_fault(int datammu, unsigned long esr0, unsigned long ear
> * make sure we exit gracefully rather than endlessly redo
> * the fault.
> */
> - fault = handle_mm_fault(mm, vma, ear0, write ? FAULT_FLAG_WRITE : 0);
> + fault = handle_mm_fault(mm, vma, ear0, flags);
> if (unlikely(fault & VM_FAULT_ERROR)) {
> if (fault & VM_FAULT_OOM)
> goto out_of_memory;
> diff --git a/arch/hexagon/mm/vm_fault.c b/arch/hexagon/mm/vm_fault.c
> index 1bd276d..8704c93 100644
> --- a/arch/hexagon/mm/vm_fault.c
> +++ b/arch/hexagon/mm/vm_fault.c
> @@ -53,8 +53,7 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
> int si_code = SEGV_MAPERR;
> int fault;
> const struct exception_table_entry *fixup;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (cause > 0 ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> /*
> * If we're in an interrupt or have no user context,
> @@ -65,6 +64,8 @@ void do_page_fault(unsigned long address, long cause, struct pt_regs *regs)
>
> local_irq_enable();
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, address);
> @@ -96,6 +97,7 @@ good_area:
> case FLT_STORE:
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> break;
> }
>
> diff --git a/arch/ia64/mm/fault.c b/arch/ia64/mm/fault.c
> index 6cf0341..7225dad 100644
> --- a/arch/ia64/mm/fault.c
> +++ b/arch/ia64/mm/fault.c
> @@ -90,8 +90,6 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
> mask = ((((isr >> IA64_ISR_X_BIT) & 1UL) << VM_EXEC_BIT)
> | (((isr >> IA64_ISR_W_BIT) & 1UL) << VM_WRITE_BIT));
>
> - flags |= ((mask & VM_WRITE) ? FAULT_FLAG_WRITE : 0);
> -
> /* mmap_sem is performance critical.... */
> prefetchw(&mm->mmap_sem);
>
> @@ -119,6 +117,10 @@ ia64_do_page_fault (unsigned long address, unsigned long isr, struct pt_regs *re
> if (notify_page_fault(regs, TRAP_BRKPT))
> return;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> + if (mask & VM_WRITE)
> + flags |= FAULT_FLAG_WRITE;
> retry:
> down_read(&mm->mmap_sem);
>
> diff --git a/arch/m32r/mm/fault.c b/arch/m32r/mm/fault.c
> index 3cdfa9c..e9c6a80 100644
> --- a/arch/m32r/mm/fault.c
> +++ b/arch/m32r/mm/fault.c
> @@ -78,7 +78,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
> struct mm_struct *mm;
> struct vm_area_struct * vma;
> unsigned long page, addr;
> - int write;
> + unsigned long flags = 0;
> int fault;
> siginfo_t info;
>
> @@ -117,6 +117,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
> if (in_atomic() || !mm)
> goto bad_area_nosemaphore;
>
> + if (error_code & ACE_USERMODE)
> + flags |= FAULT_FLAG_USER;
> +
> /* When running in the kernel we expect faults to occur only to
> * addresses in user space. All other faults represent errors in the
> * kernel and should generate an OOPS. Unfortunately, in the case of an
> @@ -166,14 +169,13 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code,
> */
> good_area:
> info.si_code = SEGV_ACCERR;
> - write = 0;
> switch (error_code & (ACE_WRITE|ACE_PROTECTION)) {
> default: /* 3: write, present */
> /* fall through */
> case ACE_WRITE: /* write, not present */
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> - write++;
> + flags |= FAULT_FLAG_WRITE;
> break;
> case ACE_PROTECTION: /* read, present */
> case 0: /* read, not present */
> @@ -194,7 +196,7 @@ good_area:
> */
> addr = (address & PAGE_MASK);
> set_thread_fault_code(error_code);
> - fault = handle_mm_fault(mm, vma, addr, write ? FAULT_FLAG_WRITE : 0);
> + fault = handle_mm_fault(mm, vma, addr, flags);
> if (unlikely(fault & VM_FAULT_ERROR)) {
> if (fault & VM_FAULT_OOM)
> goto out_of_memory;
> diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c
> index a563727..eb1d61f 100644
> --- a/arch/m68k/mm/fault.c
> +++ b/arch/m68k/mm/fault.c
> @@ -88,6 +88,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
>
> diff --git a/arch/metag/mm/fault.c b/arch/metag/mm/fault.c
> index 8fddf46..332680e 100644
> --- a/arch/metag/mm/fault.c
> +++ b/arch/metag/mm/fault.c
> @@ -53,8 +53,7 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
> struct vm_area_struct *vma, *prev_vma;
> siginfo_t info;
> int fault;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write_access ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> tsk = current;
>
> @@ -109,6 +108,8 @@ int do_page_fault(struct pt_regs *regs, unsigned long address,
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
>
> @@ -121,6 +122,7 @@ good_area:
> if (write_access) {
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> } else {
> if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
> goto bad_area;
> diff --git a/arch/microblaze/mm/fault.c b/arch/microblaze/mm/fault.c
> index 731f739..fa4cf52 100644
> --- a/arch/microblaze/mm/fault.c
> +++ b/arch/microblaze/mm/fault.c
> @@ -92,8 +92,7 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
> int code = SEGV_MAPERR;
> int is_write = error_code & ESR_S;
> int fault;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (is_write ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> regs->ear = address;
> regs->esr = error_code;
> @@ -121,6 +120,9 @@ void do_page_fault(struct pt_regs *regs, unsigned long address,
> die("Weird page fault", regs, SIGSEGV);
> }
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> +
> /* When running in the kernel we expect faults to occur only to
> * addresses in user space. All other faults represent errors in the
> * kernel and should generate an OOPS. Unfortunately, in the case of an
> @@ -199,6 +201,7 @@ good_area:
> if (unlikely(is_write)) {
> if (unlikely(!(vma->vm_flags & VM_WRITE)))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> /* a read */
> } else {
> /* protection fault */
> diff --git a/arch/mips/mm/fault.c b/arch/mips/mm/fault.c
> index 94d3a31..becc42b 100644
> --- a/arch/mips/mm/fault.c
> +++ b/arch/mips/mm/fault.c
> @@ -42,8 +42,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
> const int field = sizeof(unsigned long) * 2;
> siginfo_t info;
> int fault;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> #if 0
> printk("Cpu%d[%s:%d:%0*lx:%ld:%0*lx]\n", raw_smp_processor_id(),
> @@ -93,6 +92,8 @@ static void __kprobes __do_page_fault(struct pt_regs *regs, unsigned long write,
> if (in_atomic() || !mm)
> goto bad_area_nosemaphore;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, address);
> @@ -114,6 +115,7 @@ good_area:
> if (write) {
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> } else {
> if (cpu_has_rixi) {
> if (address == regs->cp0_epc && !(vma->vm_flags & VM_EXEC)) {
> diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c
> index 8a2e6de..3516cbd 100644
> --- a/arch/mn10300/mm/fault.c
> +++ b/arch/mn10300/mm/fault.c
> @@ -171,6 +171,8 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long fault_code,
> if (in_atomic() || !mm)
> goto no_context;
>
> + if ((fault_code & MMUFCR_xFC_ACCESS) == MMUFCR_xFC_ACCESS_USR)
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
>
> diff --git a/arch/openrisc/mm/fault.c b/arch/openrisc/mm/fault.c
> index 4a41f84..0703acf 100644
> --- a/arch/openrisc/mm/fault.c
> +++ b/arch/openrisc/mm/fault.c
> @@ -86,6 +86,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long address,
> if (user_mode(regs)) {
> /* Exception was in userspace: reenable interrupts */
> local_irq_enable();
> + flags |= FAULT_FLAG_USER;
> } else {
> /* If exception was in a syscall, then IRQ's may have
> * been enabled or disabled. If they were enabled,
> diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
> index f247a34..d10d27a 100644
> --- a/arch/parisc/mm/fault.c
> +++ b/arch/parisc/mm/fault.c
> @@ -180,6 +180,10 @@ void do_page_fault(struct pt_regs *regs, unsigned long code,
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> + if (acc_type & VM_WRITE)
> + flags |= FAULT_FLAG_WRITE;
> retry:
> down_read(&mm->mmap_sem);
> vma = find_vma_prev(mm, address, &prev_vma);
> @@ -203,8 +207,7 @@ good_area:
> * fault.
> */
>
> - fault = handle_mm_fault(mm, vma, address,
> - flags | ((acc_type & VM_WRITE) ? FAULT_FLAG_WRITE : 0));
> + fault = handle_mm_fault(mm, vma, address, flags);
>
> if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
> return;
> diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
> index 8726779..d9196c9 100644
> --- a/arch/powerpc/mm/fault.c
> +++ b/arch/powerpc/mm/fault.c
> @@ -223,9 +223,6 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
> is_write = error_code & ESR_DST;
> #endif /* CONFIG_4xx || CONFIG_BOOKE */
>
> - if (is_write)
> - flags |= FAULT_FLAG_WRITE;
> -
> #ifdef CONFIG_PPC_ICSWX
> /*
> * we need to do this early because this "data storage
> @@ -280,6 +277,9 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
>
> perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> +
> /* When running in the kernel we expect faults to occur only to
> * addresses in user space. All other faults represent errors in the
> * kernel and should generate an OOPS. Unfortunately, in the case of an
> @@ -408,6 +408,7 @@ good_area:
> } else if (is_write) {
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> /* a read */
> } else {
> /* protection fault */
> diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
> index f00aefb..6fa7b05 100644
> --- a/arch/s390/mm/fault.c
> +++ b/arch/s390/mm/fault.c
> @@ -302,6 +302,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
> address = trans_exc_code & __FAIL_ADDR_MASK;
> perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
> flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
> + if (regs->psw.mask & PSW_MASK_PSTATE)
> + flags |= FAULT_FLAG_USER;
> if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
> flags |= FAULT_FLAG_WRITE;
> down_read(&mm->mmap_sem);
> diff --git a/arch/score/mm/fault.c b/arch/score/mm/fault.c
> index 4b71a62..52238983 100644
> --- a/arch/score/mm/fault.c
> +++ b/arch/score/mm/fault.c
> @@ -47,6 +47,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
> struct task_struct *tsk = current;
> struct mm_struct *mm = tsk->mm;
> const int field = sizeof(unsigned long) * 2;
> + unsigned long flags = 0;
> siginfo_t info;
> int fault;
>
> @@ -75,6 +76,9 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long write,
> if (in_atomic() || !mm)
> goto bad_area_nosemaphore;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> +
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, address);
> if (!vma)
> @@ -95,6 +99,7 @@ good_area:
> if (write) {
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> } else {
> if (!(vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)))
> goto bad_area;
> @@ -105,7 +110,7 @@ good_area:
> * make sure we exit gracefully rather than endlessly redo
> * the fault.
> */
> - fault = handle_mm_fault(mm, vma, address, write);
> + fault = handle_mm_fault(mm, vma, address, flags);
> if (unlikely(fault & VM_FAULT_ERROR)) {
> if (fault & VM_FAULT_OOM)
> goto out_of_memory;
> diff --git a/arch/sh/mm/fault.c b/arch/sh/mm/fault.c
> index 1f49c28..541dc61 100644
> --- a/arch/sh/mm/fault.c
> +++ b/arch/sh/mm/fault.c
> @@ -400,9 +400,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
> struct mm_struct *mm;
> struct vm_area_struct * vma;
> int fault;
> - int write = error_code & FAULT_CODE_WRITE;
> - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write ? FAULT_FLAG_WRITE : 0));
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> tsk = current;
> mm = tsk->mm;
> @@ -476,6 +474,11 @@ good_area:
>
> set_thread_fault_code(error_code);
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> + if (error_code & FAULT_CODE_WRITE)
> + flags |= FAULT_FLAG_WRITE;
> +
> /*
> * If for any reason at all we couldn't handle the fault,
> * make sure we exit gracefully rather than endlessly redo
> diff --git a/arch/sparc/mm/fault_32.c b/arch/sparc/mm/fault_32.c
> index e98bfda..59dbd46 100644
> --- a/arch/sparc/mm/fault_32.c
> +++ b/arch/sparc/mm/fault_32.c
> @@ -177,8 +177,7 @@ asmlinkage void do_sparc_fault(struct pt_regs *regs, int text_fault, int write,
> unsigned long g2;
> int from_user = !(regs->psr & PSR_PS);
> int fault, code;
> - unsigned int flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write ? FAULT_FLAG_WRITE : 0));
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> if (text_fault)
> address = regs->pc;
> @@ -235,6 +234,11 @@ good_area:
> goto bad_area;
> }
>
> + if (from_user)
> + flags |= FAULT_FLAG_USER;
> + if (write)
> + flags |= FAULT_FLAG_WRITE;
> +
> /*
> * If for any reason at all we couldn't handle the fault,
> * make sure we exit gracefully rather than endlessly redo
> @@ -383,6 +387,7 @@ static void force_user_fault(unsigned long address, int write)
> struct vm_area_struct *vma;
> struct task_struct *tsk = current;
> struct mm_struct *mm = tsk->mm;
> + unsigned int flags = FAULT_FLAG_USER;
> int code;
>
> code = SEGV_MAPERR;
> @@ -402,11 +407,12 @@ good_area:
> if (write) {
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> } else {
> if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
> goto bad_area;
> }
> - switch (handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0)) {
> + switch (handle_mm_fault(mm, vma, address, flags)) {
> case VM_FAULT_SIGBUS:
> case VM_FAULT_OOM:
> goto do_sigbus;
> diff --git a/arch/sparc/mm/fault_64.c b/arch/sparc/mm/fault_64.c
> index 5062ff3..c08b9bb 100644
> --- a/arch/sparc/mm/fault_64.c
> +++ b/arch/sparc/mm/fault_64.c
> @@ -314,8 +314,9 @@ asmlinkage void __kprobes do_sparc64_fault(struct pt_regs *regs)
> } else {
> bad_kernel_pc(regs, address);
> return;
> - }
> - }
> + }
> + } else
> + flags |= FAULT_FLAG_USER;
>
> /*
> * If we're in an interrupt or have no user
> @@ -418,13 +419,14 @@ good_area:
> vma->vm_file != NULL)
> set_thread_fault_code(fault_code |
> FAULT_CODE_BLKCOMMIT);
> +
> + flags |= FAULT_FLAG_WRITE;
> } else {
> /* Allow reads even for write-only mappings */
> if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
> goto bad_area;
> }
>
> - flags |= ((fault_code & FAULT_CODE_WRITE) ? FAULT_FLAG_WRITE : 0);
> fault = handle_mm_fault(mm, vma, address, flags);
>
> if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
> diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
> index ac553ee..3ff289f 100644
> --- a/arch/tile/mm/fault.c
> +++ b/arch/tile/mm/fault.c
> @@ -280,8 +280,7 @@ static int handle_page_fault(struct pt_regs *regs,
> if (!is_page_fault)
> write = 1;
>
> - flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write ? FAULT_FLAG_WRITE : 0));
> + flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
>
> @@ -365,6 +364,9 @@ static int handle_page_fault(struct pt_regs *regs,
> goto bad_area_nosemaphore;
> }
>
> + if (!is_kernel_mode)
> + flags |= FAULT_FLAG_USER;
> +
> /*
> * When running in the kernel we expect faults to occur only to
> * addresses in user space. All other faults represent errors in the
> @@ -425,6 +427,7 @@ good_area:
> #endif
> if (!(vma->vm_flags & VM_WRITE))
> goto bad_area;
> + flags |= FAULT_FLAG_WRITE;
> } else {
> if (!is_page_fault || !(vma->vm_flags & VM_READ))
> goto bad_area;
> diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
> index b2f5adf..5c3aef7 100644
> --- a/arch/um/kernel/trap.c
> +++ b/arch/um/kernel/trap.c
> @@ -30,8 +30,7 @@ int handle_page_fault(unsigned long address, unsigned long ip,
> pmd_t *pmd;
> pte_t *pte;
> int err = -EFAULT;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (is_write ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> *code_out = SEGV_MAPERR;
>
> @@ -42,6 +41,8 @@ int handle_page_fault(unsigned long address, unsigned long ip,
> if (in_atomic())
> goto out_nosemaphore;
>
> + if (is_user)
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, address);
> @@ -58,12 +59,15 @@ retry:
>
> good_area:
> *code_out = SEGV_ACCERR;
> - if (is_write && !(vma->vm_flags & VM_WRITE))
> - goto out;
> -
> - /* Don't require VM_READ|VM_EXEC for write faults! */
> - if (!is_write && !(vma->vm_flags & (VM_READ | VM_EXEC)))
> - goto out;
> + if (is_write) {
> + if (!(vma->vm_flags & VM_WRITE))
> + goto out;
> + flags |= FAULT_FLAG_WRITE;
> + } else {
> + /* Don't require VM_READ|VM_EXEC for write faults! */
> + if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
> + goto out;
> + }
>
> do {
> int fault;
> diff --git a/arch/unicore32/mm/fault.c b/arch/unicore32/mm/fault.c
> index 8ed3c45..0dc922d 100644
> --- a/arch/unicore32/mm/fault.c
> +++ b/arch/unicore32/mm/fault.c
> @@ -209,8 +209,7 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
> struct task_struct *tsk;
> struct mm_struct *mm;
> int fault, sig, code;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - ((!(fsr ^ 0x12)) ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> tsk = current;
> mm = tsk->mm;
> @@ -222,6 +221,11 @@ static int do_pf(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
> if (in_atomic() || !mm)
> goto no_context;
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> + if (!(fsr ^ 0x12))
> + flags |= FAULT_FLAG_WRITE;
> +
> /*
> * As per x86, we may deadlock here. However, since the kernel only
> * validly references user space from well defined areas of the code,
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index 654be4a..6d77c38 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -1011,9 +1011,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
> unsigned long address;
> struct mm_struct *mm;
> int fault;
> - int write = error_code & PF_WRITE;
> - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
> - (write ? FAULT_FLAG_WRITE : 0);
> + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
>
> tsk = current;
> mm = tsk->mm;
> @@ -1083,6 +1081,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
> if (user_mode_vm(regs)) {
> local_irq_enable();
> error_code |= PF_USER;
> + flags |= FAULT_FLAG_USER;
> } else {
> if (regs->flags & X86_EFLAGS_IF)
> local_irq_enable();
> @@ -1109,6 +1108,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code)
> return;
> }
>
> + if (error_code & PF_WRITE)
> + flags |= FAULT_FLAG_WRITE;
> +
> /*
> * When running in the kernel we expect faults to occur only to
> * addresses in user space. All other faults represent errors in
> diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c
> index 4b7bc8d..70fa7bc 100644
> --- a/arch/xtensa/mm/fault.c
> +++ b/arch/xtensa/mm/fault.c
> @@ -72,6 +72,8 @@ void do_page_fault(struct pt_regs *regs)
> address, exccause, regs->pc, is_write? "w":"", is_exec? "x":"");
> #endif
>
> + if (user_mode(regs))
> + flags |= FAULT_FLAG_USER;
> retry:
> down_read(&mm->mmap_sem);
> vma = find_vma(mm, address);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index d5c82dc..c51fc32 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -170,6 +170,7 @@ extern pgprot_t protection_map[16];
> #define FAULT_FLAG_RETRY_NOWAIT 0x10 /* Don't drop mmap_sem and wait when retrying */
> #define FAULT_FLAG_KILLABLE 0x20 /* The fault task is in SIGKILL killable region */
> #define FAULT_FLAG_TRIED 0x40 /* second try */
> +#define FAULT_FLAG_USER 0x80 /* The fault originated in userspace */
>
> /*
> * vm_fault is filled by the the pagefault handler and passed to the vma's
> --
> 1.8.3.2
>
--
Michal Hocko
SUSE Labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists