Allow major faults to drop the mmap_sem read lock while waiting for IO to
complete.  This is done by flagging in current->flags that the caller can
tolerate a retry in the ->nopage path (a minimal usage sketch for other
->nopage implementations follows the patch).

Benchmarks indicate that the extra data-structure walk in the major-fault
case costs << 1% in performance.  (The test was to balloon and drain the
page cache, then have 64 threads fault in pages from disk in reverse order.)

Signed-off-by: Mike Waychison

Differences ...

 arch/i386/mm/fault.c   |   30 ++++++++++++++++++++++++++----
 arch/x86_64/mm/fault.c |   30 ++++++++++++++++++++++++++----
 include/linux/mm.h     |    2 ++
 include/linux/sched.h  |    1 +
 mm/filemap.c           |   22 +++++++++++++++++++++-
 mm/memory.c            |    4 ++++
 6 files changed, 80 insertions(+), 9 deletions(-)

Index: linux-2.6.18-rc7/arch/i386/mm/fault.c
===================================================================
--- linux-2.6.18-rc7.orig/arch/i386/mm/fault.c	2006-09-19 16:04:21.000000000 -0700
+++ linux-2.6.18-rc7/arch/i386/mm/fault.c	2006-09-19 16:08:15.000000000 -0700
@@ -325,8 +325,8 @@ static inline int vmalloc_fault(unsigned
  *	bit 3 == 1 means use of reserved bit detected
  *	bit 4 == 1 means fault was an instruction fetch
  */
-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
-				      unsigned long error_code)
+static inline void __do_page_fault(struct pt_regs *regs,
+				   unsigned long error_code)
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
@@ -407,7 +407,7 @@ fastcall void __kprobes do_page_fault(st
 			goto bad_area_nosemaphore;
 		down_read(&mm->mmap_sem);
 	}
-
+retry:
 	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
@@ -461,7 +461,15 @@ good_area:
 	 */
 	switch (handle_mm_fault(mm, vma, address, write)) {
 		case VM_FAULT_MINOR:
-			tsk->min_flt++;
+			/*
+			 * If we had to retry (PF_FAULT_MAYRETRY cleared), then
+			 * the page originally wasn't up to date before the
+			 * retry, but now it is.
+			 */
+			if (!(current->flags & PF_FAULT_MAYRETRY))
+				tsk->maj_flt++;
+			else
+				tsk->min_flt++;
 			break;
 		case VM_FAULT_MAJOR:
 			tsk->maj_flt++;
@@ -470,6 +478,12 @@ good_area:
 			goto do_sigbus;
 		case VM_FAULT_OOM:
 			goto out_of_memory;
+		case VM_FAULT_RETRY:
+			if (current->flags & PF_FAULT_MAYRETRY) {
+				current->flags &= ~PF_FAULT_MAYRETRY;
+				goto retry;
+			}
+			BUG();
 		default:
 			BUG();
 	}
@@ -625,6 +639,14 @@ do_sigbus:
 		force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 }
 
+fastcall void __kprobes do_page_fault(struct pt_regs *regs,
+				      unsigned long error_code)
+{
+	current->flags |= PF_FAULT_MAYRETRY;
+	__do_page_fault(regs, error_code);
+	current->flags &= ~PF_FAULT_MAYRETRY;
+}
+
 #ifndef CONFIG_X86_PAE
 void vmalloc_sync_all(void)
 {
Index: linux-2.6.18-rc7/arch/x86_64/mm/fault.c
===================================================================
--- linux-2.6.18-rc7.orig/arch/x86_64/mm/fault.c	2006-09-19 16:04:28.000000000 -0700
+++ linux-2.6.18-rc7/arch/x86_64/mm/fault.c	2006-09-19 16:09:02.000000000 -0700
@@ -336,8 +336,8 @@ int exception_trace = 1;
  * and the problem, and then passes it off to one of the appropriate
  * routines.
  */
-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
-					unsigned long error_code)
+static inline void __do_page_fault(struct pt_regs *regs,
+				   unsigned long error_code)
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
@@ -435,7 +435,7 @@ asmlinkage void __kprobes do_page_fault(
 			goto bad_area_nosemaphore;
 		down_read(&mm->mmap_sem);
 	}
-
+retry:
 	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
@@ -481,13 +481,27 @@ good_area:
 	 */
 	switch (handle_mm_fault(mm, vma, address, write)) {
 	case VM_FAULT_MINOR:
-		tsk->min_flt++;
+		/*
+		 * If we had to retry (PF_FAULT_MAYRETRY cleared), then the
+		 * page originally wasn't up to date before the retry, but now
+		 * it is.
+		 */
+		if (!(current->flags & PF_FAULT_MAYRETRY))
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
 		break;
 	case VM_FAULT_MAJOR:
 		tsk->maj_flt++;
 		break;
 	case VM_FAULT_SIGBUS:
 		goto do_sigbus;
+	case VM_FAULT_RETRY:
+		if (current->flags & PF_FAULT_MAYRETRY) {
+			current->flags &= ~PF_FAULT_MAYRETRY;
+			goto retry;
+		}
+		BUG();
 	default:
 		goto out_of_memory;
 	}
@@ -613,6 +627,14 @@ do_sigbus:
 	return;
 }
 
+asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
+					unsigned long error_code)
+{
+	current->flags |= PF_FAULT_MAYRETRY;
+	__do_page_fault(regs, error_code);
+	current->flags &= ~PF_FAULT_MAYRETRY;
+}
+
 DEFINE_SPINLOCK(pgd_lock);
 struct page *pgd_list;
Index: linux-2.6.18-rc7/include/linux/mm.h
===================================================================
--- linux-2.6.18-rc7.orig/include/linux/mm.h	2006-09-19 16:04:53.000000000 -0700
+++ linux-2.6.18-rc7/include/linux/mm.h	2006-09-19 16:05:33.000000000 -0700
@@ -623,6 +623,7 @@ static inline int page_mapped(struct pag
  */
 #define NOPAGE_SIGBUS	(NULL)
 #define NOPAGE_OOM	((struct page *) (-1))
+#define NOPAGE_RETRY	((struct page *) (-2))
 
 /*
  * Different kinds of faults, as returned by handle_mm_fault().
@@ -633,6 +634,7 @@ static inline int page_mapped(struct pag
 #define VM_FAULT_SIGBUS	0x01
 #define VM_FAULT_MINOR	0x02
 #define VM_FAULT_MAJOR	0x03
+#define VM_FAULT_RETRY	0x04
 
 /*
  * Special case for get_user_pages.
Index: linux-2.6.18-rc7/mm/filemap.c
===================================================================
--- linux-2.6.18-rc7.orig/mm/filemap.c	2006-09-19 16:04:55.000000000 -0700
+++ linux-2.6.18-rc7/mm/filemap.c	2006-09-19 16:05:33.000000000 -0700
@@ -1486,7 +1486,27 @@ page_not_uptodate:
 		majmin = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
 	}
-	lock_page(page);
+
+	if (!(current->flags & PF_FAULT_MAYRETRY)) {
+		lock_page(page);
+	} else if (TestSetPageLocked(page)) {
+		struct mm_struct *mm = area->vm_mm;
+
+		/*
+		 * Page is already locked by someone else.
+		 *
+		 * We don't want to be holding down_read(mmap_sem)
+		 * inside lock_page(), so use wait_on_page_locked() here.
+		 */
+		up_read(&mm->mmap_sem);
+		wait_on_page_locked(page);
+		down_read(&mm->mmap_sem);
+		/*
+		 * The VMA tree may have changed at this point.
+		 */
+		page_cache_release(page);
+		return NOPAGE_RETRY;
+	}
 
 	/* Did it get unhashed while we waited for it? */
 	if (!page->mapping) {
Index: linux-2.6.18-rc7/mm/memory.c
===================================================================
--- linux-2.6.18-rc7.orig/mm/memory.c	2006-09-19 16:04:55.000000000 -0700
+++ linux-2.6.18-rc7/mm/memory.c	2006-09-19 16:05:33.000000000 -0700
@@ -2122,6 +2122,10 @@ retry:
 		return VM_FAULT_SIGBUS;
 	if (new_page == NOPAGE_OOM)
 		return VM_FAULT_OOM;
+	/* page may be available, but we have to restart the process because
+	 * mmap_sem was dropped during the ->nopage */
+	if (new_page == NOPAGE_RETRY)
+		return VM_FAULT_RETRY;
 
 	/*
 	 * Should we do an early C-O-W break?
Index: linux-2.6.18-rc7/include/linux/sched.h
===================================================================
--- linux-2.6.18-rc7.orig/include/linux/sched.h	2006-09-19 16:04:54.000000000 -0700
+++ linux-2.6.18-rc7/include/linux/sched.h	2006-09-19 16:10:01.000000000 -0700
@@ -1054,6 +1054,7 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_FAULT_MAYRETRY 0x04000000	/* I may drop mmap_sem during fault */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
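
For reference, here is a minimal sketch (not part of the patch) of how some
other ->nopage implementation could cooperate with the new flag.  The function
name example_nopage, the mapping/offset computation and the elided read-in
step are hypothetical illustration only; PF_FAULT_MAYRETRY, NOPAGE_RETRY and
the page/locking helpers are the ones added here or already present in
2.6.18.  It mirrors the filemap.c hunk above: if the caller can tolerate a
retry, drop mmap_sem instead of sleeping in lock_page() with it held, and
return NOPAGE_RETRY so the arch fault handler re-walks the VMA tree.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

static struct page *example_nopage(struct vm_area_struct *area,
				   unsigned long address, int *type)
{
	struct address_space *mapping = area->vm_file->f_mapping;
	unsigned long pgoff;
	struct page *page;

	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
		+ area->vm_pgoff;

	page = find_get_page(mapping, pgoff);
	if (!page)
		return NOPAGE_SIGBUS;	/* a real handler would start IO here */

	if (!PageUptodate(page)) {
		if (!(current->flags & PF_FAULT_MAYRETRY)) {
			/* Caller cannot retry: sleep with mmap_sem held. */
			lock_page(page);
		} else if (TestSetPageLocked(page)) {
			/*
			 * Caller tolerates a retry: drop mmap_sem while
			 * waiting for whoever holds the page lock, then ask
			 * the fault path to re-walk the (possibly changed)
			 * VMA tree.
			 */
			up_read(&area->vm_mm->mmap_sem);
			wait_on_page_locked(page);
			down_read(&area->vm_mm->mmap_sem);
			page_cache_release(page);
			return NOPAGE_RETRY;
		}
		/* ... bring the page up to date here (IO elided) ... */
		unlock_page(page);
	}

	if (type)
		*type = VM_FAULT_MINOR;
	return page;
}

The point, as in the filemap.c change, is that wait_on_page_locked() is used
rather than lock_page() so mmap_sem is never held while sleeping on page IO;
the cost is one extra find_vma()/page-cache walk on the retry, which is what
the benchmark quoted above measures.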