Allow major faults to drop the mmap_sem read lock while waiting for IO to
complete.  This is done by flagging in current->flags that the caller can
tolerate a retry in the ->nopage path (a minimal usage sketch for other
->nopage implementations follows the patch).

Benchmarks indicate that the extra data-structure walk in the major-fault
case costs << 1% in performance.  (The test was to balloon and drain the
page cache, then have 64 threads fault in pages from disk in reverse order.)

Signed-off-by: Mike Waychison

Differences ...

 arch/i386/mm/fault.c   |   30 ++++++++++++++++++++++++++----
 arch/x86_64/mm/fault.c |   30 ++++++++++++++++++++++++++----
 include/linux/mm.h     |    2 ++
 include/linux/sched.h  |    1 +
 mm/filemap.c           |   22 +++++++++++++++++++++-
 mm/memory.c            |    4 ++++
 6 files changed, 80 insertions(+), 9 deletions(-)

Index: linux-2.6.18-rc7/arch/i386/mm/fault.c
===================================================================
--- linux-2.6.18-rc7.orig/arch/i386/mm/fault.c	2006-09-19 16:04:21.000000000 -0700
+++ linux-2.6.18-rc7/arch/i386/mm/fault.c	2006-09-19 16:08:15.000000000 -0700
@@ -325,8 +325,8 @@ static inline int vmalloc_fault(unsigned
  *	bit 3 == 1 means use of reserved bit detected
  *	bit 4 == 1 means fault was an instruction fetch
  */
-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
-				      unsigned long error_code)
+static inline void __do_page_fault(struct pt_regs *regs,
+				   unsigned long error_code)
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
@@ -407,7 +407,7 @@ fastcall void __kprobes do_page_fault(st
 			goto bad_area_nosemaphore;
 		down_read(&mm->mmap_sem);
 	}
-
+retry:
 	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
@@ -461,7 +461,15 @@ good_area:
 	 */
 	switch (handle_mm_fault(mm, vma, address, write)) {
 		case VM_FAULT_MINOR:
-			tsk->min_flt++;
+			/*
+			 * If we had to retry (PF_FAULT_MAYRETRY cleared), then
+			 * the page originally wasn't up to date before the
+			 * retry, but now it is.
+			 */
+			if (!(current->flags & PF_FAULT_MAYRETRY))
+				tsk->maj_flt++;
+			else
+				tsk->min_flt++;
 			break;
 		case VM_FAULT_MAJOR:
 			tsk->maj_flt++;
@@ -470,6 +478,12 @@ good_area:
 			goto do_sigbus;
 		case VM_FAULT_OOM:
 			goto out_of_memory;
+		case VM_FAULT_RETRY:
+			if (current->flags & PF_FAULT_MAYRETRY) {
+				current->flags &= ~PF_FAULT_MAYRETRY;
+				goto retry;
+			}
+			BUG();
 		default:
 			BUG();
 	}
@@ -625,6 +639,14 @@ do_sigbus:
 		force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk);
 }
 
+fastcall void __kprobes do_page_fault(struct pt_regs *regs,
+				      unsigned long error_code)
+{
+	current->flags |= PF_FAULT_MAYRETRY;
+	__do_page_fault(regs, error_code);
+	current->flags &= ~PF_FAULT_MAYRETRY;
+}
+
 #ifndef CONFIG_X86_PAE
 void vmalloc_sync_all(void)
 {
Index: linux-2.6.18-rc7/arch/x86_64/mm/fault.c
===================================================================
--- linux-2.6.18-rc7.orig/arch/x86_64/mm/fault.c	2006-09-19 16:04:28.000000000 -0700
+++ linux-2.6.18-rc7/arch/x86_64/mm/fault.c	2006-09-19 16:09:02.000000000 -0700
@@ -336,8 +336,8 @@ int exception_trace = 1;
  * and the problem, and then passes it off to one of the appropriate
  * routines.
  */
-asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
-					unsigned long error_code)
+static inline void __do_page_fault(struct pt_regs *regs,
+				   unsigned long error_code)
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
@@ -435,7 +435,7 @@ asmlinkage void __kprobes do_page_fault(
 			goto bad_area_nosemaphore;
 		down_read(&mm->mmap_sem);
 	}
-
+retry:
 	vma = find_vma(mm, address);
 	if (!vma)
 		goto bad_area;
@@ -481,13 +481,27 @@ good_area:
 	 */
 	switch (handle_mm_fault(mm, vma, address, write)) {
 	case VM_FAULT_MINOR:
-		tsk->min_flt++;
+		/*
+		 * If we had to retry (PF_FAULT_MAYRETRY cleared), then the
+		 * page originally wasn't up to date before the retry, but now
+		 * it is.
+		 */
+		if (!(current->flags & PF_FAULT_MAYRETRY))
+			tsk->maj_flt++;
+		else
+			tsk->min_flt++;
 		break;
 	case VM_FAULT_MAJOR:
 		tsk->maj_flt++;
 		break;
 	case VM_FAULT_SIGBUS:
 		goto do_sigbus;
+	case VM_FAULT_RETRY:
+		if (current->flags & PF_FAULT_MAYRETRY) {
+			current->flags &= ~PF_FAULT_MAYRETRY;
+			goto retry;
+		}
+		BUG();
 	default:
 		goto out_of_memory;
 	}
@@ -613,6 +627,14 @@ do_sigbus:
 	return;
 }
 
+asmlinkage void __kprobes do_page_fault(struct pt_regs *regs,
+					unsigned long error_code)
+{
+	current->flags |= PF_FAULT_MAYRETRY;
+	__do_page_fault(regs, error_code);
+	current->flags &= ~PF_FAULT_MAYRETRY;
+}
+
 DEFINE_SPINLOCK(pgd_lock);
 struct page *pgd_list;
Index: linux-2.6.18-rc7/include/linux/mm.h
===================================================================
--- linux-2.6.18-rc7.orig/include/linux/mm.h	2006-09-19 16:04:53.000000000 -0700
+++ linux-2.6.18-rc7/include/linux/mm.h	2006-09-19 16:05:33.000000000 -0700
@@ -623,6 +623,7 @@ static inline int page_mapped(struct pag
  */
 #define NOPAGE_SIGBUS	(NULL)
 #define NOPAGE_OOM	((struct page *) (-1))
+#define NOPAGE_RETRY	((struct page *) (-2))
 
 /*
  * Different kinds of faults, as returned by handle_mm_fault().
@@ -633,6 +634,7 @@ static inline int page_mapped(struct pag
 #define VM_FAULT_SIGBUS	0x01
 #define VM_FAULT_MINOR	0x02
 #define VM_FAULT_MAJOR	0x03
+#define VM_FAULT_RETRY	0x04
 
 /*
  * Special case for get_user_pages.
Index: linux-2.6.18-rc7/mm/filemap.c
===================================================================
--- linux-2.6.18-rc7.orig/mm/filemap.c	2006-09-19 16:04:55.000000000 -0700
+++ linux-2.6.18-rc7/mm/filemap.c	2006-09-19 16:05:33.000000000 -0700
@@ -1486,7 +1486,27 @@ page_not_uptodate:
 		majmin = VM_FAULT_MAJOR;
 		count_vm_event(PGMAJFAULT);
 	}
-	lock_page(page);
+
+	if (!(current->flags & PF_FAULT_MAYRETRY)) {
+		lock_page(page);
+	} else if (TestSetPageLocked(page)) {
+		struct mm_struct *mm = area->vm_mm;
+
+		/*
+		 * Page is already locked by someone else.
+		 *
+		 * We don't want to be holding down_read(mmap_sem)
+		 * inside lock_page(), so use wait_on_page_locked() here.
+		 */
+		up_read(&mm->mmap_sem);
+		wait_on_page_locked(page);
+		down_read(&mm->mmap_sem);
+		/*
+		 * The VMA tree may have changed at this point.
+		 */
+		page_cache_release(page);
+		return NOPAGE_RETRY;
+	}
 
 	/* Did it get unhashed while we waited for it? */
 	if (!page->mapping) {
Index: linux-2.6.18-rc7/mm/memory.c
===================================================================
--- linux-2.6.18-rc7.orig/mm/memory.c	2006-09-19 16:04:55.000000000 -0700
+++ linux-2.6.18-rc7/mm/memory.c	2006-09-19 16:05:33.000000000 -0700
@@ -2122,6 +2122,10 @@ retry:
 		return VM_FAULT_SIGBUS;
 	if (new_page == NOPAGE_OOM)
 		return VM_FAULT_OOM;
+	/* page may be available, but we have to restart the process because
+	 * mmap_sem was dropped during the ->nopage */
+	if (new_page == NOPAGE_RETRY)
+		return VM_FAULT_RETRY;
 
 	/*
 	 * Should we do an early C-O-W break?
Index: linux-2.6.18-rc7/include/linux/sched.h
===================================================================
--- linux-2.6.18-rc7.orig/include/linux/sched.h	2006-09-19 16:04:54.000000000 -0700
+++ linux-2.6.18-rc7/include/linux/sched.h	2006-09-19 16:10:01.000000000 -0700
@@ -1054,6 +1054,7 @@ static inline void put_task_struct(struc
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x01000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x02000000	/* Spread some slab caches over cpuset */
+#define PF_FAULT_MAYRETRY 0x04000000	/* I may drop mmap_sem during fault */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 #define PF_MUTEX_TESTER	0x20000000	/* Thread belongs to the rt mutex tester */
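
For reference, here is a minimal sketch (not part of the patch) of how some
other ->nopage implementation could cooperate with the new flag.  The function
name example_nopage, the mapping/offset computation and the elided read-in
step are hypothetical illustration only; PF_FAULT_MAYRETRY, NOPAGE_RETRY and
the page/locking helpers are the ones added here or already present in
2.6.18.  It mirrors the filemap.c hunk above: if the caller can tolerate a
retry, drop mmap_sem instead of sleeping in lock_page() with it held, and
return NOPAGE_RETRY so the arch fault handler re-walks the VMA tree.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/sched.h>

static struct page *example_nopage(struct vm_area_struct *area,
				   unsigned long address, int *type)
{
	struct address_space *mapping = area->vm_file->f_mapping;
	unsigned long pgoff;
	struct page *page;

	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
		+ area->vm_pgoff;

	page = find_get_page(mapping, pgoff);
	if (!page)
		return NOPAGE_SIGBUS;	/* a real handler would start IO here */

	if (!PageUptodate(page)) {
		if (!(current->flags & PF_FAULT_MAYRETRY)) {
			/* Caller cannot retry: sleep with mmap_sem held. */
			lock_page(page);
		} else if (TestSetPageLocked(page)) {
			/*
			 * Caller tolerates a retry: drop mmap_sem while
			 * waiting for whoever holds the page lock, then ask
			 * the fault path to re-walk the (possibly changed)
			 * VMA tree.
			 */
			up_read(&area->vm_mm->mmap_sem);
			wait_on_page_locked(page);
			down_read(&area->vm_mm->mmap_sem);
			page_cache_release(page);
			return NOPAGE_RETRY;
		}
		/* ... bring the page up to date here (IO elided) ... */
		unlock_page(page);
	}

	if (type)
		*type = VM_FAULT_MINOR;
	return page;
}

The point, as in the filemap.c change, is that wait_on_page_locked() is used
rather than lock_page() so mmap_sem is never held while sleeping on page IO;
the cost is one extra find_vma()/page-cache walk on the retry, which is what
the benchmark quoted above measures.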