Date:	Fri, 18 Dec 2009 09:45:13 +0900
From:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
To:	KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
Cc:	Minchan Kim <minchan.kim@...il.com>,
	Andi Kleen <andi@...stfloor.org>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
	"linux-mm@...ck.org" <linux-mm@...ck.org>, cl@...ux-foundation.org,
	"akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
	"mingo@...e.hu" <mingo@...e.hu>
Subject: [RFC 3/4] lockless vma caching

To access a vma in a lockless fashion, some modification of the vma lookup path is required. Currently, an rb-tree is used, and it does not allow lookups to run concurrently with modification.

This is a trial of caching the vma rather than diving into the rb-tree. The vma of the last fault is cached in the page->cache field of the page-table page (pmd_page(*pmd)), and a reference count and a waitqueue are added to the vma.

The accessor will have to do something like this:

	vma = lookup_vma_cache(mm, address);
	if (vma) {
		if (mm_check_version(mm) && /* no writer at this point? */
		    (vma->vm_start <= address) && (vma->vm_end > address))
			goto found_vma; /* start the speculative job */
		else
			vma_release(vma);
		vma = NULL;
	}
	vma = find_vma(mm, address);
found_vma:
	....do some jobs....
	vma_release(vma);
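
On the writer side, any path that changes the vma layout has to invalidate the cache and wait for speculative users to drain before touching the vmas. Roughly, as a sketch of what vma_merge() and do_munmap() do in the patch below (with mm_write_lock() held):

	invalidate_vma_cache_range(mm, start, end);	/* clear page->cache under the pte lock */
	wait_vmas_cache_access(vma, end);	/* wait until each vma's cache_access drops to 0 */
	/* ... now it is safe to modify or unmap the vmas in [start, end) ... */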

Some more consideration of the invalidation points may be necessary.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
---
 include/linux/mm.h       |   20 +++++++++
 include/linux/mm_types.h |    5 ++
 mm/memory.c              |   14 ++++++
 mm/mmap.c                |  102 +++++++++++++++++++++++++++++++++++++++++++++--
 mm/page_alloc.c          |    1 +
 5 files changed, 138 insertions(+), 4 deletions(-)

Index: mmotm-mm-accessor/include/linux/mm.h
===================================================================
--- mmotm-mm-accessor.orig/include/linux/mm.h
+++ mmotm-mm-accessor/include/linux/mm.h
@@ -763,6 +763,26 @@ unsigned long unmap_vmas(struct mmu_gath
 		unsigned long end_addr, unsigned long *nr_accounted,
 		struct zap_details *);
 
+struct vm_area_struct *lookup_vma_cache(struct mm_struct *mm,
+		unsigned long address);
+void invalidate_vma_cache_range(struct mm_struct *mm,
+		unsigned long start, unsigned long end);
+void wait_vmas_cache_access(struct vm_area_struct *vma, unsigned long end);
+
+static inline void vma_hold(struct vm_area_struct *vma)
+{
+	atomic_inc(&vma->cache_access);
+}
+
+void __vma_release(struct vm_area_struct *vma);
+static inline void vma_release(struct vm_area_struct *vma)
+{
+	if (atomic_dec_and_test(&vma->cache_access)) {
+		if (waitqueue_active(&vma->cache_wait))
+			__vma_release(vma);
+	}
+}
+
 /**
  * mm_walk - callbacks for walk_page_range
  * @pgd_entry: if set, called for each non-empty PGD (top-level) entry
Index: mmotm-mm-accessor/include/linux/mm_types.h
===================================================================
--- mmotm-mm-accessor.orig/include/linux/mm_types.h
+++ mmotm-mm-accessor/include/linux/mm_types.h
@@ -12,6 +12,7 @@
 #include <linux/completion.h>
 #include <linux/cpumask.h>
 #include <linux/page-debug-flags.h>
+#include <linux/wait.h>
 #include <asm/page.h>
 #include <asm/mmu.h>
 
@@ -77,6 +78,7 @@ struct page {
 	union {
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* SLUB: freelist req. slab lock */
+		void *cache;
 	};
 	struct list_head lru;		/* Pageout list, eg. active_list
 					 * protected by zone->lru_lock !
@@ -180,6 +182,9 @@ struct vm_area_struct {
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 	unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
+	atomic_t cache_access;
+	wait_queue_head_t cache_wait;
+
 #ifndef CONFIG_MMU
 	struct vm_region *vm_region;	/* NOMMU mapping region */
 #endif
Index: mmotm-mm-accessor/mm/memory.c
===================================================================
--- mmotm-mm-accessor.orig/mm/memory.c
+++ mmotm-mm-accessor/mm/memory.c
@@ -145,6 +145,14 @@ void pmd_clear_bad(pmd_t *pmd)
 	pmd_clear(pmd);
 }
 
+static void update_vma_cache(pmd_t *pmd, struct vm_area_struct *vma)
+{
+	struct page *page;
+	/* the pte lock is held by the caller */
+	page = pmd_page(*pmd);
+	page->cache = vma;
+}
+
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
@@ -2118,6 +2126,7 @@ reuse:
 		if (ptep_set_access_flags(vma, address, page_table, entry,1))
 			update_mmu_cache(vma, address, entry);
 		ret |= VM_FAULT_WRITE;
+		update_vma_cache(pmd, vma);
 		goto unlock;
 	}
 
@@ -2186,6 +2195,7 @@ gotten:
 		 */
 		set_pte_at_notify(mm, address, page_table, entry);
 		update_mmu_cache(vma, address, entry);
+		update_vma_cache(pmd, vma);
 		if (old_page) {
 			/*
 			 * Only after switching the pte to the new page may
@@ -2626,6 +2636,7 @@ static int do_swap_page(struct mm_struct
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, pte);
+	update_vma_cache(pmd, vma);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
@@ -2691,6 +2702,7 @@ setpte:
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, address, entry);
+	update_vma_cache(pmd, vma);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return 0;
@@ -2852,6 +2864,7 @@ static int __do_fault(struct mm_struct *
 
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
+		update_vma_cache(pmd, vma);
 	} else {
 		if (charged)
 			mem_cgroup_uncharge_page(page);
@@ -2989,6 +3002,7 @@ static inline int handle_pte_fault(struc
 	entry = pte_mkyoung(entry);
 	if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
 		update_mmu_cache(vma, address, entry);
+		update_vma_cache(pmd, vma);
 	} else {
 		/*
 		 * This is needed only for protection faults but the arch code
Index: mmotm-mm-accessor/mm/mmap.c
===================================================================
--- mmotm-mm-accessor.orig/mm/mmap.c
+++ mmotm-mm-accessor/mm/mmap.c
@@ -187,6 +187,94 @@ error:
 	return -ENOMEM;
 }
 
+struct vm_area_struct *
+lookup_vma_cache(struct mm_struct *mm, unsigned long address)
+{
+	struct page *page;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	struct vm_area_struct *ret = NULL;
+
+	if (!mm)
+		return NULL;
+
+	preempt_disable();
+	pgd = pgd_offset(mm, address);
+	if (!pgd_present(*pgd))
+		goto out;
+	pud = pud_offset(pgd, address);
+	if (!pud_present(*pud))
+		goto out;
+	pmd = pmd_offset(pud, address);
+	if (!pmd_present(*pmd))
+		goto out;
+	page = pmd_page(*pmd);
+	if (PageReserved(page))
+		goto out;
+	ret = (struct vm_area_struct *)page->cache;
+	if (ret)
+		vma_hold(ret);
+out:
+	preempt_enable();
+	return ret;
+}
+
+void invalidate_vma_cache_range(struct mm_struct *mm,
+	unsigned long start, unsigned long end)
+{
+	struct page *page;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	unsigned long address = start;
+	spinlock_t *lock;
+
+	if (!mm)
+		return;
+
+	while (address < end) {
+		pgd = pgd_offset(mm, address);
+		if (!pgd_present(*pgd)) {
+			address = pgd_addr_end(address, end);
+			continue;
+		}
+		pud = pud_offset(pgd, address);
+		if (!pud_present(*pud)) {
+			address = pud_addr_end(address, end);
+			continue;
+		}
+		pmd = pmd_offset(pud, address);
+		if (pmd_present(*pmd)) {
+			page = pmd_page(*pmd);
+			/*
+			 * Taking the pte lock guarantees that there is no
+			 * race with a speculative page fault.
+			 */
+			lock = pte_lockptr(mm, pmd);
+			spin_lock(lock);
+			page->cache = NULL;
+			spin_unlock(lock);
+		}
+		address = pmd_addr_end(address, end);
+	}
+}
+
+/* called under mm_write_lock() */
+void wait_vmas_cache_access(struct vm_area_struct *vma, unsigned long end)
+{
+	while (vma && (vma->vm_start < end)) {
+		wait_event_interruptible(vma->cache_wait,
+				atomic_read(&vma->cache_access) == 0);
+		vma = vma->vm_next;
+	}
+}
+
+void __vma_release(struct vm_area_struct *vma)
+{
+	wake_up(&vma->cache_wait);
+}
+
 /*
  * Requires inode->i_mapping->i_mmap_lock
  */
@@ -406,6 +494,8 @@ void __vma_link_rb(struct mm_struct *mm,
 {
 	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 	rb_insert_color(&vma->vm_rb, &mm->mm_rb);
+	atomic_set(&vma->cache_access, 0);
+	init_waitqueue_head(&vma->cache_wait);
 }
 
 static void __vma_link_file(struct vm_area_struct *vma)
@@ -774,7 +864,8 @@ struct vm_area_struct *vma_merge(struct 
 	area = next;
 	if (next && next->vm_end == end)		/* cases 6, 7, 8 */
 		next = next->vm_next;
-
+	invalidate_vma_cache_range(mm, addr, end);
+	wait_vmas_cache_access(next, end);
 	/*
 	 * Can it merge with the predecessor?
 	 */
@@ -1162,7 +1253,6 @@ munmap_back:
 			return -ENOMEM;
 		vm_flags |= VM_ACCOUNT;
 	}
-
 	/*
 	 * Can we just expand an old mapping?
 	 */
@@ -1930,7 +2020,9 @@ int do_munmap(struct mm_struct *mm, unsi
 	end = start + len;
 	if (vma->vm_start >= end)
 		return 0;
-
+	/* Before going further, clear vma cache */
+	invalidate_vma_cache_range(mm, start, end);
+	wait_vmas_cache_access(vma, end);
 	/*
 	 * If we need to split any vma, do it now to save pain later.
 	 *
@@ -1940,7 +2032,6 @@ int do_munmap(struct mm_struct *mm, unsi
 	 */
 	if (start > vma->vm_start) {
 		int error;
-
 		/*
 		 * Make sure that map_count on return from munmap() will
 		 * not exceed its limit; but let map_count go just above
@@ -2050,6 +2141,7 @@ unsigned long do_brk(unsigned long addr,
 	if (error)
 		return error;
 
+
 	/*
 	 * mlock MCL_FUTURE?
 	 */
@@ -2069,6 +2161,7 @@ unsigned long do_brk(unsigned long addr,
 	 */
 	verify_mm_writelocked(mm);
 
+	invalidate_vma_cache_range(mm, addr, addr + len);
 	/*
 	 * Clear old maps.  this also does some error checking for us
 	 */
@@ -2245,6 +2338,7 @@ struct vm_area_struct *copy_vma(struct v
 				kmem_cache_free(vm_area_cachep, new_vma);
 				return NULL;
 			}
+			atomic_set(&new_vma->cache_access, 0);
 			vma_set_policy(new_vma, pol);
 			new_vma->vm_start = addr;
 			new_vma->vm_end = addr + len;
Index: mmotm-mm-accessor/mm/page_alloc.c
===================================================================
--- mmotm-mm-accessor.orig/mm/page_alloc.c
+++ mmotm-mm-accessor/mm/page_alloc.c
@@ -698,6 +698,7 @@ static int prep_new_page(struct page *pa
 
 	set_page_private(page, 0);
 	set_page_refcounted(page);
+	page->cache = NULL;
 
 	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
