lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:	Sat, 6 Mar 2010 02:02:12 +0100
From:	Christian Ehrhardt <uni@...e.de>
To:	Rik van Riel <riel@...hat.com>
Cc:	Christian Ehrhardt <lk@...e.de>, linux-kernel@...r.kernel.org,
	akpm@...ux-foundation.org, linux-mm@...ck.org
Subject: Re: [PATCH] rmap: Fix Bugzilla Bug #5493


Hi Rik,

On Fri, Mar 05, 2010 at 09:10:53AM -0500, Rik van Riel wrote:
> Your patch will not apply against a current -mm, because it
> conflicts with my anon_vma linking patches (which attacks
> another "rmap walks too many vmas" failure mode).
> 
> Please rediff your patch against the latest -mm tree.

Conflict resolution turned out to be somewhat difficult. The following
is a patch against the -mm tree from git://zen-kernel.org/kernel/mmotm.git
as of today.

It compiles but I probably won't have time to test the -mm version
before next week.

There is one caveat: I had to remove the debugging patch from
mm/prio_tree.c because this check can no longer be done at that place.
If it is still required, it should be done in a macro inside the
callers that pass vmas and not anon_vma_chains to this function.

       regards     Christian

Signed-off-by: Christian Ehrhardt <lk@...e.de>

 arch/arm/mm/fault-armv.c   |    3 +-
 arch/arm/mm/flush.c        |    3 +-
 arch/parisc/kernel/cache.c |    3 +-
 arch/x86/mm/hugetlbpage.c  |    3 +-
 fs/hugetlbfs/inode.c       |    3 +-
 fs/inode.c                 |    2 +-
 include/linux/mm.h         |   28 ++++++--
 include/linux/mm_types.h   |   10 +---
 include/linux/prio_tree.h  |   17 ++++-
 include/linux/rmap.h       |   11 ++--
 kernel/fork.c              |    2 +-
 lib/prio_tree.c            |   14 +++-
 mm/filemap_xip.c           |    3 +-
 mm/fremap.c                |    2 +-
 mm/hugetlb.c               |    3 +-
 mm/ksm.c                   |   21 +++++-
 mm/memory-failure.c        |    9 ++-
 mm/memory.c                |    5 +-
 mm/mmap.c                  |   26 ++++----
 mm/nommu.c                 |   12 ++--
 mm/prio_tree.c             |  161 ++++++++++++++++++--------------------------
 mm/rmap.c                  |   38 +++++++---
 22 files changed, 208 insertions(+), 171 deletions(-)

diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c
index c9b97e9..4b8d01f 100644
--- a/arch/arm/mm/fault-armv.c
+++ b/arch/arm/mm/fault-armv.c
@@ -117,7 +117,8 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma,
 	 * cache coherency.
 	 */
 	flush_dcache_mmap_lock(mapping);
-	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(mpnt, struct vm_area_struct, shared, &iter,
+				&mapping->i_mmap, pgoff, pgoff) {
 		/*
 		 * If this VMA is not in our MM, we can ignore it.
 		 * Note that we intentionally mask out the VMA
diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
index e34f095..5264230 100644
--- a/arch/arm/mm/flush.c
+++ b/arch/arm/mm/flush.c
@@ -194,7 +194,8 @@ static void __flush_dcache_aliases(struct address_space *mapping, struct page *p
 	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 
 	flush_dcache_mmap_lock(mapping);
-	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(mpnt, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, pgoff, pgoff) {
 		unsigned long offset;
 
 		/*
diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c
index d054f3d..bf9890b 100644
--- a/arch/parisc/kernel/cache.c
+++ b/arch/parisc/kernel/cache.c
@@ -365,7 +365,8 @@ void flush_dcache_page(struct page *page)
 	 * to flush one address here for them all to become coherent */
 
 	flush_dcache_mmap_lock(mapping);
-	vma_prio_tree_foreach(mpnt, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(mpnt, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, pgoff, pgoff) {
 		offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT;
 		addr = mpnt->vm_start + offset;
 
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f46c340..f7a7954 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -74,7 +74,8 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 		return;
 
 	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+	vma_prio_tree_foreach(svma, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, idx, idx) {
 		if (svma == vma)
 			continue;
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d..1827552 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -400,7 +400,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff)
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
 
-	vma_prio_tree_foreach(vma, &iter, root, pgoff, ULONG_MAX) {
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter, root,
+						pgoff, ULONG_MAX) {
 		unsigned long v_offset;
 
 		/*
diff --git a/fs/inode.c b/fs/inode.c
index 2800597..3a27f74 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -262,7 +262,7 @@ void inode_init_once(struct inode *inode)
 	spin_lock_init(&inode->i_data.i_mmap_lock);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
+	INIT_SHARED_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
diff --git a/include/linux/mm.h b/include/linux/mm.h
index cb1144f..632d4c5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1200,15 +1200,29 @@ extern atomic_long_t mmap_pages_allocated;
 extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
 
 /* prio_tree.c */
-void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
-void vma_prio_tree_insert(struct vm_area_struct *, struct prio_tree_root *);
-void vma_prio_tree_remove(struct vm_area_struct *, struct prio_tree_root *);
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
+void vma_prio_tree_add(union vma_prio_tree_node *node,
+	union vma_prio_tree_node *old);
+void vma_prio_tree_insert(union vma_prio_tree_node *, struct prio_tree_root *);
+void vma_prio_tree_remove(union vma_prio_tree_node *, struct prio_tree_root *);
+union vma_prio_tree_node *vma_prio_tree_next(union vma_prio_tree_node *,
 	struct prio_tree_iter *iter);
 
-#define vma_prio_tree_foreach(vma, iter, root, begin, end)	\
-	for (prio_tree_iter_init(iter, root, begin, end), vma = NULL;	\
-		(vma = vma_prio_tree_next(vma, iter)); )
+#define vma_prio_tree_first_entry(iter, type, field) ({			\
+	union vma_prio_tree_node *__t;					\
+	__t = vma_prio_tree_next(NULL, iter);				\
+	__t ? prio_tree_entry(__t, type, field) : NULL;\
+})
+
+#define vma_prio_tree_next_entry(obj, iter, type, field) ({		\
+	union vma_prio_tree_node *__t;					\
+	__t = vma_prio_tree_next(&(obj)->field, iter);			\
+	__t ? prio_tree_entry(__t, type, field) : NULL;			\
+})
+
+#define vma_prio_tree_foreach(obj, type, field, iter, root, begin, end)	\
+	for (prio_tree_iter_init(iter, root, begin, end), /* single stmt */ \
+	     obj = vma_prio_tree_first_entry(iter, type, field); obj ;	\
+		(obj = vma_prio_tree_next_entry(obj, iter, type, field)))
 
 static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
 					struct list_head *list)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 048b462..06b74c1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -147,15 +147,7 @@ struct vm_area_struct {
 	 * linkage to the list of like vmas hanging off its node, or
 	 * linkage of vma in the address_space->i_mmap_nonlinear list.
 	 */
-	union {
-		struct {
-			struct list_head list;
-			void *parent;	/* aligns with prio_tree_node parent */
-			struct vm_area_struct *head;
-		} vm_set;
-
-		struct raw_prio_tree_node prio_tree_node;
-	} shared;
+	union vma_prio_tree_node shared;
 
 	/*
 	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
diff --git a/include/linux/prio_tree.h b/include/linux/prio_tree.h
index db04abb..ae683b2 100644
--- a/include/linux/prio_tree.h
+++ b/include/linux/prio_tree.h
@@ -25,13 +25,25 @@ struct prio_tree_node {
 	unsigned long		last;	/* last location _in_ interval */
 };
 
+union vma_prio_tree_node {
+	struct {
+		struct list_head list;
+		void *parent;
+		union vma_prio_tree_node *head;
+	} vm_set;
+	struct raw_prio_tree_node prio_tree_node;
+};
+
 struct prio_tree_root {
 	struct prio_tree_node	*prio_tree_node;
 	unsigned short 		index_bits;
 	unsigned short		raw;
 		/*
 		 * 0: nodes are of type struct prio_tree_node
-		 * 1: nodes are of type raw_prio_tree_node
+		 * 1: nodes are of type raw_prio_tree_node embedded in
+		 *    the shared field of a vm_area_struct.
+		 * 2: nodes are of type raw_prio_tree_node embedded in
+		 *    the same_anon_vma field of an anon_vma_chain.
 		 */
 };
 
@@ -63,7 +75,8 @@ do {					\
 } while (0)
 
 #define INIT_PRIO_TREE_ROOT(ptr)	__INIT_PRIO_TREE_ROOT(ptr, 0)
-#define INIT_RAW_PRIO_TREE_ROOT(ptr)	__INIT_PRIO_TREE_ROOT(ptr, 1)
+#define INIT_SHARED_PRIO_TREE_ROOT(ptr)	__INIT_PRIO_TREE_ROOT(ptr, 1)
+#define INIT_ANON_PRIO_TREE_ROOT(ptr)	__INIT_PRIO_TREE_ROOT(ptr, 2)
 
 #define INIT_PRIO_TREE_NODE(ptr)				\
 do {								\
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index d25bd22..9e9a521 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/spinlock.h>
+#include <linux/prio_tree.h>
 #include <linux/memcontrol.h>
 
 /*
@@ -30,14 +31,14 @@ struct anon_vma {
 	atomic_t ksm_refcount;
 #endif
 	/*
-	 * NOTE: the LSB of the head.next is set by
+	 * NOTE: the LSB of the head.prio_tree_node is set by
 	 * mm_take_all_locks() _after_ taking the above lock. So the
 	 * head must only be read/written after taking the above lock
-	 * to be sure to see a valid next pointer. The LSB bit itself
-	 * is serialized by a system wide lock only visible to
+	 * to be sure to see a valid prio_tree_node pointer. The LSB bit
+	 * itself is serialized by a system wide lock only visible to
 	 * mm_take_all_locks() (mm_all_locks_mutex).
 	 */
-	struct list_head head;	/* Chain of private "related" vmas */
+	struct prio_tree_root head;
 };
 
 /*
@@ -57,7 +58,7 @@ struct anon_vma_chain {
 	struct vm_area_struct *vma;
 	struct anon_vma *anon_vma;
 	struct list_head same_vma;   /* locked by mmap_sem & page_table_lock */
-	struct list_head same_anon_vma;	/* locked by anon_vma->lock */
+	union vma_prio_tree_node same_anon_vma; /* locked by anon_vma->lock */
 };
 
 #ifdef CONFIG_MMU
diff --git a/kernel/fork.c b/kernel/fork.c
index b54abc4..51b16ea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -355,7 +355,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			tmp->vm_truncate_count = mpnt->vm_truncate_count;
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
-			vma_prio_tree_add(tmp, mpnt);
+			vma_prio_tree_add(&tmp->shared, &mpnt->shared);
 			flush_dcache_mmap_unlock(mapping);
 			spin_unlock(&mapping->i_mmap_lock);
 		}
diff --git a/lib/prio_tree.c b/lib/prio_tree.c
index ccfd850..1d48709 100644
--- a/lib/prio_tree.c
+++ b/lib/prio_tree.c
@@ -14,6 +14,7 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/prio_tree.h>
+#include <linux/rmap.h>
 
 /*
  * A clever mix of heap and radix trees forms a radix priority search tree (PST)
@@ -53,14 +54,21 @@ static void get_index(const struct prio_tree_root *root,
     const struct prio_tree_node *node,
     unsigned long *radix, unsigned long *heap)
 {
-	if (root->raw) {
+	if (root->raw == 1) {
 		struct vm_area_struct *vma = prio_tree_entry(
 		    node, struct vm_area_struct, shared.prio_tree_node);
 
 		*radix = RADIX_INDEX(vma);
 		*heap = HEAP_INDEX(vma);
-	}
-	else {
+	} else if (root->raw == 2) {
+		struct vm_area_struct *vma;
+
+		vma  = prio_tree_entry(node, struct anon_vma_chain,
+		    same_anon_vma.prio_tree_node)->vma;
+
+		*radix = RADIX_INDEX(vma);
+		*heap = HEAP_INDEX(vma);
+	} else {
 		*radix = node->start;
 		*heap = node->last;
 	}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 78b94f0..f0e36fe 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -183,7 +183,8 @@ __xip_unmap (struct address_space * mapping,
 
 retry:
 	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, pgoff, pgoff) {
 		mm = vma->vm_mm;
 		address = vma->vm_start +
 			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
diff --git a/mm/fremap.c b/mm/fremap.c
index 46f5dac..dd0853c 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -211,7 +211,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		spin_lock(&mapping->i_mmap_lock);
 		flush_dcache_mmap_lock(mapping);
 		vma->vm_flags |= VM_NONLINEAR;
-		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_prio_tree_remove(&vma->shared, &mapping->i_mmap);
 		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		flush_dcache_mmap_unlock(mapping);
 		spin_unlock(&mapping->i_mmap_lock);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 3a5aeb3..bbe3c0e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2245,7 +2245,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * __unmap_hugepage_range() is called as the lock is already held
 	 */
 	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(iter_vma, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, pgoff, pgoff) {
 		/* Do not unmap the current VMA */
 		if (iter_vma == vma)
 			continue;
diff --git a/mm/ksm.c b/mm/ksm.c
index a93f1b7..2eded1e 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -326,7 +326,7 @@ static void drop_anon_vma(struct rmap_item *rmap_item)
 	struct anon_vma *anon_vma = rmap_item->anon_vma;
 
 	if (atomic_dec_and_lock(&anon_vma->ksm_refcount, &anon_vma->lock)) {
-		int empty = list_empty(&anon_vma->head);
+		int empty = prio_tree_empty(&anon_vma->head);
 		spin_unlock(&anon_vma->lock);
 		if (empty)
 			anon_vma_free(anon_vma);
@@ -1562,12 +1562,17 @@ int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg,
 		return 0;
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+		pgoff_t pgoff = rmap_item->address >> PAGE_SHIFT;
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
+		struct prio_tree_iter iter;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+
+		vma_prio_tree_foreach(vmac, struct anon_vma_chain,
+				      same_anon_vma, &iter,
+				      &anon_vma->head, pgoff, pgoff) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
@@ -1615,12 +1620,16 @@ int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 		return SWAP_FAIL;
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+		pgoff_t pgoff = rmap_item->address >> PAGE_SHIFT;
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
+		struct prio_tree_iter iter;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+		vma_prio_tree_foreach(vmac, struct anon_vma_chain,
+				      same_anon_vma, &iter,
+				      &anon_vma->head, pgoff, pgoff) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
@@ -1667,12 +1676,16 @@ int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
 		return ret;
 again:
 	hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) {
+		pgoff_t pgoff = rmap_item->address >> PAGE_SHIFT;
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
 		struct anon_vma_chain *vmac;
 		struct vm_area_struct *vma;
+		struct prio_tree_iter iter;
 
 		spin_lock(&anon_vma->lock);
-		list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
+		vma_prio_tree_foreach(vmac, struct anon_vma_chain,
+				      same_anon_vma, &iter,
+				      &anon_vma->head, pgoff, pgoff) {
 			vma = vmac->vma;
 			if (rmap_item->address < vma->vm_start ||
 			    rmap_item->address >= vma->vm_end)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index d1f3351..9ebe34c 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -383,11 +383,14 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	if (av == NULL)	/* Not actually mapped anymore */
 		goto out;
 	for_each_process (tsk) {
+		struct prio_tree_iter iter;
+		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 		struct anon_vma_chain *vmac;
 
 		if (!task_early_kill(tsk))
 			continue;
-		list_for_each_entry(vmac, &av->head, same_anon_vma) {
+		vma_prio_tree_foreach(vmac, struct anon_vma_chain,
+			      same_anon_vma, &iter, &av->head, pgoff, pgoff) {
 			vma = vmac->vma;
 			if (!page_mapped_in_vma(page, vma))
 				continue;
@@ -428,8 +431,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 		if (!task_early_kill(tsk))
 			continue;
 
-		vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff,
-				      pgoff) {
+		vma_prio_tree_foreach(vma, struct vm_area_struct, shared,
+				      &iter, &mapping->i_mmap, pgoff, pgoff) {
 			/*
 			 * Send early kill signal to tasks where a vma covers
 			 * the page but the corrupted page is not necessarily
diff --git a/mm/memory.c b/mm/memory.c
index f531087..af760c1 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2411,7 +2411,8 @@ static void reset_vma_truncate_counts(struct address_space *mapping)
 	struct vm_area_struct *vma;
 	struct prio_tree_iter iter;
 
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared,
+				&iter, &mapping->i_mmap, 0, ULONG_MAX)
 		vma->vm_truncate_count = 0;
 	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 		vma->vm_truncate_count = 0;
@@ -2472,7 +2473,7 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
 	pgoff_t vba, vea, zba, zea;
 
 restart:
-	vma_prio_tree_foreach(vma, &iter, root,
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter, root,
 			details->first_index, details->last_index) {
 		/* Skip quickly over those we have already dealt with */
 		if (vma->vm_truncate_count == details->truncate_count)
diff --git a/mm/mmap.c b/mm/mmap.c
index 6cfd507..4497e79 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -207,7 +207,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 	if (unlikely(vma->vm_flags & VM_NONLINEAR))
 		list_del_init(&vma->shared.vm_set.list);
 	else
-		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_prio_tree_remove(&vma->shared, &mapping->i_mmap);
 	flush_dcache_mmap_unlock(mapping);
 }
 
@@ -430,7 +430,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
 		if (unlikely(vma->vm_flags & VM_NONLINEAR))
 			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
 		else
-			vma_prio_tree_insert(vma, &mapping->i_mmap);
+			vma_prio_tree_insert(&vma->shared, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 	}
 }
@@ -593,9 +593,9 @@ again:			remove_next = 1 + (end > next->vm_end);
 
 	if (root) {
 		flush_dcache_mmap_lock(mapping);
-		vma_prio_tree_remove(vma, root);
+		vma_prio_tree_remove(&vma->shared, root);
 		if (adjust_next)
-			vma_prio_tree_remove(next, root);
+			vma_prio_tree_remove(&next->shared, root);
 	}
 
 	vma->vm_start = start;
@@ -608,8 +608,8 @@ again:			remove_next = 1 + (end > next->vm_end);
 
 	if (root) {
 		if (adjust_next)
-			vma_prio_tree_insert(next, root);
-		vma_prio_tree_insert(vma, root);
+			vma_prio_tree_insert(&next->shared, root);
+		vma_prio_tree_insert(&vma->shared, root);
 		flush_dcache_mmap_unlock(mapping);
 	}
 
@@ -866,7 +866,7 @@ try_prev:
 	 * It is potentially slow to have to call find_vma_prev here.
 	 * But it's only on the first write fault on the vma, not
 	 * every time, and we could devise a way to avoid it later
-	 * (e.g. stash info in next's anon_vma_node when assigning
+	 * (e.g. stash info in next's anon node when assigning
 	 * an anon_vma, or when trying vma_merge).  Another time.
 	 */
 	BUG_ON(find_vma_prev(vma->vm_mm, vma->vm_start, &near) != vma);
@@ -2440,7 +2440,7 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
 
 static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 {
-	if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+	if (!test_bit(0, (unsigned long *) &anon_vma->head.prio_tree_node)) {
 		/*
 		 * The LSB of head.next can't change from under us
 		 * because we hold the mm_all_locks_mutex.
@@ -2456,7 +2456,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 		 * anon_vma->lock.
 		 */
 		if (__test_and_set_bit(0, (unsigned long *)
-				       &anon_vma->head.next))
+				       &anon_vma->head.prio_tree_node))
 			BUG();
 	}
 }
@@ -2497,8 +2497,8 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  * A single task can't take more than one mm_take_all_locks() in a row
  * or it would deadlock.
  *
- * The LSB in anon_vma->head.next and the AS_MM_ALL_LOCKS bitflag in
- * mapping->flags avoid to take the same lock twice, if more than one
+ * The LSB in anon_vma->head.prio_tree_node and the AS_MM_ALL_LOCKS bitflag
+ * in mapping->flags avoid to take the same lock twice, if more than one
  * vma in this mm is backed by the same anon_vma or address_space.
  *
  * We can take all the locks in random order because the VM code
@@ -2547,7 +2547,7 @@ out_unlock:
 
 static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 {
-	if (test_bit(0, (unsigned long *) &anon_vma->head.next)) {
+	if (test_bit(0, (unsigned long *) &anon_vma->head.prio_tree_node)) {
 		/*
 		 * The LSB of head.next can't change to 0 from under
 		 * us because we hold the mm_all_locks_mutex.
@@ -2561,7 +2561,7 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
 		 * anon_vma->lock.
 		 */
 		if (!__test_and_clear_bit(0, (unsigned long *)
-					  &anon_vma->head.next))
+					  &anon_vma->head.prio_tree_node))
 			BUG();
 		spin_unlock(&anon_vma->lock);
 	}
diff --git a/mm/nommu.c b/mm/nommu.c
index 605ace8..aa4f77a 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -627,7 +627,7 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
 		mapping = vma->vm_file->f_mapping;
 
 		flush_dcache_mmap_lock(mapping);
-		vma_prio_tree_insert(vma, &mapping->i_mmap);
+		vma_prio_tree_insert(&vma->shared, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 	}
 
@@ -695,7 +695,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma)
 		mapping = vma->vm_file->f_mapping;
 
 		flush_dcache_mmap_lock(mapping);
-		vma_prio_tree_remove(vma, &mapping->i_mmap);
+		vma_prio_tree_remove(&vma->shared, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 	}
 
@@ -1965,8 +1965,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	down_write(&nommu_region_sem);
 
 	/* search for VMAs that fall within the dead zone */
-	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-			      low, high) {
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+				&inode->i_mapping->i_mmap, low, high) {
 		/* found one - only interested if it's shared out of the page
 		 * cache */
 		if (vma->vm_flags & VM_SHARED) {
@@ -1981,8 +1981,8 @@ int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
 	 * we don't check for any regions that start beyond the EOF as there
 	 * shouldn't be any
 	 */
-	vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap,
-			      0, ULONG_MAX) {
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+				&inode->i_mapping->i_mmap, 0, ULONG_MAX) {
 		if (!(vma->vm_flags & VM_SHARED))
 			continue;
 
diff --git a/mm/prio_tree.c b/mm/prio_tree.c
index c297a46..8e9194f 100644
--- a/mm/prio_tree.c
+++ b/mm/prio_tree.c
@@ -67,114 +67,85 @@
  * 	vma->shared.vm_set.head == NULL ==> a list node
  */
 
-static void dump_vma(struct vm_area_struct *vma)
-{
-	void **ptr = (void **) vma;
-	int i;
-
-	printk("vm_area_struct at %p:", ptr);
-	for (i = 0; i < sizeof(*vma)/sizeof(*ptr); i++, ptr++) {
-		if (!(i & 3))
-			printk("\n");
-		printk(" %p", *ptr);
-	}
-	printk("\n");
-}
-
 /*
  * Add a new vma known to map the same set of pages as the old vma:
  * useful for fork's dup_mmap as well as vma_prio_tree_insert below.
  * Note that it just happens to work correctly on i_mmap_nonlinear too.
  */
-void vma_prio_tree_add(struct vm_area_struct *vma, struct vm_area_struct *old)
+void vma_prio_tree_add(union vma_prio_tree_node *node,
+		       union vma_prio_tree_node *old)
 {
-	vma->shared.vm_set.head = NULL;
-	vma->shared.vm_set.parent = NULL;
-
-	if (WARN_ON(RADIX_INDEX(vma) != RADIX_INDEX(old) ||
-		    HEAP_INDEX(vma)  != HEAP_INDEX(old))) {
-		/*
-		 * This should never happen, yet it has been seen a few times:
-		 * we cannot say much about it without seeing the vma contents.
-		 */
-		dump_vma(vma);
-		dump_vma(old);
-		/*
-		 * Don't try to link this (corrupt?) vma into the (corrupt?)
-		 * prio_tree, but arrange for its removal to succeed later.
-		 */
-		INIT_LIST_HEAD(&vma->shared.vm_set.list);
-	} else if (!old->shared.vm_set.parent)
-		list_add(&vma->shared.vm_set.list,
-				&old->shared.vm_set.list);
-	else if (old->shared.vm_set.head)
-		list_add_tail(&vma->shared.vm_set.list,
-				&old->shared.vm_set.head->shared.vm_set.list);
+	node->vm_set.head = NULL;
+	node->vm_set.parent = NULL;
+
+	if (!old->vm_set.parent)
+		list_add(&node->vm_set.list, &old->vm_set.list);
+	else if (old->vm_set.head)
+		list_add_tail(&node->vm_set.list,
+				&old->vm_set.head->vm_set.list);
 	else {
-		INIT_LIST_HEAD(&vma->shared.vm_set.list);
-		vma->shared.vm_set.head = old;
-		old->shared.vm_set.head = vma;
+		INIT_LIST_HEAD(&node->vm_set.list);
+		node->vm_set.head = old;
+		old->vm_set.head = node;
 	}
 }
 
-void vma_prio_tree_insert(struct vm_area_struct *vma,
+void vma_prio_tree_insert(union vma_prio_tree_node *node,
 			  struct prio_tree_root *root)
 {
 	struct prio_tree_node *ptr;
-	struct vm_area_struct *old;
+	union vma_prio_tree_node *old;
 
-	vma->shared.vm_set.head = NULL;
+	node->vm_set.head = NULL;
 
-	ptr = raw_prio_tree_insert(root, &vma->shared.prio_tree_node);
-	if (ptr != (struct prio_tree_node *) &vma->shared.prio_tree_node) {
-		old = prio_tree_entry(ptr, struct vm_area_struct,
-					shared.prio_tree_node);
-		vma_prio_tree_add(vma, old);
+	ptr = raw_prio_tree_insert(root, &node->prio_tree_node);
+	if (ptr != (struct prio_tree_node *) &node->prio_tree_node) {
+		old = prio_tree_entry(ptr, union vma_prio_tree_node,
+				prio_tree_node);
+		vma_prio_tree_add(node, old);
 	}
 }
 
-void vma_prio_tree_remove(struct vm_area_struct *vma,
+void vma_prio_tree_remove(union vma_prio_tree_node *target,
 			  struct prio_tree_root *root)
 {
-	struct vm_area_struct *node, *head, *new_head;
+	union vma_prio_tree_node *node, *head, *new_head;
 
-	if (!vma->shared.vm_set.head) {
-		if (!vma->shared.vm_set.parent)
-			list_del_init(&vma->shared.vm_set.list);
+	if (!target->vm_set.head) {
+		if (!target->vm_set.parent)
+			list_del_init(&target->vm_set.list);
 		else
-			raw_prio_tree_remove(root, &vma->shared.prio_tree_node);
+			raw_prio_tree_remove(root, &target->prio_tree_node);
 	} else {
 		/* Leave this BUG_ON till prio_tree patch stabilizes */
-		BUG_ON(vma->shared.vm_set.head->shared.vm_set.head != vma);
-		if (vma->shared.vm_set.parent) {
-			head = vma->shared.vm_set.head;
-			if (!list_empty(&head->shared.vm_set.list)) {
-				new_head = list_entry(
-					head->shared.vm_set.list.next,
-					struct vm_area_struct,
-					shared.vm_set.list);
-				list_del_init(&head->shared.vm_set.list);
+		BUG_ON(target->vm_set.head->vm_set.head != target);
+		if (target->vm_set.parent) {
+			head = target->vm_set.head;
+			if (!list_empty(&head->vm_set.list)) {
+				new_head = prio_tree_entry(
+					head->vm_set.list.next,
+					union vma_prio_tree_node, vm_set.list);
+				list_del_init(&head->vm_set.list);
 			} else
 				new_head = NULL;
 
-			raw_prio_tree_replace(root, &vma->shared.prio_tree_node,
-					&head->shared.prio_tree_node);
-			head->shared.vm_set.head = new_head;
+			raw_prio_tree_replace(root, &target->prio_tree_node,
+					&head->prio_tree_node);
+			head->vm_set.head = new_head;
 			if (new_head)
-				new_head->shared.vm_set.head = head;
+				new_head->vm_set.head = head;
 
 		} else {
-			node = vma->shared.vm_set.head;
-			if (!list_empty(&vma->shared.vm_set.list)) {
-				new_head = list_entry(
-					vma->shared.vm_set.list.next,
-					struct vm_area_struct,
-					shared.vm_set.list);
-				list_del_init(&vma->shared.vm_set.list);
-				node->shared.vm_set.head = new_head;
-				new_head->shared.vm_set.head = node;
+			node = target->vm_set.head;
+			if (!list_empty(&target->vm_set.list)) {
+				new_head = prio_tree_entry(
+					target->vm_set.list.next,
+					union vma_prio_tree_node, vm_set.list);
+				list_del_init(&target->vm_set.list);
+				node->vm_set.head = new_head;
+				new_head->vm_set.head = node;
 			} else
-				node->shared.vm_set.head = NULL;
+				node->vm_set.head = NULL;
 		}
 	}
 }
@@ -184,46 +155,46 @@ void vma_prio_tree_remove(struct vm_area_struct *vma,
  * contiguous file pages. The function returns vmas that at least map a single
  * page in the given range of contiguous file pages.
  */
-struct vm_area_struct *vma_prio_tree_next(struct vm_area_struct *vma,
+union vma_prio_tree_node *vma_prio_tree_next(union vma_prio_tree_node *node,
 					struct prio_tree_iter *iter)
 {
 	struct prio_tree_node *ptr;
-	struct vm_area_struct *next;
+	union vma_prio_tree_node *next;
 
-	if (!vma) {
+	if (!node) {
 		/*
-		 * First call is with NULL vma
+		 * First call is with NULL node
 		 */
 		ptr = prio_tree_next(iter);
 		if (ptr) {
-			next = prio_tree_entry(ptr, struct vm_area_struct,
-						shared.prio_tree_node);
-			prefetch(next->shared.vm_set.head);
+			next = prio_tree_entry(ptr, union vma_prio_tree_node,
+						prio_tree_node);
+			prefetch(next->vm_set.head);
 			return next;
 		} else
 			return NULL;
 	}
 
-	if (vma->shared.vm_set.parent) {
-		if (vma->shared.vm_set.head) {
-			next = vma->shared.vm_set.head;
-			prefetch(next->shared.vm_set.list.next);
+	if (node->vm_set.parent) {
+		if (node->vm_set.head) {
+			next = node->vm_set.head;
+			prefetch(next->vm_set.list.next);
 			return next;
 		}
 	} else {
-		next = list_entry(vma->shared.vm_set.list.next,
-				struct vm_area_struct, shared.vm_set.list);
-		if (!next->shared.vm_set.head) {
-			prefetch(next->shared.vm_set.list.next);
+		next = list_entry(node->vm_set.list.next,
+				union vma_prio_tree_node, vm_set.list);
+		if (!next->vm_set.head) {
+			prefetch(next->vm_set.list.next);
 			return next;
 		}
 	}
 
 	ptr = prio_tree_next(iter);
 	if (ptr) {
-		next = prio_tree_entry(ptr, struct vm_area_struct,
-					shared.prio_tree_node);
-		prefetch(next->shared.vm_set.head);
+		next = prio_tree_entry(ptr, union vma_prio_tree_node,
+					prio_tree_node);
+		prefetch(next->vm_set.head);
 		return next;
 	} else
 		return NULL;
diff --git a/mm/rmap.c b/mm/rmap.c
index fcd593c..34391d4 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -142,7 +142,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
 			avc->anon_vma = anon_vma;
 			avc->vma = vma;
 			list_add(&avc->same_vma, &vma->anon_vma_chain);
-			list_add(&avc->same_anon_vma, &anon_vma->head);
+			vma_prio_tree_insert(&avc->same_anon_vma,
+						&anon_vma->head);
 			allocated = NULL;
 		}
 		spin_unlock(&mm->page_table_lock);
@@ -170,7 +171,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
 	list_add(&avc->same_vma, &vma->anon_vma_chain);
 
 	spin_lock(&anon_vma->lock);
-	list_add_tail(&avc->same_anon_vma, &anon_vma->head);
+	vma_prio_tree_insert(&avc->same_anon_vma, &anon_vma->head);
 	spin_unlock(&anon_vma->lock);
 }
 
@@ -245,10 +246,10 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
 		return;
 
 	spin_lock(&anon_vma->lock);
-	list_del(&anon_vma_chain->same_anon_vma);
+	vma_prio_tree_remove(&anon_vma_chain->same_anon_vma, &anon_vma->head);
 
 	/* We must garbage collect the anon_vma if it's empty */
-	empty = list_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
+	empty = prio_tree_empty(&anon_vma->head) && !ksm_refcount(anon_vma);
 	spin_unlock(&anon_vma->lock);
 
 	if (empty)
@@ -273,7 +274,7 @@ static void anon_vma_ctor(void *data)
 
 	spin_lock_init(&anon_vma->lock);
 	ksm_refcount_init(anon_vma);
-	INIT_LIST_HEAD(&anon_vma->head);
+	INIT_ANON_PRIO_TREE_ROOT(&anon_vma->head);
 }
 
 void __init anon_vma_init(void)
@@ -483,9 +484,11 @@ static int page_referenced_anon(struct page *page,
 				struct mem_cgroup *mem_cont,
 				unsigned long *vm_flags)
 {
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	unsigned int mapcount;
 	struct anon_vma *anon_vma;
 	struct anon_vma_chain *avc;
+	struct prio_tree_iter iter;
 	int referenced = 0;
 
 	anon_vma = page_lock_anon_vma(page);
@@ -493,7 +496,8 @@ static int page_referenced_anon(struct page *page,
 		return referenced;
 
 	mapcount = page_mapcount(page);
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	vma_prio_tree_foreach(avc, struct anon_vma_chain, same_anon_vma,
+				&iter, &anon_vma->head, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
@@ -562,7 +566,8 @@ static int page_referenced_file(struct page *page,
 	 */
 	mapcount = page_mapcount(page);
 
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
@@ -667,7 +672,8 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page)
 	BUG_ON(PageAnon(page));
 
 	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, pgoff, pgoff) {
 		if (vma->vm_flags & VM_SHARED) {
 			unsigned long address = vma_address(page, vma);
 			if (address == -EFAULT)
@@ -1132,15 +1138,18 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
  */
 static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
 {
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct anon_vma *anon_vma;
 	struct anon_vma_chain *avc;
+	struct prio_tree_iter iter;
 	int ret = SWAP_AGAIN;
 
 	anon_vma = page_lock_anon_vma(page);
 	if (!anon_vma)
 		return ret;
 
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	vma_prio_tree_foreach(avc, struct anon_vma_chain, same_anon_vma,
+				&iter, &anon_vma->head, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
@@ -1182,7 +1191,8 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 	unsigned int mapcount;
 
 	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;
@@ -1331,9 +1341,11 @@ int try_to_munlock(struct page *page)
 static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 		struct vm_area_struct *, unsigned long, void *), void *arg)
 {
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	struct anon_vma *anon_vma;
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
+	struct prio_tree_iter iter;
 
 	/*
 	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma()
@@ -1347,7 +1359,8 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	if (!anon_vma)
 		return ret;
 	spin_lock(&anon_vma->lock);
-	list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
+	vma_prio_tree_foreach(avc, struct anon_vma_chain, same_anon_vma,
+				&iter, &anon_vma->head, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
@@ -1372,7 +1385,8 @@ static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
 	if (!mapping)
 		return ret;
 	spin_lock(&mapping->i_mmap_lock);
-	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
+	vma_prio_tree_foreach(vma, struct vm_area_struct, shared, &iter,
+					&mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		if (address == -EFAULT)
 			continue;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ