lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20110811064059.GU6342@yookeroo.fritz.box>
Date:	Thu, 11 Aug 2011 16:40:59 +1000
From:	David Gibson <david@...son.dropbear.id.au>
To:	Linus Torvalds <torvalds@...l.org>
Cc:	Avi Kivity <avi@...hat.com>, Marcelo Tosatti <mtosatti@...hat.com>,
	Jan Kiszka <jan.kiszka@....de>, qemu-devel@...gnu.org,
	agraf@...e.de, kvm <kvm@...r.kernel.org>,
	Paul Mackerras <paulus@...ba.org>, linux-kernel@...r.kernel.org
Subject: Fix refcounting in hugetlbfs quota handling

Linus, please apply

hugetlbfs tracks the current usage of hugepages per hugetlbfs
mountpoint.  To correctly track this when hugepages are released, it
must find the right hugetlbfs super_block from the struct page
available in free_huge_page().

It does this by storing a pointer to the hugepage's struct
address_space in the page_private data.  The hugetlb_{get,put}_quota
functions go from this mapping to the inode and thence to the
super_block.

However, this usage is buggy, because nothing ensures that the
address_space is not freed before all the hugepages that belonged to
it are.  In practice that will usually be the case, but if extra page
references have been taken by e.g. drivers or kvm doing
get_user_pages() then the file, inode and address space may be
destroyed before all the pages.

In addition, the quota functions use the mapping only to get the inode
then the super_block.  However, most of the callers already have the
inode anyway and have to get the mapping from there.

This patch, therefore, stores a pointer to the inode instead of the
address_space in the page private data for hugepages.  More
importantly it correctly adjusts the reference count on the inodes
when they're added to the page private data.  This ensures that the
inode (and therefore the super block) will not be freed before we use
it from free_huge_page.

Signed-off-by: David Gibson <david@...son.dropbear.id.au>

Index: working-2.6/fs/hugetlbfs/inode.c
===================================================================
--- working-2.6.orig/fs/hugetlbfs/inode.c	2011-08-10 16:45:47.864758406 +1000
+++ working-2.6/fs/hugetlbfs/inode.c	2011-08-10 17:22:21.899638039 +1000
@@ -874,10 +874,10 @@ out_free:
 	return -ENOMEM;
 }
 
-int hugetlb_get_quota(struct address_space *mapping, long delta)
+int hugetlb_get_quota(struct inode *inode, long delta)
 {
 	int ret = 0;
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
 
 	if (sbinfo->free_blocks > -1) {
 		spin_lock(&sbinfo->stat_lock);
@@ -891,9 +891,9 @@ int hugetlb_get_quota(struct address_spa
 	return ret;
 }
 
-void hugetlb_put_quota(struct address_space *mapping, long delta)
+void hugetlb_put_quota(struct inode *inode, long delta)
 {
-	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
+	struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(inode->i_sb);
 
 	if (sbinfo->free_blocks > -1) {
 		spin_lock(&sbinfo->stat_lock);
Index: working-2.6/include/linux/hugetlb.h
===================================================================
--- working-2.6.orig/include/linux/hugetlb.h	2011-08-10 16:58:27.952527484 +1000
+++ working-2.6/include/linux/hugetlb.h	2011-08-10 17:22:08.723572707 +1000
@@ -171,8 +171,8 @@ extern const struct file_operations huge
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
 				struct user_struct **user, int creat_flags);
-int hugetlb_get_quota(struct address_space *mapping, long delta);
-void hugetlb_put_quota(struct address_space *mapping, long delta);
+int hugetlb_get_quota(struct inode *inode, long delta);
+void hugetlb_put_quota(struct inode *inode, long delta);
 
 static inline int is_file_hugepages(struct file *file)
 {
Index: working-2.6/mm/hugetlb.c
===================================================================
--- working-2.6.orig/mm/hugetlb.c	2011-08-10 16:44:12.212284092 +1000
+++ working-2.6/mm/hugetlb.c	2011-08-10 17:21:49.603477888 +1000
@@ -533,10 +533,12 @@ static void free_huge_page(struct page *
 	 */
 	struct hstate *h = page_hstate(page);
 	int nid = page_to_nid(page);
-	struct address_space *mapping;
+	struct inode *inode;
 
-	mapping = (struct address_space *) page_private(page);
+	inode = (struct inode *) page_private(page);
 	set_page_private(page, 0);
+	iput(inode);
+
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
 	BUG_ON(page_mapcount(page));
@@ -551,8 +553,8 @@ static void free_huge_page(struct page *
 		enqueue_huge_page(h, page);
 	}
 	spin_unlock(&hugetlb_lock);
-	if (mapping)
-		hugetlb_put_quota(mapping, 1);
+	if (inode)
+		hugetlb_put_quota(inode, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1035,7 +1037,7 @@ static struct page *alloc_huge_page(stru
 	if (chg < 0)
 		return ERR_PTR(-VM_FAULT_OOM);
 	if (chg)
-		if (hugetlb_get_quota(inode->i_mapping, chg))
+		if (hugetlb_get_quota(inode, chg))
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 
 	spin_lock(&hugetlb_lock);
@@ -1045,12 +1047,12 @@ static struct page *alloc_huge_page(stru
 	if (!page) {
 		page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
 		if (!page) {
-			hugetlb_put_quota(inode->i_mapping, chg);
+			hugetlb_put_quota(inode, chg);
 			return ERR_PTR(-VM_FAULT_SIGBUS);
 		}
 	}
 
-	set_page_private(page, (unsigned long) mapping);
+	set_page_private(page, (unsigned long) igrab(inode));
 
 	vma_commit_reservation(h, vma, addr);
 
@@ -2086,7 +2088,8 @@ static void hugetlb_vm_op_close(struct v
 
 		if (reserve) {
 			hugetlb_acct_memory(h, -reserve);
-			hugetlb_put_quota(vma->vm_file->f_mapping, reserve);
+			hugetlb_put_quota(vma->vm_file->f_mapping->host,
+					  reserve);
 		}
 	}
 }
@@ -2884,7 +2887,7 @@ int hugetlb_reserve_pages(struct inode *
 		return chg;
 
 	/* There must be enough filesystem quota for the mapping */
-	if (hugetlb_get_quota(inode->i_mapping, chg))
+	if (hugetlb_get_quota(inode, chg))
 		return -ENOSPC;
 
 	/*
@@ -2893,7 +2896,7 @@ int hugetlb_reserve_pages(struct inode *
 	 */
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
-		hugetlb_put_quota(inode->i_mapping, chg);
+		hugetlb_put_quota(inode, chg);
 		return ret;
 	}
 
@@ -2922,7 +2925,7 @@ void hugetlb_unreserve_pages(struct inod
 	inode->i_blocks -= (blocks_per_huge_page(h) * freed);
 	spin_unlock(&inode->i_lock);
 
-	hugetlb_put_quota(inode->i_mapping, (chg - freed));
+	hugetlb_put_quota(inode, (chg - freed));
 	hugetlb_acct_memory(h, -(chg - freed));
 }
 

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ