Message-ID: <alpine.LSU.2.11.1502202016420.14414@eggly.anvils>
Date: Fri, 20 Feb 2015 20:18:11 -0800 (PST)
From: Hugh Dickins <hughd@...gle.com>
To: "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
cc: Andrea Arcangeli <aarcange@...hat.com>, Ning Qu <quning@...il.com>,
Andrew Morton <akpm@...ux-foundation.org>,
linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: [PATCH 17/24] huge tmpfs: map shmem by huge page pmd or by page team ptes

This is the commit which at last gets huge mappings of tmpfs working,
as can be seen from the ShmemPmdMapped line of /proc/meminfo.
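
Here, for anyone who wants to see the effect, is a minimal userspace
sketch (not part of the patch: the mount point /mnt/hugetmpfs and the
file name are only illustrative, and whether a huge pmd is actually
used still depends on the mount options and on getting a suitably
aligned mapping). It maps a file on a huge-page tmpfs mount, touches
it, and prints the ShmemPmdMapped line before and after:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HUGE_SIZE (2UL << 20)   /* one pmd's worth on x86_64 */

/* Print the ShmemPmdMapped line of /proc/meminfo, if it is there */
static void show_pmdmapped(void)
{
        char line[128];
        FILE *f = fopen("/proc/meminfo", "r");

        while (f && fgets(line, sizeof(line), f))
                if (!strncmp(line, "ShmemPmdMapped:", 15))
                        fputs(line, stdout);
        if (f)
                fclose(f);
}

int main(void)
{
        int fd = open("/mnt/hugetmpfs/test", O_CREAT | O_RDWR, 0600);
        char *p;

        if (fd < 0 || ftruncate(fd, HUGE_SIZE))
                return 1;
        show_pmdmapped();
        p = mmap(NULL, HUGE_SIZE, PROT_READ | PROT_WRITE,
                 MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        p[0] = 1;       /* write fault: may now be mapped by huge pmd */
        show_pmdmapped();
        return 0;
}
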
The main thing here is the trio of functions map_team_by_pmd(),
unmap_team_by_pmd() and remap_team_by_ptes() added to huge_memory.c;
and of course the enablement of FAULT_FLAG_MAY_HUGE from memory.c
to shmem.c, with VM_FAULT_HUGE back from shmem.c to memory.c. But
there are also one-line and few-line changes scattered throughout
huge_memory.c.
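
For reference, the shmem.c side of that handshake (which is in another
patch of this series, and is only sketched here: example_shmem_fault()
and shmem_lookup_team_head() are hypothetical names, not the real
code) behaves roughly like this:

static int example_shmem_fault(struct vm_area_struct *vma,
                               struct vm_fault *vmf)
{
        struct page *head;

        if (vmf->flags & FAULT_FLAG_MAY_HUGE) {
                /* Hypothetical helper: is a whole team built here? */
                head = shmem_lookup_team_head(vma, vmf->pgoff);
                if (head) {
                        /*
                         * Hand the referenced, locked team head back to
                         * __do_fault(), which calls map_team_by_pmd() to
                         * install the huge pmd: that unlocks the page, and
                         * either keeps the reference for the mapping or
                         * releases it if it loses a race.
                         */
                        get_page(head);
                        lock_page(head);
                        vmf->page = head;
                        return VM_FAULT_HUGE;
                }
        }
        /* ... otherwise the usual small-page fault path, as before ... */
        return example_shmem_small_fault(vma, vmf);     /* placeholder */
}
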
Huge tmpfs relies on the pmd_trans_huge() page table hooks which the
original Anonymous THP project placed throughout mm, but skips almost
all of its complications, going with its own simpler handling.

One odd little change: removal of the VM_NOHUGEPAGE check from
move_huge_pmd(). That's a helper for mremap() move: the new_vma
should be following the same rules as the old vma, so if there's a
trans_huge pmd in the old vma, then it can go in the new, alignment
permitting. It was a very minor optimization for Anonymous THP; but
now we can reach the same code for huge tmpfs, which nowhere else
respects VM_NOHUGEPAGE (whether it should is a different question;
but for now it's simplest to ignore all the various THP switches).

Signed-off-by: Hugh Dickins <hughd@...gle.com>
---
include/linux/pageteam.h | 41 ++++++
mm/huge_memory.c | 238 ++++++++++++++++++++++++++++++++++---
mm/memory.c | 11 +
3 files changed, 273 insertions(+), 17 deletions(-)
--- thpfs.orig/include/linux/pageteam.h 2015-02-20 19:34:37.851932430 -0800
+++ thpfs/include/linux/pageteam.h 2015-02-20 19:34:48.083909034 -0800
@@ -29,10 +29,49 @@ static inline struct page *team_head(str
return head;
}
-/* Temporary stub for mm/rmap.c until implemented in mm/huge_memory.c */
+/*
+ * Returns true if this team is mapped by pmd somewhere.
+ */
+static inline bool team_hugely_mapped(struct page *head)
+{
+ return atomic_long_read(&head->team_usage) > HPAGE_PMD_NR;
+}
+
+/*
+ * Returns true if this was the first mapping by pmd, whereupon mapped stats
+ * need to be updated.
+ */
+static inline bool inc_hugely_mapped(struct page *head)
+{
+ return atomic_long_inc_return(&head->team_usage) == HPAGE_PMD_NR+1;
+}
+
+/*
+ * Returns true if this was the last mapping by pmd, whereupon mapped stats
+ * need to be updated.
+ */
+static inline bool dec_hugely_mapped(struct page *head)
+{
+ return atomic_long_dec_return(&head->team_usage) == HPAGE_PMD_NR;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+int map_team_by_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd, struct page *page);
+void unmap_team_by_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd, struct page *page);
+#else
+static inline int map_team_by_pmd(struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmd, struct page *page)
+{
+ VM_BUG_ON_PAGE(1, page);
+ return 0;
+}
static inline void unmap_team_by_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmd, struct page *page)
{
+ VM_BUG_ON_PAGE(1, page);
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_PAGETEAM_H */
--- thpfs.orig/mm/huge_memory.c 2015-02-20 19:34:32.367944969 -0800
+++ thpfs/mm/huge_memory.c 2015-02-20 19:34:48.083909034 -0800
@@ -21,6 +21,7 @@
#include <linux/freezer.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
+#include <linux/pageteam.h>
#include <linux/migrate.h>
#include <linux/hashtable.h>
@@ -28,6 +29,10 @@
#include <asm/pgalloc.h>
#include "internal.h"
+static void page_remove_team_rmap(struct page *);
+static void remap_team_by_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, struct page *page);
+
/*
* By default transparent hugepage support is disabled in order that avoid
* to risk increase the memory footprint of applications without a guaranteed
@@ -901,13 +906,19 @@ int copy_huge_pmd(struct mm_struct *dst_
goto out;
}
src_page = pmd_page(pmd);
- VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
get_page(src_page);
page_dup_rmap(src_page);
- add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-
- pmdp_set_wrprotect(src_mm, addr, src_pmd);
- pmd = pmd_mkold(pmd_wrprotect(pmd));
+ if (PageAnon(src_page)) {
+ VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+ pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ pmd = pmd_wrprotect(pmd);
+ } else {
+ VM_BUG_ON_PAGE(!PageTeam(src_page), src_page);
+ inc_hugely_mapped(src_page);
+ }
+ add_mm_counter(dst_mm, PageAnon(src_page) ?
+ MM_ANONPAGES : MM_FILEPAGES, HPAGE_PMD_NR);
+ pmd = pmd_mkold(pmd);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
atomic_long_inc(&dst_mm->nr_ptes);
@@ -1088,22 +1099,28 @@ int do_huge_pmd_wp_page(struct mm_struct
{
spinlock_t *ptl;
int ret = 0;
- struct page *page = NULL, *new_page;
+ struct page *page, *new_page;
struct mem_cgroup *memcg;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
ptl = pmd_lockptr(mm, pmd);
- VM_BUG_ON_VMA(!vma->anon_vma, vma);
haddr = address & HPAGE_PMD_MASK;
- if (is_huge_zero_pmd(orig_pmd))
+ page = pmd_page(orig_pmd);
+ if (is_huge_zero_page(page)) {
+ page = NULL;
goto alloc;
+ }
+ if (!PageAnon(page)) {
+ remap_team_by_ptes(vma, address, pmd, page);
+ /* Let's just take another fault to do the COW */
+ return 0;
+ }
spin_lock(ptl);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
- page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
if (page_mapcount(page) == 1) {
pmd_t entry;
@@ -1117,6 +1134,7 @@ int do_huge_pmd_wp_page(struct mm_struct
get_user_huge_page(page);
spin_unlock(ptl);
alloc:
+ VM_BUG_ON(!vma->anon_vma);
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -1226,7 +1244,7 @@ struct page *follow_trans_huge_pmd(struc
goto out;
page = pmd_page(*pmd);
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ VM_BUG_ON_PAGE(!PageHead(page) && !PageTeam(page), page);
if (flags & FOLL_TOUCH) {
pmd_t _pmd;
/*
@@ -1251,7 +1269,7 @@ struct page *follow_trans_huge_pmd(struc
}
}
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageCompound(page) && !PageTeam(page), page);
if (flags & FOLL_GET)
get_page_foll(page);
@@ -1409,10 +1427,12 @@ int zap_huge_pmd(struct mmu_gather *tlb,
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
+ if (!PageAnon(page))
+ page_remove_team_rmap(page);
page_remove_rmap(page);
VM_BUG_ON_PAGE(page_mapcount(page) < 0, page);
- add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
- VM_BUG_ON_PAGE(!PageHead(page), page);
+ add_mm_counter(tlb->mm, PageAnon(page) ?
+ MM_ANONPAGES : MM_FILEPAGES, -HPAGE_PMD_NR);
atomic_long_dec(&tlb->mm->nr_ptes);
spin_unlock(ptl);
tlb_remove_page(tlb, page);
@@ -1456,8 +1476,7 @@ int move_huge_pmd(struct vm_area_struct
if ((old_addr & ~HPAGE_PMD_MASK) ||
(new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE ||
- (new_vma->vm_flags & VM_NOHUGEPAGE))
+ old_end - old_addr < HPAGE_PMD_SIZE)
goto out;
/*
@@ -1518,7 +1537,6 @@ int change_huge_pmd(struct vm_area_struc
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
- BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
@@ -2864,6 +2882,17 @@ void __split_huge_page_pmd(struct vm_are
unsigned long haddr = address & HPAGE_PMD_MASK;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
+ pmd_t pmdval;
+
+ pmdval = *pmd;
+ barrier();
+ if (!pmd_present(pmdval) || !pmd_trans_huge(pmdval))
+ return;
+ page = pmd_page(pmdval);
+ if (!PageAnon(page) && !is_huge_zero_page(page)) {
+ remap_team_by_ptes(vma, address, pmd, page);
+ return;
+ }
BUG_ON(vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE);
@@ -2976,3 +3005,180 @@ void __vma_adjust_trans_huge(struct vm_a
split_huge_page_address(next->vm_mm, nstart);
}
}
+
+/*
+ * huge pmd support for huge tmpfs
+ */
+
+static void page_add_team_rmap(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageTeam(page), page);
+ if (inc_hugely_mapped(page))
+ __inc_zone_page_state(page, NR_SHMEM_PMDMAPPED);
+}
+
+static void page_remove_team_rmap(struct page *page)
+{
+ VM_BUG_ON_PAGE(PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageTeam(page), page);
+ if (dec_hugely_mapped(page))
+ __dec_zone_page_state(page, NR_SHMEM_PMDMAPPED);
+}
+
+int map_team_by_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, struct page *page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable;
+ spinlock_t *pml;
+ pmd_t pmdval;
+ int ret = VM_FAULT_NOPAGE;
+
+ /*
+ * Another task may have mapped it in just ahead of us; but we
+ * have the huge page locked, so others will wait on us now... or,
+ * is there perhaps some way another might still map in a single pte?
+ */
+ VM_BUG_ON_PAGE(!PageTeam(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (!pmd_none(*pmd))
+ goto raced2;
+
+ addr &= HPAGE_PMD_MASK;
+ pgtable = pte_alloc_one(mm, addr);
+ if (!pgtable) {
+ ret = VM_FAULT_OOM;
+ goto raced2;
+ }
+
+ pml = pmd_lock(mm, pmd);
+ if (!pmd_none(*pmd))
+ goto raced1;
+ pmdval = mk_pmd(page, vma->vm_page_prot);
+ pmdval = pmd_mkhuge(pmd_mkdirty(pmdval));
+ set_pmd_at(mm, addr, pmd, pmdval);
+ page_add_file_rmap(page);
+ page_add_team_rmap(page);
+ update_mmu_cache_pmd(vma, addr, pmd);
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ atomic_long_inc(&mm->nr_ptes);
+ spin_unlock(pml);
+
+ unlock_page(page);
+ add_mm_counter(mm, MM_FILEPAGES, HPAGE_PMD_NR);
+ return ret;
+raced1:
+ spin_unlock(pml);
+ pte_free(mm, pgtable);
+raced2:
+ unlock_page(page);
+ page_cache_release(page);
+ return ret;
+}
+
+void unmap_team_by_pmd(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, struct page *page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pgtable_t pgtable = NULL;
+ unsigned long end;
+ spinlock_t *pml;
+
+ VM_BUG_ON_PAGE(!PageTeam(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ /*
+ * But even so there might be a racing zap_huge_pmd() or
+ * remap_team_by_ptes() while the page_table_lock is dropped.
+ */
+
+ addr &= HPAGE_PMD_MASK;
+ end = addr + HPAGE_PMD_SIZE;
+
+ mmu_notifier_invalidate_range_start(mm, addr, end);
+ pml = pmd_lock(mm, pmd);
+ if (pmd_trans_huge(*pmd) && pmd_page(*pmd) == page) {
+ pmdp_clear_flush(vma, addr, pmd);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ page_remove_team_rmap(page);
+ page_remove_rmap(page);
+ atomic_long_dec(&mm->nr_ptes);
+ }
+ spin_unlock(pml);
+ mmu_notifier_invalidate_range_end(mm, addr, end);
+
+ if (!pgtable)
+ return;
+
+ pte_free(mm, pgtable);
+ update_hiwater_rss(mm);
+ add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR);
+ page_cache_release(page);
+}
+
+static void remap_team_by_ptes(struct vm_area_struct *vma, unsigned long addr,
+ pmd_t *pmd, struct page *page)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct page *head = page;
+ pgtable_t pgtable;
+ unsigned long end;
+ spinlock_t *pml;
+ spinlock_t *ptl;
+ pte_t *pte;
+ pmd_t pmdval;
+ pte_t pteval;
+
+ addr &= HPAGE_PMD_MASK;
+ end = addr + HPAGE_PMD_SIZE;
+
+ mmu_notifier_invalidate_range_start(mm, addr, end);
+ pml = pmd_lock(mm, pmd);
+ if (!pmd_trans_huge(*pmd) || pmd_page(*pmd) != page)
+ goto raced;
+
+ pmdval = pmdp_clear_flush(vma, addr, pmd);
+ pgtable = pgtable_trans_huge_withdraw(mm, pmd);
+ pmd_populate(mm, pmd, pgtable);
+ ptl = pte_lockptr(mm, pmd);
+ if (ptl != pml)
+ spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ page_remove_team_rmap(page);
+ update_mmu_cache_pmd(vma, addr, pmd);
+
+ /*
+ * It would be nice to have prepared this page table in advance,
+ * so we could just switch from pmd to ptes under one lock.
+ * But a comment in zap_huge_pmd() warns that ppc64 needs
+ * to look at the deposited page table when clearing the pmd.
+ */
+ pte = pte_offset_map(pmd, addr);
+ do {
+ pteval = pte_mkdirty(mk_pte(page, vma->vm_page_prot));
+ if (!pmd_young(pmdval))
+ pteval = pte_mkold(pteval);
+ set_pte_at(mm, addr, pte, pteval);
+ if (page != head) {
+ /*
+ * We did not remove the head's rmap count above: that
+ * seems better than letting it slip to 0 for a moment.
+ */
+ page_add_file_rmap(page);
+ page_cache_get(page);
+ }
+ /*
+ * Move page flags from head to page,
+ * as __split_huge_page_refcount() does for anon?
+ * Start off by assuming not, but reconsider later.
+ */
+ } while (pte++, page++, addr += PAGE_SIZE, addr != end);
+
+ pte -= HPAGE_PMD_NR;
+ addr -= HPAGE_PMD_SIZE;
+ if (ptl != pml)
+ spin_unlock(ptl);
+ pte_unmap(pte);
+raced:
+ spin_unlock(pml);
+ mmu_notifier_invalidate_range_end(mm, addr, end);
+}
--- thpfs.orig/mm/memory.c 2015-02-20 19:34:42.875920943 -0800
+++ thpfs/mm/memory.c 2015-02-20 19:34:48.083909034 -0800
@@ -45,6 +45,7 @@
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/pageteam.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@@ -2716,9 +2717,19 @@ static int __do_fault(struct vm_area_str
vmf.flags = flags;
vmf.page = NULL;
+ /*
+ * Give huge pmd a chance before allocating pte or trying fault around.
+ */
+ if (unlikely(pmd_none(*pmd)))
+ vmf.flags |= FAULT_FLAG_MAY_HUGE;
+
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
+ if (unlikely(ret & VM_FAULT_HUGE)) {
+ ret |= map_team_by_pmd(vma, address, pmd, vmf.page);
+ return ret;
+ }
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf.page);
--