lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1454686440-31218-2-git-send-email-gerald.schaefer@de.ibm.com>
Date:	Fri,  5 Feb 2016 16:34:00 +0100
From:	Gerald Schaefer <gerald.schaefer@...ibm.com>
To:	Andrew Morton <akpm@...ux-foundation.org>
Cc:	linux-mm@...ck.org, linux-kernel@...r.kernel.org,
	Naoya Horiguchi <n-horiguchi@...jp.nec.com>,
	"Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>,
	Konstantin Khlebnikov <koct9i@...il.com>,
	Michal Hocko <mhocko@...e.com>,
	Vlastimil Babka <vbabka@...e.cz>,
	Jerome Marchand <jmarchan@...hat.com>,
	Johannes Weiner <hannes@...xchg.org>,
	Dave Hansen <dave.hansen@...el.com>,
	Mel Gorman <mgorman@...e.de>,
	Dan Williams <dan.j.williams@...el.com>,
	Ross Zwisler <ross.zwisler@...ux.intel.com>,
	Benjamin Herrenschmidt <benh@...nel.crashing.org>,
	Paul Mackerras <paulus@...ba.org>,
	Martin Schwidefsky <schwidefsky@...ibm.com>,
	Heiko Carstens <heiko.carstens@...ibm.com>,
	Michael Holzheu <holzheu@...ux.vnet.ibm.com>
Subject: [PATCH RFC 1/1] numa: fix /proc/<pid>/numa_maps for THP

In gather_pte_stats() a THP pmd is cast into a pte, which is wrong because the
layouts may differ depending on the architecture. On s390 this will lead to
inaccurate numap_maps accounting in /proc because of misguided pte_present()
and pte_dirty() checks on the fake pte.

On other architectures pte_present() and pte_dirty() may work by chance, but
there will be an issue with direct-access (dax) mappings w/o underlying struct
pages when HAVE_PTE_SPECIAL is set and THP is available. In vm_normal_page()
the fake pte will be checked with pte_special() and because there is no
"special" bit in a pmd, this will always return false and the VM_PFNMAP |
VM_MIXEDMAP checking will be skipped. On dax mappings w/o struct pages, an
invalid struct page pointer will then be returned that can crash the kernel.

This patch fixes the numa_maps THP handling by introducing new "_pmd" variants
of the can_gather_numa_stats() and vm_normal_page() functions.

Signed-off-by: Gerald Schaefer <gerald.schaefer@...ibm.com>
Cc: <stable@...r.kernel.org>        [4.3+]
---
 fs/proc/task_mmu.c | 29 ++++++++++++++++++++++++++---
 include/linux/mm.h |  2 ++
 mm/memory.c        | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fa95ab2..0cd0eed 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1504,6 +1504,30 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
 	return page;
 }
 
+static struct page *can_gather_numa_stats_pmd(pmd_t pmd,
+					      struct vm_area_struct *vma,
+					      unsigned long addr)
+{
+	struct page *page;
+	int nid;
+
+	if (!pmd_present(pmd))
+		return NULL;
+
+	page = vm_normal_page_pmd(vma, addr, pmd);
+	if (!page)
+		return NULL;
+
+	if (PageReserved(page))
+		return NULL;
+
+	nid = page_to_nid(page);
+	if (!node_isset(nid, node_states[N_MEMORY]))
+		return NULL;
+
+	return page;
+}
+
 static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 		unsigned long end, struct mm_walk *walk)
 {
@@ -1515,12 +1539,11 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
-		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
-		page = can_gather_numa_stats(huge_pte, vma, addr);
+		page = can_gather_numa_stats_pmd(*pmd, vma, addr);
 		if (page)
-			gather_stats(page, md, pte_dirty(huge_pte),
+			gather_stats(page, md, pmd_dirty(*pmd),
 				     HPAGE_PMD_SIZE/PAGE_SIZE);
 		spin_unlock(ptl);
 		return 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a0ad7af..7809b4f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1111,6 +1111,8 @@ struct zap_details {
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		pte_t pte);
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd);
 
 int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address,
 		unsigned long size);
diff --git a/mm/memory.c b/mm/memory.c
index d279350..ce0a386 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -792,6 +792,44 @@ out:
 	return pfn_to_page(pfn);
 }
 
+struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
+				pmd_t pmd)
+{
+	unsigned long pfn = pmd_pfn(pmd);
+
+	/*
+	 * There is no pmd_special() but there may be special pmds, e.g.
+	 * in a direct-access (dax) mapping, so let's just replicate the
+	 * !HAVE_PTE_SPECIAL case from vm_normal_page() here.
+	 */
+	if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
+		if (vma->vm_flags & VM_MIXEDMAP) {
+			if (!pfn_valid(pfn))
+				return NULL;
+			goto out;
+		} else {
+			unsigned long off;
+			off = (addr - vma->vm_start) >> PAGE_SHIFT;
+			if (pfn == vma->vm_pgoff + off)
+				return NULL;
+			if (!is_cow_mapping(vma->vm_flags))
+				return NULL;
+		}
+	}
+
+	if (is_zero_pfn(pfn))
+		return NULL;
+	if (unlikely(pfn > highest_memmap_pfn))
+		return NULL;
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page tables.
+	 * eg. VDSO mappings can cause them to exist.
+	 */
+out:
+	return pfn_to_page(pfn);
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
-- 
2.3.9

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ