Date:   Sat, 22 Jun 2019 22:48:28 -0700
From:   Song Liu <songliubraving@...com>
To:     <linux-kernel@...r.kernel.org>, <linux-mm@...ck.org>
CC:     <matthew.wilcox@...cle.com>, <kirill.shutemov@...ux.intel.com>,
        <peterz@...radead.org>, <oleg@...hat.com>, <rostedt@...dmis.org>,
        <kernel-team@...com>, <william.kucharski@...cle.com>,
        Song Liu <songliubraving@...com>
Subject: [PATCH v6 5/6] khugepaged: enable collapse pmd for pte-mapped THP

khugepaged needs exclusive mmap_sem to access the page table. When it fails
to lock mmap_sem, the page will fault in as a pte-mapped THP. Since the page
is already a THP, khugepaged will not handle this pmd again.
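
For context, a condensed sketch of the pre-patch behavior in
retract_page_tables() (second khugepaged.c hunk below);
collapse_one_pmd() is a hypothetical stand-in for the retraction steps:

	if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
		collapse_one_pmd(vma, addr, pmd);  /* hypothetical helper */
		up_write(&vma->vm_mm->mmap_sem);
	}
	/*
	 * A trylock failure leaves no record, so the page faults back
	 * in pte-mapped and this pmd is never collapsed again.
	 */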

This patch enables khugepaged to retry retract_page_tables().

A new flag, AS_COLLAPSE_PMD, is introduced to indicate that the
address_space may contain pte-mapped THPs. When khugepaged fails to trylock
the mmap_sem, it sets AS_COLLAPSE_PMD. Then, at a later time, khugepaged
will retry the collapse for compound pages in this address_space.
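
In sketch form, the flag protocol looks like this (condensed from the
khugepaged.c hunks below; mapping, vma, start, and page are as in the
real call sites):

	/* retract_page_tables(): trylock failed, mark mapping for retry */
	if (!down_write_trylock(&vma->vm_mm->mmap_sem)) {
		set_bit(AS_COLLAPSE_PMD, &mapping->flags);
		continue;
	}

	/* khugepaged_scan_file(): consume the mark, retry the collapse */
	if (PageTransCompound(page) &&
	    test_and_clear_bit(AS_COLLAPSE_PMD, &mapping->flags))
		retract_page_tables(mapping, start, compound_head(page));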

Since the collapse may happen at a later time, some pages may have already
faulted in. To handle these pages properly, the pmd must be prepared before
collapsing. prepare_pmd_for_collapse() is introduced to prepare the pmd by
removing rmap and adjusting the refcount and mm_counter.
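
Condensed from prepare_pmd_for_collapse() below: with count ptes mapping
subpages of the THP, each pte-level rmap and page reference is dropped,
because the single pmd mapping established at re-fault replaces them. For
example, when all subpages have faulted in, count equals HPAGE_PMD_NR:

	page_remove_rmap(page, false);      /* once per mapped pte */
	page_ref_sub(hpage, count);         /* drop per-pte references */
	add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);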

prepare_pmd_for_collapse() also double-checks that all ptes in this pmd
map to the same THP. This is necessary because a subpage of the THP may
have been replaced, for example by uprobe. In such cases, it is not
possible to collapse the pmd, so we fall back.
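
This is step 1 of prepare_pmd_for_collapse() below. A sketch of the
failing case, where one subpage was replaced (e.g. by the anonymous copy
uprobe installs for a breakpoint):

	pte_t *pte = pte_offset_map(pmd, addr);

	/*
	 * vm_normal_page() returns the replacement page rather than
	 * hpage + i, so the check fails and we fall back.
	 */
	if (!pte_none(*pte) && hpage + i != vm_normal_page(vma, addr, *pte))
		return false;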

Signed-off-by: Song Liu <songliubraving@...com>
---
 include/linux/pagemap.h |  1 +
 mm/khugepaged.c         | 68 +++++++++++++++++++++++++++++++++++------
 2 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 9ec3544baee2..eac881de2a46 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -29,6 +29,7 @@ enum mapping_flags {
 	AS_EXITING	= 4, 	/* final truncate in progress */
 	/* writeback related tags are not used */
 	AS_NO_WRITEBACK_TAGS = 5,
+	AS_COLLAPSE_PMD = 6,	/* try collapse pmd for THP */
 };
 
 /**
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a4f90a1b06f5..9b980327fd9b 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1254,7 +1254,46 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
 }
 
 #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
-static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
+
+/* return whether the pmd is ready for collapse */
+bool prepare_pmd_for_collapse(struct vm_area_struct *vma, pgoff_t pgoff,
+			      struct page *hpage, pmd_t *pmd)
+{
+	unsigned long haddr = page_address_in_vma(hpage, vma);
+	unsigned long addr;
+	int i, count = 0;
+
+	/* step 1: check all mapped PTEs are to this huge page */
+	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+		pte_t *pte = pte_offset_map(pmd, addr);
+
+		if (pte_none(*pte))
+			continue;
+
+		if (hpage + i != vm_normal_page(vma, addr, *pte))
+			return false;
+		count++;
+	}
+
+	/* step 2: adjust rmap */
+	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+		pte_t *pte = pte_offset_map(pmd, addr);
+		struct page *page;
+
+		if (pte_none(*pte))
+			continue;
+		page = vm_normal_page(vma, addr, *pte);
+		page_remove_rmap(page, false);
+	}
+
+	/* step 3: set proper refcount and mm_counters. */
+	page_ref_sub(hpage, count);
+	add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+	return true;
+}
+
+static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
+				struct page *hpage)
 {
 	struct vm_area_struct *vma;
 	unsigned long addr;
@@ -1273,21 +1312,21 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		pmd = mm_find_pmd(vma->vm_mm, addr);
 		if (!pmd)
 			continue;
-		/*
-		 * We need exclusive mmap_sem to retract page table.
-		 * If trylock fails we would end up with pte-mapped THP after
-		 * re-fault. Not ideal, but it's more important to not disturb
-		 * the system too much.
-		 */
 		if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
 			spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
-			/* assume page table is clear */
+
+			if (!prepare_pmd_for_collapse(vma, pgoff, hpage, pmd)) {
+				spin_unlock(ptl);
+				up_write(&vma->vm_mm->mmap_sem);
+				continue;
+			}
 			_pmd = pmdp_collapse_flush(vma, addr, pmd);
 			spin_unlock(ptl);
 			up_write(&vma->vm_mm->mmap_sem);
 			mm_dec_nr_ptes(vma->vm_mm);
 			pte_free(vma->vm_mm, pmd_pgtable(_pmd));
-		}
+		} else
+			set_bit(AS_COLLAPSE_PMD, &mapping->flags);
 	}
 	i_mmap_unlock_write(mapping);
 }
@@ -1561,7 +1600,7 @@ static void collapse_file(struct mm_struct *mm,
 		/*
 		 * Remove pte page tables, so we can re-fault the page as huge.
 		 */
-		retract_page_tables(mapping, start);
+		retract_page_tables(mapping, start, new_page);
 		*hpage = NULL;
 
 		khugepaged_pages_collapsed++;
@@ -1622,6 +1661,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 	int present, swap;
 	int node = NUMA_NO_NODE;
 	int result = SCAN_SUCCEED;
+	bool collapse_pmd = false;
 
 	present = 0;
 	swap = 0;
@@ -1640,6 +1680,14 @@ static void khugepaged_scan_file(struct mm_struct *mm,
 		}
 
 		if (PageTransCompound(page)) {
+			if (collapse_pmd ||
+			    test_and_clear_bit(AS_COLLAPSE_PMD,
+					       &mapping->flags)) {
+				collapse_pmd = true;
+				retract_page_tables(mapping, start,
+						    compound_head(page));
+				continue;
+			}
 			result = SCAN_PAGE_COMPOUND;
 			break;
 		}
-- 
2.17.1
