lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20211110105428.32458-14-zhengqi.arch@bytedance.com>
Date:   Wed, 10 Nov 2021 18:54:26 +0800
From:   Qi Zheng <zhengqi.arch@...edance.com>
To:     akpm@...ux-foundation.org, tglx@...utronix.de,
        kirill.shutemov@...ux.intel.com, mika.penttila@...tfour.com,
        david@...hat.com, jgg@...dia.com
Cc:     linux-doc@...r.kernel.org, linux-kernel@...r.kernel.org,
        linux-mm@...ck.org, songmuchun@...edance.com,
        zhouchengming@...edance.com, Qi Zheng <zhengqi.arch@...edance.com>
Subject: [PATCH v3 13/15] mm/pte_ref: free user PTE page table pages

This commit introduces CONFIG_FREE_USER_PTE option and
implements the actual logic to the dummy helpers about
pte_ref.

Signed-off-by: Qi Zheng <zhengqi.arch@...edance.com>
---
 include/linux/mm.h      |  2 ++
 include/linux/pgtable.h |  3 +-
 include/linux/pte_ref.h | 53 ++++++++++++++++++++++++----
 mm/Kconfig              |  4 +++
 mm/debug_vm_pgtable.c   |  2 ++
 mm/memory.c             | 15 ++++++++
 mm/pte_ref.c            | 91 +++++++++++++++++++++++++++++++++++++++++++++----
 7 files changed, 156 insertions(+), 14 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 753a9435e0d0..18fbf9e0996a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -437,6 +437,7 @@ extern pgprot_t protection_map[16];
  * @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
  * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
  * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
+ * @FAULT_FLAG_PTE_GET: This means the refcount of the pte has been got.
  *
  * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
  * whether we would allow page faults to retry by specifying these two
@@ -468,6 +469,7 @@ enum fault_flag {
 	FAULT_FLAG_REMOTE =		1 << 7,
 	FAULT_FLAG_INSTRUCTION =	1 << 8,
 	FAULT_FLAG_INTERRUPTIBLE =	1 << 9,
+	FAULT_FLAG_PTE_GET =		1 << 10,
 };
 
 /*
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index c8f045705c1e..6ac51d58f11a 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -480,7 +480,6 @@ static inline pte_t ptep_get_lockless(pte_t *ptep)
 }
 #endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
@@ -491,6 +490,8 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 	return pmd;
 }
 #endif /* __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #ifndef __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
 static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
 					    unsigned long address,
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index b6d8335bdc59..8a26eaba83ef 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -8,6 +8,7 @@
 #define _LINUX_PTE_REF_H
 
 #include <linux/pgtable.h>
+#include <linux/page-flags.h>
 
 enum pte_tryget_type {
 	TRYGET_SUCCESSED,
@@ -16,12 +17,49 @@ enum pte_tryget_type {
 	TRYGET_FAILED_HUGE_PMD,
 };
 
-bool pte_get_unless_zero(pmd_t *pmd);
-enum pte_tryget_type pte_try_get(pmd_t *pmd);
 void pte_put_vmf(struct vm_fault *vmf);
+enum pte_tryget_type pte_try_get(pmd_t *pmd);
+bool pte_get_unless_zero(pmd_t *pmd);
+
+#ifdef CONFIG_FREE_USER_PTE
+void free_user_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr);
 
 static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
 {
+	pte->pmd = pmd;
+	atomic_set(&pte->pte_refcount, count);
+}
+
+static inline pmd_t *pte_to_pmd(pte_t *pte)
+{
+	return virt_to_page(pte)->pmd;
+}
+
+static inline void pte_update_pmd(pmd_t old_pmd, pmd_t *new_pmd)
+{
+	pmd_pgtable(old_pmd)->pmd = new_pmd;
+}
+
+static inline void pte_get_many(pmd_t *pmd, unsigned int nr)
+{
+	pgtable_t pte = pmd_pgtable(*pmd);
+
+	VM_BUG_ON(!PageTable(pte));
+	atomic_add(nr, &pte->pte_refcount);
+}
+
+static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr, unsigned int nr)
+{
+	pgtable_t pte = pmd_pgtable(*pmd);
+
+	VM_BUG_ON(!PageTable(pte));
+	if (atomic_sub_and_test(nr, &pte->pte_refcount))
+		free_user_pte_table(mm, pmd, addr & PMD_MASK);
+}
+#else
+static inline void pte_ref_init(pgtable_t pte, pmd_t *pmd, int count)
+{
 }
 
 static inline pmd_t *pte_to_pmd(pte_t *pte)
@@ -37,6 +75,12 @@ static inline void pte_get_many(pmd_t *pmd, unsigned int nr)
 {
 }
 
+static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr, unsigned int nr)
+{
+}
+#endif /* CONFIG_FREE_USER_PTE */
+
 /*
  * pte_get - Increment refcount for the PTE page table.
  * @pmd: a pointer to the pmd entry corresponding to the PTE page table.
@@ -66,11 +110,6 @@ static inline pte_t *pte_tryget_map_lock(struct mm_struct *mm, pmd_t *pmd,
 	return pte_offset_map_lock(mm, pmd, address, ptlp);
 }
 
-static inline void pte_put_many(struct mm_struct *mm, pmd_t *pmd,
-				unsigned long addr, unsigned int nr)
-{
-}
-
 /*
  * pte_put - Decrement refcount for the PTE page table.
  * @mm: the mm_struct of the target address space.
diff --git a/mm/Kconfig b/mm/Kconfig
index 5c5508fafcec..44549d287869 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -898,6 +898,10 @@ config IO_MAPPING
 config SECRETMEM
 	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
 
+config FREE_USER_PTE
+	def_bool y
+	depends on X86_64
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index 52f006654664..757dd84ee254 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -1049,8 +1049,10 @@ static void __init destroy_args(struct pgtable_debug_args *args)
 	/* Free page table entries */
 	if (args->start_ptep) {
 		pte_put(args->mm, args->start_pmdp, args->vaddr);
+#ifndef CONFIG_FREE_USER_PTE
 		pte_free(args->mm, args->start_ptep);
 		mm_dec_nr_ptes(args->mm);
+#endif
 	}
 
 	if (args->start_pmdp) {
diff --git a/mm/memory.c b/mm/memory.c
index e360ecd37a71..4d1ede78d1b0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -219,6 +219,17 @@ static void check_sync_rss_stat(struct task_struct *task)
 
 #endif /* SPLIT_RSS_COUNTING */
 
+#ifdef CONFIG_FREE_USER_PTE
+static inline void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+				  unsigned long addr)
+{
+	/*
+	 * We should never reach here since the PTE page tables are
+	 * dynamically freed.
+	 */
+	BUG();
+}
+#else
 /*
  * Note: this doesn't free the actual pages themselves. That
  * has been handled earlier when unmapping all the memory regions.
@@ -231,6 +242,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
 	pte_free_tlb(tlb, token, addr);
 	mm_dec_nr_ptes(tlb->mm);
 }
+#endif /* CONFIG_FREE_USER_PTE */
 
 static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 				unsigned long addr, unsigned long end,
@@ -4631,6 +4643,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 			goto retry;
 		vmf->orig_pte = *vmf->pte;
 
+		if (IS_ENABLED(CONFIG_FREE_USER_PTE))
+			vmf->flags |= FAULT_FLAG_PTE_GET;
+
 		/*
 		 * some architectures can have larger ptes than wordsize,
 		 * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index de109905bc8f..728e61cea25e 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -7,7 +7,10 @@
 
 #include <linux/pte_ref.h>
 #include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <asm/tlbflush.h>
 
+#ifdef CONFIG_FREE_USER_PTE
 /*
  * pte_get_unless_zero - Increment refcount for the PTE page table
  *			 unless it is zero.
@@ -15,7 +18,10 @@
  */
 bool pte_get_unless_zero(pmd_t *pmd)
 {
-	return true;
+	pgtable_t pte = pmd_pgtable(*pmd);
+
+	VM_BUG_ON(!PageTable(pte));
+	return atomic_inc_not_zero(&pte->pte_refcount);
 }
 
 /*
@@ -32,12 +38,20 @@ bool pte_get_unless_zero(pmd_t *pmd)
  */
 enum pte_tryget_type pte_try_get(pmd_t *pmd)
 {
-	if (unlikely(pmd_none(*pmd)))
-		return TRYGET_FAILED_NONE;
-	if (unlikely(is_huge_pmd(*pmd)))
-		return TRYGET_FAILED_HUGE_PMD;
+	int retval = TRYGET_SUCCESSED;
+	pmd_t pmdval;
 
-	return TRYGET_SUCCESSED;
+	rcu_read_lock();
+	pmdval = READ_ONCE(*pmd);
+	if (unlikely(pmd_none(pmdval)))
+		retval = TRYGET_FAILED_NONE;
+	else if (unlikely(is_huge_pmd(pmdval)))
+		retval = TRYGET_FAILED_HUGE_PMD;
+	else if (!pte_get_unless_zero(&pmdval))
+		retval = TRYGET_FAILED_ZERO;
+	rcu_read_unlock();
+
+	return retval;
 }
 
 /*
@@ -52,4 +66,69 @@ enum pte_tryget_type pte_try_get(pmd_t *pmd)
  */
 void pte_put_vmf(struct vm_fault *vmf)
 {
+	if (!(vmf->flags & FAULT_FLAG_PTE_GET))
+		return;
+	vmf->flags &= ~FAULT_FLAG_PTE_GET;
+
+	pte_put(vmf->vma->vm_mm, vmf->pmd, vmf->address);
+}
+#else
+bool pte_get_unless_zero(pmd_t *pmd)
+{
+	return true;
+}
+
+enum pte_tryget_type pte_try_get(pmd_t *pmd)
+{
+	if (unlikely(pmd_none(*pmd)))
+		return TRYGET_FAILED_NONE;
+
+	if (unlikely(is_huge_pmd(*pmd)))
+		return TRYGET_FAILED_HUGE_PMD;
+
+	return TRYGET_SUCCESSED;
+}
+
+void pte_put_vmf(struct vm_fault *vmf)
+{
+}
+#endif /* CONFIG_FREE_USER_PTE */
+
+#ifdef CONFIG_DEBUG_VM
+static void pte_free_debug(pmd_t pmd)
+{
+	pte_t *ptep = (pte_t *)pmd_page_vaddr(pmd);
+	int i = 0;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		BUG_ON(!pte_none(*ptep++));
+}
+#else
+static inline void pte_free_debug(pmd_t pmd)
+{
+}
+#endif
+
+static void pte_free_rcu(struct rcu_head *rcu)
+{
+	struct page *page = container_of(rcu, struct page, rcu_head);
+
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
+}
+
+void free_user_pte_table(struct mm_struct *mm, pmd_t *pmd, unsigned long addr)
+{
+	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
+	spinlock_t *ptl;
+	pmd_t pmdval;
+
+	ptl = pmd_lock(mm, pmd);
+	pmdval = pmdp_huge_get_and_clear(mm, addr, pmd);
+	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
+	spin_unlock(ptl);
+
+	pte_free_debug(pmdval);
+	mm_dec_nr_ptes(mm);
+	call_rcu(&pmd_pgtable(pmdval)->rcu_head, pte_free_rcu);
 }
-- 
2.11.0

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ