Message-Id: <9fb3dc75cb7f023750da2b4645fd098429deaad5.1722861064.git.zhengqi.arch@bytedance.com>
Date: Mon, 5 Aug 2024 20:55:08 +0800
From: Qi Zheng <zhengqi.arch@...edance.com>
To: david@...hat.com,
hughd@...gle.com,
willy@...radead.org,
mgorman@...e.de,
muchun.song@...ux.dev,
vbabka@...nel.org,
akpm@...ux-foundation.org,
zokeefe@...gle.com,
rientjes@...gle.com
Cc: linux-mm@...ck.org,
linux-kernel@...r.kernel.org,
Qi Zheng <zhengqi.arch@...edance.com>
Subject: [RFC PATCH v2 4/7] mm: pgtable: try to reclaim empty PTE pages in zap_page_range_single()
In order to pursue high performance, applications mostly use high-performance
userspace memory allocators, such as jemalloc or tcmalloc. These allocators
use madvise(MADV_DONTNEED or MADV_FREE) to release physical memory, but
neither MADV_DONTNEED nor MADV_FREE releases the page table memory, which can
lead to huge page table memory usage.
The following is a memory usage snapshot of one process, as actually observed
on one of our servers:
VIRT: 55t
RES: 590g
VmPTE: 110g
In this case, most of the page table entries are empty. When all of the
entries in a PTE page are empty, that page can be freed back to the system
for others to use.
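(For reference: assuming 4 KiB base pages and x86-64-style page tables, one
PTE page maps 2 MiB of virtual address space, so fully populating the page
tables for 55 TiB of virtual memory takes about 55 TiB / 2 MiB * 4 KiB =
110 GiB, which matches the VmPTE value above.)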
As a first step, this commit tries to synchronously free empty PTE pages in
zap_page_range_single() (which MADV_DONTNEED etc. will invoke). In order to
reduce overhead, we only handle the cases with a high probability of
generating empty PTE pages; other cases are filtered out, such as:
- hugetlb vma (unsuitable)
- userfaultfd_wp vma (may reinstall the pte entry)
- writable private file mapping case (COW-ed anon page is not zapped)
- etc
For the userfaultfd_wp and private file mapping cases (and the MADV_FREE
case, of course), consider scanning and freeing empty PTE pages
asynchronously in the future.
The following test case shows the effect of the optimization:
mmap 50G
while (1) {
        for (; i < 1024 * 25; i++) {
                touch 2M memory
                madvise MADV_DONTNEED 2M
        }
}
As we can see, the memory usage of VmPTE is reduced:
                   before            after
VIRT              50.0 GB          50.0 GB
RES                3.1 MB           3.1 MB
VmPTE           102640 KB           240 KB
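For reference, a minimal userspace reproducer along the lines of the
pseudocode above could look like the following sketch (an illustration only,
with the 50G mapping size, 2M chunk size and loop bound taken from the
pseudocode; it is not the exact test program used):

#include <string.h>
#include <sys/mman.h>

#define TOTAL_SIZE      (50UL << 30)    /* mmap 50G of anonymous memory */
#define CHUNK_SIZE      (2UL << 20)     /* touch/zap in 2M chunks */

int main(void)
{
        char *buf = mmap(NULL, TOTAL_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (buf == MAP_FAILED)
                return 1;

        for (;;) {
                for (unsigned long i = 0; i < 1024 * 25; i++) {
                        char *chunk = buf + i * CHUNK_SIZE;

                        /* touch 2M of memory to populate PTE pages */
                        memset(chunk, 1, CHUNK_SIZE);
                        /* zap it again; the PTE pages used to stay behind */
                        madvise(chunk, CHUNK_SIZE, MADV_DONTNEED);
                }
        }
        return 0;
}

Watching the VmPTE field of /proc/<pid>/status while this runs is how the
numbers above can be observed.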
Signed-off-by: Qi Zheng <zhengqi.arch@...edance.com>
---
include/linux/pgtable.h | 14 +++++
mm/Makefile | 1 +
mm/huge_memory.c | 3 +
mm/internal.h | 14 +++++
mm/khugepaged.c | 30 +++++++--
mm/memory.c | 2 +
mm/pt_reclaim.c | 131 ++++++++++++++++++++++++++++++++++++++++
7 files changed, 189 insertions(+), 6 deletions(-)
create mode 100644 mm/pt_reclaim.c
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 2a6a3cccfc367..572343650eb0f 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -447,6 +447,20 @@ static inline void arch_check_zapped_pmd(struct vm_area_struct *vma,
}
#endif
+#ifndef arch_flush_tlb_before_set_huge_page
+static inline void arch_flush_tlb_before_set_huge_page(struct mm_struct *mm,
+ unsigned long addr)
+{
+}
+#endif
+
+#ifndef arch_flush_tlb_before_set_pte_page
+static inline void arch_flush_tlb_before_set_pte_page(struct mm_struct *mm,
+ unsigned long addr)
+{
+}
+#endif
+
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address,
diff --git a/mm/Makefile b/mm/Makefile
index ab5ed56c5c033..8bec86469c1d5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -145,3 +145,4 @@ obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
+obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 697fcf89f975b..0afbb1e45cdac 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -999,6 +999,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
+ arch_flush_tlb_before_set_huge_page(vma->vm_mm, haddr);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1066,6 +1067,7 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
entry = mk_pmd(&zero_folio->page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ arch_flush_tlb_before_set_huge_page(mm, haddr);
set_pmd_at(mm, haddr, pmd, entry);
mm_inc_nr_ptes(mm);
}
@@ -1173,6 +1175,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
pgtable = NULL;
}
+ arch_flush_tlb_before_set_huge_page(mm, addr);
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
diff --git a/mm/internal.h b/mm/internal.h
index dfc992de01115..09bd1cee7a523 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1441,4 +1441,18 @@ static inline bool try_to_accept_memory(struct zone *zone, unsigned int order)
}
#endif /* CONFIG_UNACCEPTED_MEMORY */
+#ifdef CONFIG_PT_RECLAIM
+void try_to_reclaim_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                             unsigned long start_addr, unsigned long end_addr,
+                             struct zap_details *details);
+#else
+static inline void try_to_reclaim_pgtables(struct mmu_gather *tlb,
+                                           struct vm_area_struct *vma,
+                                           unsigned long start_addr,
+                                           unsigned long end_addr,
+                                           struct zap_details *details)
+{
+}
+#endif /* CONFIG_PT_RECLAIM */
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 91b93259ee214..ffd3963b1c3d1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1598,7 +1598,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
pml = pmd_lock(mm, pmd);
- start_pte = pte_offset_map_nolock(mm, pmd, NULL, haddr, &ptl);
+ start_pte = pte_offset_map_nolock(mm, pmd, &pgt_pmd, haddr, &ptl);
if (!start_pte) /* mmap_lock + page lock should prevent this */
goto abort;
if (!pml)
@@ -1606,6 +1606,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
else if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ /* pmd entry may be changed by others */
+ if (unlikely(IS_ENABLED(CONFIG_PT_RECLAIM) && !pml &&
+ !pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+ goto abort;
+
/* step 2: clear page table and adjust rmap */
for (i = 0, addr = haddr, pte = start_pte;
i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1651,6 +1656,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
/* step 4: remove empty page table */
if (!pml) {
pml = pmd_lock(mm, pmd);
+ if (unlikely(IS_ENABLED(CONFIG_PT_RECLAIM) &&
+ !pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+ spin_unlock(pml);
+ goto pmd_change;
+ }
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
}
@@ -1682,6 +1692,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
pte_unmap_unlock(start_pte, ptl);
if (pml && pml != ptl)
spin_unlock(pml);
+pmd_change:
if (notified)
mmu_notifier_invalidate_range_end(&range);
drop_folio:
@@ -1703,6 +1714,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
spinlock_t *pml;
spinlock_t *ptl;
bool skipped_uffd = false;
+ pte_t *pte;
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
@@ -1738,11 +1750,17 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
addr, addr + HPAGE_PMD_SIZE);
mmu_notifier_invalidate_range_start(&range);
+ pte = pte_offset_map_nolock(mm, pmd, &pgt_pmd, addr, &ptl);
+ if (!pte)
+ goto skip;
+
pml = pmd_lock(mm, pmd);
- ptl = pte_lockptr(mm, pmd);
if (ptl != pml)
spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+ if (unlikely(IS_ENABLED(CONFIG_PT_RECLAIM) &&
+ !pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+ goto unlock_skip;
/*
* Huge page lock is still held, so normally the page table
* must remain empty; and we have already skipped anon_vma
@@ -1758,11 +1776,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
pgt_pmd = pmdp_collapse_flush(vma, addr, pmd);
pmdp_get_lockless_sync();
}
-
+unlock_skip:
+ pte_unmap_unlock(pte, ptl);
if (ptl != pml)
- spin_unlock(ptl);
- spin_unlock(pml);
-
+ spin_unlock(pml);
+skip:
mmu_notifier_invalidate_range_end(&range);
if (!skipped_uffd) {
diff --git a/mm/memory.c b/mm/memory.c
index fef1e425e4702..a8108451e4dac 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -423,6 +423,7 @@ void pmd_install(struct mm_struct *mm, pmd_t *pmd, unsigned long addr,
spinlock_t *ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
+ arch_flush_tlb_before_set_pte_page(mm, addr);
mm_inc_nr_ptes(mm);
/*
* Ensure all pte setup (eg. pte page lock and page clearing) are
@@ -1931,6 +1932,7 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
* could have been expanded for hugetlb pmd sharing.
*/
unmap_single_vma(&tlb, vma, address, end, details, false);
+ try_to_reclaim_pgtables(&tlb, vma, address, end, details);
mmu_notifier_invalidate_range_end(&range);
tlb_finish_mmu(&tlb);
hugetlb_zap_end(vma, details);
diff --git a/mm/pt_reclaim.c b/mm/pt_reclaim.c
new file mode 100644
index 0000000000000..e375e7f2059f8
--- /dev/null
+++ b/mm/pt_reclaim.c
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/pagewalk.h>
+#include <linux/hugetlb.h>
+#include <asm-generic/tlb.h>
+#include <asm/pgalloc.h>
+
+#include "internal.h"
+
+/*
+ * Locking:
+ * - the mmap read lock is already held to traverse the pgtable
+ * - use the pmd lock for clearing the pmd entry
+ * - use the pte lock for checking whether the PTE page is empty, and release
+ *   it after clearing the pmd entry, so that the changed pmd can be captured
+ *   in pte_offset_map_lock() etc. after the pte lock is taken. Thanks to
+ *   this, we don't need to hold the rmap-related locks.
+ * - users of pte_offset_map_lock() etc. all expect the PTE page to be stable
+ *   under the rcu lock, so PTE pages should be freed by RCU.
+ */
+static int reclaim_pgtables_pmd_entry(pmd_t *pmd, unsigned long addr,
+                                      unsigned long next, struct mm_walk *walk)
+{
+        struct mm_struct *mm = walk->mm;
+        struct mmu_gather *tlb = walk->private;
+        pte_t *start_pte, *pte;
+        pmd_t pmdval;
+        spinlock_t *pml = NULL, *ptl;
+        int i;
+
+        start_pte = pte_offset_map_nolock(mm, pmd, &pmdval, addr, &ptl);
+        if (!start_pte)
+                return 0;
+
+        pml = pmd_lock(mm, pmd);
+        if (ptl != pml)
+                spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
+
+        if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd))))
+                goto out_ptl;
+
+        /* Check if it is empty PTE page */
+        for (i = 0, pte = start_pte; i < PTRS_PER_PTE; i++, pte++) {
+                if (!pte_none(ptep_get(pte)))
+                        goto out_ptl;
+        }
+        pte_unmap(start_pte);
+
+        pmd_clear(pmd);
+        if (ptl != pml)
+                spin_unlock(ptl);
+        spin_unlock(pml);
+
+        /*
+         * NOTE:
+         * In order to reuse mmu_gather to batch the TLB flush and the freeing
+         * of PTE pages, the TLB is not flushed here before the pmd lock is
+         * unlocked. This may result in the following two situations:
+         *
+         * 1) Userland can trigger a page fault and fill a huge page, which
+         *    will cause a small-size TLB entry and a huge TLB entry to exist
+         *    for the same address.
+         *
+         * 2) Userland can also trigger a page fault and fill a PTE page, which
+         *    will cause two small-size TLB entries to exist, but the PTE pages
+         *    they map are different.
+         *
+         * Some CPUs do not allow this; to solve it, we can define
+         * arch_flush_tlb_before_set_{huge|pte}_page to detect these cases and
+         * flush the TLB before filling a huge page or a PTE page in the page
+         * fault path.
+         */
+        pte_free_tlb(tlb, pmd_pgtable(pmdval), addr);
+        mm_dec_nr_ptes(mm);
+
+        return 0;
+
+out_ptl:
+        pte_unmap_unlock(start_pte, ptl);
+        if (pml != ptl)
+                spin_unlock(pml);
+
+        return 0;
+}
+
+static const struct mm_walk_ops reclaim_pgtables_walk_ops = {
+        .pmd_entry = reclaim_pgtables_pmd_entry,
+        .walk_lock = PGWALK_RDLOCK,
+};
+
+void try_to_reclaim_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                             unsigned long start_addr, unsigned long end_addr,
+                             struct zap_details *details)
+{
+        unsigned long start = max(vma->vm_start, start_addr);
+        unsigned long end;
+
+        if (start >= vma->vm_end)
+                return;
+        end = min(vma->vm_end, end_addr);
+        if (end <= vma->vm_start)
+                return;
+
+        /* Skip hugetlb case */
+        if (is_vm_hugetlb_page(vma))
+                return;
+
+        /* Leave this to the THP path to handle */
+        if (vma->vm_flags & VM_HUGEPAGE)
+                return;
+
+        /* userfaultfd_wp case may reinstall the pte entry, also skip */
+        if (userfaultfd_wp(vma))
+                return;
+
+        /*
+         * For private file mapping, the COW-ed page is an anon page, and it
+         * will not be zapped. For simplicity, skip all the writable private
+         * file mapping cases.
+         */
+        if (details && !vma_is_anonymous(vma) &&
+            !(vma->vm_flags & VM_MAYSHARE) &&
+            (vma->vm_flags & VM_WRITE))
+                return;
+
+        start = ALIGN(start, PMD_SIZE);
+        end = ALIGN_DOWN(end, PMD_SIZE);
+        if (end - start < PMD_SIZE)
+                return;
+
+        walk_page_range_vma(vma, start, end, &reclaim_pgtables_walk_ops, tlb);
+}
--
2.20.1