Message-Id: <20230414142341.354556-2-shiyn.lin@gmail.com>
Date: Fri, 14 Apr 2023 22:23:25 +0800
From: Chih-En Lin <shiyn.lin@...il.com>
To: Andrew Morton <akpm@...ux-foundation.org>,
Qi Zheng <zhengqi.arch@...edance.com>,
David Hildenbrand <david@...hat.com>,
"Matthew Wilcox (Oracle)" <willy@...radead.org>,
Christophe Leroy <christophe.leroy@...roup.eu>,
John Hubbard <jhubbard@...dia.com>,
Nadav Amit <namit@...are.com>, Barry Song <baohua@...nel.org>,
Pasha Tatashin <pasha.tatashin@...een.com>
Cc: Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>, Borislav Petkov <bp@...en8.de>,
Dave Hansen <dave.hansen@...ux.intel.com>,
"H. Peter Anvin" <hpa@...or.com>,
Steven Rostedt <rostedt@...dmis.org>,
Masami Hiramatsu <mhiramat@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
Arnaldo Carvalho de Melo <acme@...nel.org>,
Mark Rutland <mark.rutland@....com>,
Alexander Shishkin <alexander.shishkin@...ux.intel.com>,
Jiri Olsa <jolsa@...nel.org>,
Namhyung Kim <namhyung@...nel.org>,
Ian Rogers <irogers@...gle.com>,
Adrian Hunter <adrian.hunter@...el.com>,
Yu Zhao <yuzhao@...gle.com>,
Steven Barrett <steven@...uorix.net>,
Juergen Gross <jgross@...e.com>, Peter Xu <peterx@...hat.com>,
Kefeng Wang <wangkefeng.wang@...wei.com>,
Tong Tiangen <tongtiangen@...wei.com>,
Christoph Hellwig <hch@...radead.org>,
"Liam R. Howlett" <Liam.Howlett@...cle.com>,
Yang Shi <shy828301@...il.com>,
Vlastimil Babka <vbabka@...e.cz>,
Alex Sierra <alex.sierra@....com>,
Vincent Whitchurch <vincent.whitchurch@...s.com>,
Anshuman Khandual <anshuman.khandual@....com>,
Li kunyu <kunyu@...china.com>,
Liu Shixin <liushixin2@...wei.com>,
Hugh Dickins <hughd@...gle.com>,
Minchan Kim <minchan@...nel.org>,
Joey Gouly <joey.gouly@....com>,
Chih-En Lin <shiyn.lin@...il.com>,
Michal Hocko <mhocko@...e.com>,
Suren Baghdasaryan <surenb@...gle.com>,
"Zach O'Keefe" <zokeefe@...gle.com>,
Gautam Menghani <gautammenghani201@...il.com>,
Catalin Marinas <catalin.marinas@....com>,
Mark Brown <broonie@...nel.org>,
"Eric W. Biederman" <ebiederm@...ssion.com>,
Andrei Vagin <avagin@...il.com>,
Shakeel Butt <shakeelb@...gle.com>,
Daniel Bristot de Oliveira <bristot@...nel.org>,
"Jason A. Donenfeld" <Jason@...c4.com>,
Greg Kroah-Hartman <gregkh@...uxfoundation.org>,
Alexey Gladkov <legion@...nel.org>, x86@...nel.org,
linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org, linux-trace-kernel@...r.kernel.org,
linux-perf-users@...r.kernel.org,
Dinglan Peng <peng301@...due.edu>,
Pedro Fonseca <pfonseca@...due.edu>,
Jim Huang <jserv@...s.ncku.edu.tw>,
Huichun Feng <foxhoundsk.tw@...il.com>
Subject: [PATCH v5 01/17] mm: Split out the present cases from zap_pte_range()

As the complexity of zap_pte_range() has increased, its readability
and maintainability have suffered. To simplify the function and make
the PTE-zapping path easier to extend, split the present and
non-present cases out of zap_pte_range(), and replace the individual
flag variables with a single flags field manipulated with bitwise
operations.
Signed-off-by: Chih-En Lin <shiyn.lin@...il.com>
---
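Note: as a minimal illustrative sketch (not part of the patch itself),
the single-flags pattern that replaces the old force_flush variable
works roughly as below; the ZAP_PTE_* names come from the patch, while
the surrounding conditions are stand-ins for the real zap logic:

	#define ZAP_PTE_INIT		0x0000
	#define ZAP_PTE_FORCE_FLUSH	0x0001

	unsigned int flags = ZAP_PTE_INIT;

	/* set the bit once a TLB flush becomes necessary */
	flags |= ZAP_PTE_FORCE_FLUSH;

	/* later: test the bit, and clear it before retrying the loop */
	if (flags & ZAP_PTE_FORCE_FLUSH) {
		flags &= ~ZAP_PTE_FORCE_FLUSH;
		/* ... do the actual TLB flush here ... */
	}
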
mm/memory.c | 217 +++++++++++++++++++++++++++++++---------------------
1 file changed, 129 insertions(+), 88 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index 01a23ad48a04..0476cf22ea33 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1351,29 +1351,147 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
}
+#define ZAP_PTE_INIT 0x0000
+#define ZAP_PTE_FORCE_FLUSH 0x0001
+
+struct zap_pte_details {
+ pte_t **pte;
+ unsigned long *addr;
+ unsigned int flags;
+ int rss[NR_MM_COUNTERS];
+};
+
+/* Return 0 to continue, 1 to break. */
+static inline int
+zap_present_pte(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ struct zap_details *details,
+ struct zap_pte_details *pte_details)
+{
+ struct mm_struct *mm = tlb->mm;
+ struct page *page;
+ unsigned int delay_rmap;
+ unsigned long addr = *pte_details->addr;
+ pte_t *pte = *pte_details->pte;
+ pte_t ptent = *pte;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (unlikely(!should_zap_page(details, page)))
+ return 0;
+
+ ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+ if (unlikely(!page))
+ return 0;
+
+ delay_rmap = 0;
+ if (!PageAnon(page)) {
+ if (pte_dirty(ptent)) {
+ set_page_dirty(page);
+ if (tlb_delay_rmap(tlb)) {
+ delay_rmap = 1;
+ pte_details->flags |= ZAP_PTE_FORCE_FLUSH;
+ }
+ }
+ if (pte_young(ptent) && likely(vma_has_recency(vma)))
+ mark_page_accessed(page);
+
+ }
+ pte_details->rss[mm_counter(page)]--;
+ if (!delay_rmap) {
+ page_remove_rmap(page, vma, false);
+ if (unlikely(page_mapcount(page) < 0))
+ print_bad_pte(vma, addr, ptent, page);
+ }
+ if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
+ *pte_details->addr += PAGE_SIZE;
+ pte_details->flags |= ZAP_PTE_FORCE_FLUSH;
+ return 1;
+ }
+
+ return 0;
+}
+
+static inline void
+zap_nopresent_pte(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ struct zap_details *details,
+ struct zap_pte_details *pte_details)
+{
+ struct mm_struct *mm = tlb->mm;
+ struct page *page;
+ unsigned long addr = *pte_details->addr;
+ pte_t *pte = *pte_details->pte;
+ pte_t ptent = *pte;
+ swp_entry_t entry = pte_to_swp_entry(ptent);
+
+ if (is_device_private_entry(entry) ||
+ is_device_exclusive_entry(entry)) {
+ page = pfn_swap_entry_to_page(entry);
+ if (unlikely(!should_zap_page(details, page)))
+ return;
+ /*
+ * Both device private/exclusive mappings should only
+ * work with anonymous page so far, so we don't need to
+ * consider uffd-wp bit when zap. For more information,
+ * see zap_install_uffd_wp_if_needed().
+ */
+ WARN_ON_ONCE(!vma_is_anonymous(vma));
+ pte_details->rss[mm_counter(page)]--;
+ if (is_device_private_entry(entry))
+ page_remove_rmap(page, vma, false);
+ put_page(page);
+ } else if (!non_swap_entry(entry)) {
+ /* Genuine swap entry, hence a private anon page */
+ if (!should_zap_cows(details))
+ return;
+ pte_details->rss[MM_SWAPENTS]--;
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
+ } else if (is_migration_entry(entry)) {
+ page = pfn_swap_entry_to_page(entry);
+ if (!should_zap_page(details, page))
+ return;
+ pte_details->rss[mm_counter(page)]--;
+ } else if (pte_marker_entry_uffd_wp(entry)) {
+ /* Only drop the uffd-wp marker if explicitly requested */
+ if (!zap_drop_file_uffd_wp(details))
+ return;
+ } else if (is_hwpoison_entry(entry) ||
+ is_swapin_error_entry(entry)) {
+ if (!should_zap_cows(details))
+ return;
+ } else {
+ /* We should have covered all the swap entry types */
+ WARN_ON_ONCE(1);
+ }
+ pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
+ zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
struct zap_details *details)
{
struct mm_struct *mm = tlb->mm;
- int force_flush = 0;
- int rss[NR_MM_COUNTERS];
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
- swp_entry_t entry;
+ struct zap_pte_details pte_details = {
+ .addr = &addr,
+ .flags = ZAP_PTE_INIT,
+ .pte = &pte,
+ };
tlb_change_page_size(tlb, PAGE_SIZE);
again:
- init_rss_vec(rss);
+ init_rss_vec(pte_details.rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
pte = start_pte;
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
do {
pte_t ptent = *pte;
- struct page *page;
if (pte_none(ptent))
continue;
@@ -1382,95 +1500,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
break;
if (pte_present(ptent)) {
- unsigned int delay_rmap;
-
- page = vm_normal_page(vma, addr, ptent);
- if (unlikely(!should_zap_page(details, page)))
- continue;
- ptent = ptep_get_and_clear_full(mm, addr, pte,
- tlb->fullmm);
- tlb_remove_tlb_entry(tlb, pte, addr);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details,
- ptent);
- if (unlikely(!page))
- continue;
-
- delay_rmap = 0;
- if (!PageAnon(page)) {
- if (pte_dirty(ptent)) {
- set_page_dirty(page);
- if (tlb_delay_rmap(tlb)) {
- delay_rmap = 1;
- force_flush = 1;
- }
- }
- if (pte_young(ptent) && likely(vma_has_recency(vma)))
- mark_page_accessed(page);
- }
- rss[mm_counter(page)]--;
- if (!delay_rmap) {
- page_remove_rmap(page, vma, false);
- if (unlikely(page_mapcount(page) < 0))
- print_bad_pte(vma, addr, ptent, page);
- }
- if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
- force_flush = 1;
- addr += PAGE_SIZE;
+ if (zap_present_pte(tlb, vma, details, &pte_details))
break;
- }
continue;
}
-
- entry = pte_to_swp_entry(ptent);
- if (is_device_private_entry(entry) ||
- is_device_exclusive_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- if (unlikely(!should_zap_page(details, page)))
- continue;
- /*
- * Both device private/exclusive mappings should only
- * work with anonymous page so far, so we don't need to
- * consider uffd-wp bit when zap. For more information,
- * see zap_install_uffd_wp_if_needed().
- */
- WARN_ON_ONCE(!vma_is_anonymous(vma));
- rss[mm_counter(page)]--;
- if (is_device_private_entry(entry))
- page_remove_rmap(page, vma, false);
- put_page(page);
- } else if (!non_swap_entry(entry)) {
- /* Genuine swap entry, hence a private anon page */
- if (!should_zap_cows(details))
- continue;
- rss[MM_SWAPENTS]--;
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
- } else if (is_migration_entry(entry)) {
- page = pfn_swap_entry_to_page(entry);
- if (!should_zap_page(details, page))
- continue;
- rss[mm_counter(page)]--;
- } else if (pte_marker_entry_uffd_wp(entry)) {
- /* Only drop the uffd-wp marker if explicitly requested */
- if (!zap_drop_file_uffd_wp(details))
- continue;
- } else if (is_hwpoison_entry(entry) ||
- is_swapin_error_entry(entry)) {
- if (!should_zap_cows(details))
- continue;
- } else {
- /* We should have covered all the swap entry types */
- WARN_ON_ONCE(1);
- }
- pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
- zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
+ zap_nopresent_pte(tlb, vma, details, &pte_details);
} while (pte++, addr += PAGE_SIZE, addr != end);
- add_mm_rss_vec(mm, rss);
+ add_mm_rss_vec(mm, pte_details.rss);
arch_leave_lazy_mmu_mode();
/* Do the actual TLB flush before dropping ptl */
- if (force_flush) {
+ if (pte_details.flags & ZAP_PTE_FORCE_FLUSH) {
tlb_flush_mmu_tlbonly(tlb);
tlb_flush_rmaps(tlb, vma);
}
@@ -1482,8 +1523,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
* entries before releasing the ptl), free the batched
* memory too. Restart if we didn't do everything.
*/
- if (force_flush) {
- force_flush = 0;
+ if (pte_details.flags & ZAP_PTE_FORCE_FLUSH) {
+ pte_details.flags &= ~ZAP_PTE_FORCE_FLUSH;
tlb_flush_mmu(tlb);
}
--
2.34.1