From 552d121375f88ba4db55460cd378c9150f994945 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sat, 29 Oct 2022 11:45:07 -0700
Subject: [PATCH 4/4] mm: delay rmap removal until after TLB flush

When we remove a page table entry, we are very careful to only free
the page after we have flushed the TLB, because other CPUs could still
be using the page through stale TLB entries until after the flush.

However, we have removed the rmap entry for that page early, which
means that functions like folio_mkclean() would end up not serializing
with the page table lock because the page had already been made
invisible to rmap.

And that is a problem, because while the TLB entry exists, we could
end up with the following situation:

 (a) one CPU could come in and clean the page, never seeing our
     mapping of it

 (b) another CPU could continue to use the stale and dirty TLB entry
     and continue to write to said page

resulting in a page that has been dirtied, but then marked clean
again, all while another CPU might have dirtied it some more.

End result: possibly lost dirty data.

This commit uses the same old TLB gather array that we use to delay
the freeing of the page to also say 'remove from rmap after flush', so
that we can keep the rmap entries alive until all TLB entries have
been flushed.

NOTE! While the "possibly lost dirty data" sounds catastrophic, for
this all to happen you need to have a user thread doing either
madvise() with MADV_DONTNEED or a full re-mmap() of the area
concurrently with another thread continuing to use said mapping.

So arguably this is about user space doing crazy things, but from a VM
consistency standpoint it's better if we track the dirty bit properly
even then.

Reported-by: Nadav Amit
Signed-off-by: Linus Torvalds
---
 include/asm-generic/tlb.h | 36 +++++++++++++++++++++++++++++-----
 mm/memory.c               |  3 +--
 mm/mmu_gather.c           | 41 +++++++++++++++++++++++++++++++++++----
 mm/rmap.c                 |  4 +---
 4 files changed, 70 insertions(+), 14 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 492dce43236e..a5c9c9989fd2 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -238,11 +238,37 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
  */
 #define MMU_GATHER_BUNDLE	8
 
+/*
+ * Fake type for an encoded page with flag bits in the low bits.
+ *
+ * Right now just one bit, but we could have more depending on the
+ * alignment of 'struct page'.
+ */
+struct encoded_page;
+#define TLB_ZAP_RMAP 1ul
+#define ENCODE_PAGE_BITS (TLB_ZAP_RMAP)
+
+static inline struct encoded_page *encode_page(struct page *page, unsigned long flags)
+{
+        flags &= ENCODE_PAGE_BITS;
+        return (struct encoded_page *)(flags | (unsigned long)page);
+}
+
+static inline bool encoded_page_flags(struct encoded_page *page)
+{
+        return ENCODE_PAGE_BITS & (unsigned long)page;
+}
+
+static inline struct page *encoded_page_ptr(struct encoded_page *page)
+{
+        return (struct page *)(~ENCODE_PAGE_BITS & (unsigned long)page);
+}
+
 struct mmu_gather_batch {
         struct mmu_gather_batch *next;
         unsigned int            nr;
         unsigned int            max;
-        struct page             *pages[];
+        struct encoded_page     *encoded_pages[];
 };
 
 #define MAX_GATHER_BATCH        \
@@ -257,7 +283,7 @@ struct mmu_gather_batch {
 #define MAX_GATHER_BATCH_COUNT  (10000UL/MAX_GATHER_BATCH)
 
 extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
-                                   int page_size);
+                                   int page_size, unsigned int flags);
 #endif
 
 /*
@@ -431,13 +457,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
                                         struct page *page, int page_size)
 {
-        if (__tlb_remove_page_size(tlb, page, page_size))
+        if (__tlb_remove_page_size(tlb, page, page_size, 0))
                 tlb_flush_mmu(tlb);
 }
 
-static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, unsigned int flags)
 {
-        return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
+        return __tlb_remove_page_size(tlb, page, PAGE_SIZE, flags);
 }
 
 /* tlb_remove_page
diff --git a/mm/memory.c b/mm/memory.c
index c893f5ffc5a8..230946536115 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1452,12 +1452,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                     likely(!(vma->vm_flags & VM_SEQ_READ)))
                                         mark_page_accessed(page);
                         }
-                        page_zap_pte_rmap(page);
                         munlock_vma_page(page, vma, false);
                         rss[mm_counter(page)]--;
                         if (unlikely(page_mapcount(page) < 0))
                                 print_bad_pte(vma, addr, ptent, page);
-                        if (unlikely(__tlb_remove_page(tlb, page))) {
+                        if (unlikely(__tlb_remove_page(tlb, page, TLB_ZAP_RMAP))) {
                                 force_flush = 1;
                                 addr += PAGE_SIZE;
                                 break;
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index add4244e5790..587873e5984c 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 
 #include
 #include
@@ -43,12 +44,43 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
         return true;
 }
 
+/*
+ * We get an 'encoded page' array, which has page pointers with
+ * flags encoded in the low bits of each entry.
+ *
+ * The TLB has been flushed, so now we need to react to the flag bits
+ * in the 'struct page' pointers, clean the array in-place, and then
+ * free the pages and their swap cache.
+ */
+static void clean_and_free_pages_and_swap_cache(struct encoded_page **pages, unsigned int nr)
+{
+        for (unsigned int i = 0; i < nr; i++) {
+                struct encoded_page *encoded = pages[i];
+                unsigned int flags = encoded_page_flags(encoded);
+                if (flags) {
+                        /* Clean the flagged pointer in-place */
+                        struct page *page = encoded_page_ptr(encoded);
+                        pages[i] = encode_page(page, 0);
+
+                        /* The flag bit being set means that we should zap the rmap */
+                        page_zap_pte_rmap(page);
+                }
+        }
+
+        /*
+         * Now all entries have been un-encoded, and changed to plain
+         * page pointers, so we can cast the 'encoded_page' array to
+         * a plain page array and free them
+         */
+        free_pages_and_swap_cache((struct page **)pages, nr);
+}
+
 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
 {
         struct mmu_gather_batch *batch;
 
         for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
-                struct page **pages = batch->pages;
+                struct encoded_page **pages = batch->encoded_pages;
 
                 do {
                         /*
@@ -56,7 +88,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
                          */
                         unsigned int nr = min(512U, batch->nr);
 
-                        free_pages_and_swap_cache(pages, nr);
+                        clean_and_free_pages_and_swap_cache(pages, nr);
                         pages += nr;
                         batch->nr -= nr;
 
@@ -77,11 +109,12 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
         tlb->local.next = NULL;
 }
 
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size, unsigned int flags)
 {
         struct mmu_gather_batch *batch;
 
         VM_BUG_ON(!tlb->end);
+        VM_BUG_ON(flags & ~ENCODE_PAGE_BITS);
 
 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
         VM_WARN_ON(tlb->page_size != page_size);
@@ -92,7 +125,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
          * Add the page and check if we are full. If so
          * force a flush.
          */
-        batch->pages[batch->nr++] = page;
+        batch->encoded_pages[batch->nr++] = encode_page(page, flags);
         if (batch->nr == batch->max) {
                 if (!tlb_next_batch(tlb))
                         return true;
diff --git a/mm/rmap.c b/mm/rmap.c
index 28b51a31ebb0..416b7078b75f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1422,8 +1422,6 @@ static void page_remove_anon_compound_rmap(struct page *page)
  * separately.
  *
  * This allows for a much simpler calling convention and code.
- *
- * The caller holds the pte lock.
  */
 void page_zap_pte_rmap(struct page *page)
 {
@@ -1431,7 +1429,7 @@ void page_zap_pte_rmap(struct page *page)
                 return;
 
         lock_page_memcg(page);
-        __dec_lruvec_page_state(page,
+        dec_lruvec_page_state(page,
                 PageAnon(page) ? NR_ANON_MAPPED : NR_FILE_MAPPED);
         unlock_page_memcg(page);
 }
-- 
2.37.1.289.g45aa1e5c72.dirty
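
[Editor's note, not part of the patch above] The mechanism the patch depends on is tagging 'struct page' pointers with flag bits in their low, always-zero bits, so the TLB gather array can carry "zap the rmap after the flush" alongside the page itself. Below is a minimal standalone userspace sketch of that encode/decode trick, given here only as an illustration under stated assumptions: the 'struct demo_page' type, the demo_* helpers and the main() driver are invented for this sketch and do not exist in the kernel, and uintptr_t stands in for the kernel's use of unsigned long.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for 'struct page'; int alignment guarantees bit 0 is zero */
struct demo_page { int payload; };

/* One flag bit, mirroring TLB_ZAP_RMAP / ENCODE_PAGE_BITS in the patch */
#define DEMO_ZAP_RMAP   ((uintptr_t)1)
#define DEMO_FLAG_BITS  DEMO_ZAP_RMAP

/* Opaque "fake" type so tagged and plain pointers cannot be mixed up */
struct encoded_demo_page;

static inline struct encoded_demo_page *demo_encode(struct demo_page *page, uintptr_t flags)
{
        /* Fold the (masked) flag bits into the low bits of the pointer */
        flags &= DEMO_FLAG_BITS;
        return (struct encoded_demo_page *)(flags | (uintptr_t)page);
}

static inline uintptr_t demo_flags(struct encoded_demo_page *page)
{
        /* Read back only the flag bits */
        return DEMO_FLAG_BITS & (uintptr_t)page;
}

static inline struct demo_page *demo_ptr(struct encoded_demo_page *page)
{
        /* Strip the flag bits to recover the original pointer */
        return (struct demo_page *)(~DEMO_FLAG_BITS & (uintptr_t)page);
}

int main(void)
{
        static struct demo_page page;
        struct encoded_demo_page *enc = demo_encode(&page, DEMO_ZAP_RMAP);

        /* The flag travels with the pointer, and both are recoverable */
        assert(demo_flags(enc) == DEMO_ZAP_RMAP);
        assert(demo_ptr(enc) == &page);
        printf("flag bit: %lu, pointer round-trips: %s\n",
               (unsigned long)demo_flags(enc),
               demo_ptr(enc) == &page ? "yes" : "no");
        return 0;
}

Using an opaque struct type for the encoded value (rather than passing a bare unsigned long around) means the compiler rejects any attempt to use a tagged pointer as a plain page pointer without going through the decode helper, which is what the patch's batch->encoded_pages[] array relies on.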