From 86d1a3807c013abca72086278d9308e398e7b41d Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sat, 29 Oct 2022 11:45:07 -0700
Subject: [PATCH 2/2] mm: make sure to flush TLB before marking page dirty

When we remove a page table entry, we are very careful to only free the
page after we have flushed the TLB, because other CPUs could still be
using the page through stale TLB entries until after the flush.

However, we mark the underlying page dirty immediately, and then remove
the rmap entry for the page, which means that

 (a) another CPU could come in and clean it, never seeing our mapping
     of the page

 (b) yet another CPU could continue to use the stale and dirty TLB
     entry and continue to write to said page

resulting in a page that has been dirtied, but then marked clean again,
all while another CPU might have dirtied it some more.

End result: possibly lost dirty data.

This commit uses the same TLB gather array that we already use to delay
the freeing of the page to also keep the dirty state of the page table
entry, so that the 'set_page_dirty()' call can be done after the TLB
flush, closing the race.

Reported-by: Nadav Amit
Signed-off-by: Linus Torvalds
---
 include/asm-generic/tlb.h | 28 +++++++++++++++++++++++-----
 mm/memory.c               | 10 +++++-----
 mm/mmu_gather.c           | 36 ++++++++++++++++++++++++++++++++----
 3 files changed, 60 insertions(+), 14 deletions(-)
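As background for reviewers (this note sits in the region that 'git am'
ignores and is not part of the patch): the encode_page(),
encoded_page_ptr() and encoded_page_dirty() helpers added below are plain
low-bit pointer tagging: a 'struct page *' is always at least word-aligned,
so bit 0 is free to carry the dirty flag. The following stand-alone
user-space sketch illustrates the same trick; the dummy 'struct page' and
the use of uintptr_t instead of unsigned long are purely for the example.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct page { int dummy; };	/* stand-in for the kernel's struct page */
struct encoded_page;		/* opaque: only ever used as a tagged pointer */

static inline struct encoded_page *encode_page(struct page *page, bool dirty)
{
	/* the pointer is at least 2-byte aligned, so bit 0 is free */
	return (struct encoded_page *)(dirty | (uintptr_t)page);
}

static inline bool encoded_page_dirty(struct encoded_page *page)
{
	return 1 & (uintptr_t)page;
}

static inline struct page *encoded_page_ptr(struct encoded_page *page)
{
	return (struct page *)(~(uintptr_t)1 & (uintptr_t)page);
}

int main(void)
{
	struct page p;
	struct encoded_page *e;

	assert(((uintptr_t)&p & 1) == 0);	/* alignment assumption */

	e = encode_page(&p, true);
	printf("dirty bit: %d, pointer round-trips: %s\n",
	       (int)encoded_page_dirty(e),
	       encoded_page_ptr(e) == &p ? "yes" : "no");
	return 0;
}

In the patch itself these encoded pointers go straight into the existing
mmu_gather batch array, so tracking which pages still need
set_page_dirty() after the TLB flush needs no extra storage.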
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 492dce43236e..a95085f6dd47 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -238,11 +238,29 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
  */
 #define MMU_GATHER_BUNDLE	8
 
+/* Fake type for an encoded page pointer with the dirty bit in the low bit */
+struct encoded_page;
+
+static inline struct encoded_page *encode_page(struct page *page, bool dirty)
+{
+	return (struct encoded_page *)(dirty | (unsigned long)page);
+}
+
+static inline bool encoded_page_dirty(struct encoded_page *page)
+{
+	return 1 & (unsigned long)page;
+}
+
+static inline struct page *encoded_page_ptr(struct encoded_page *page)
+{
+	return (struct page *)(~1ul & (unsigned long)page);
+}
+
 struct mmu_gather_batch {
 	struct mmu_gather_batch	*next;
 	unsigned int		nr;
 	unsigned int		max;
-	struct page		*pages[];
+	struct encoded_page	*encoded_pages[];
 };
 
 #define MAX_GATHER_BATCH	\
@@ -257,7 +275,7 @@ struct mmu_gather_batch {
 #define MAX_GATHER_BATCH_COUNT	(10000UL/MAX_GATHER_BATCH)
 
 extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page,
-				   int page_size);
+				   int page_size, bool dirty);
 #endif
 
 /*
@@ -431,13 +449,13 @@ static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
 static inline void tlb_remove_page_size(struct mmu_gather *tlb,
 					struct page *page, int page_size)
 {
-	if (__tlb_remove_page_size(tlb, page, page_size))
+	if (__tlb_remove_page_size(tlb, page, page_size, false))
 		tlb_flush_mmu(tlb);
 }
 
-static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page)
+static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page, bool dirty)
 {
-	return __tlb_remove_page_size(tlb, page, PAGE_SIZE);
+	return __tlb_remove_page_size(tlb, page, PAGE_SIZE, dirty);
 }
 
 /* tlb_remove_page
diff --git a/mm/memory.c b/mm/memory.c
index d52f5a68c561..8ab4c0d7e99e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1432,6 +1432,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		if (pte_present(ptent)) {
 			struct page *page = vm_normal_page(vma, addr, ptent);
+			int dirty;
+
 			if (unlikely(!should_zap_page(details, page)))
 				continue;
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
@@ -1442,11 +1444,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			if (unlikely(!page))
 				continue;
 
+			dirty = 0;
 			if (!PageAnon(page)) {
-				if (pte_dirty(ptent)) {
-					force_flush = 1;
-					set_page_dirty(page);
-				}
+				dirty = pte_dirty(ptent);
 				if (pte_young(ptent) &&
 				    likely(!(vma->vm_flags & VM_SEQ_READ)))
 					mark_page_accessed(page);
@@ -1455,7 +1455,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			page_remove_rmap(page, vma, false);
 			if (unlikely(page_mapcount(page) < 0))
 				print_bad_pte(vma, addr, ptent, page);
-			if (unlikely(__tlb_remove_page(tlb, page))) {
+			if (unlikely(__tlb_remove_page(tlb, page, dirty))) {
 				force_flush = 1;
 				addr += PAGE_SIZE;
 				break;
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index add4244e5790..fa79e054413a 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -43,12 +43,40 @@ static bool tlb_next_batch(struct mmu_gather *tlb)
 	return true;
 }
 
+/*
+ * We get an 'encoded page' array, which has page pointers with
+ * the dirty bit in the low bit of the array.
+ *
+ * The TLB has been flushed, now we need to move the dirty bit into
+ * the 'struct page', clean the array in-place, and then free the
+ * pages and their swap cache.
+ */
+static void clean_and_free_pages_and_swap_cache(struct encoded_page **pages, unsigned int nr)
+{
+	for (unsigned int i = 0; i < nr; i++) {
+		struct encoded_page *encoded = pages[i];
+		if (encoded_page_dirty(encoded)) {
+			struct page *page = encoded_page_ptr(encoded);
+			/* Clean the dirty pointer in-place */
+			pages[i] = encode_page(page, 0);
+			set_page_dirty(page);
+		}
+	}
+
+	/*
+	 * Now all entries have been un-encoded, and changed to plain
+	 * page pointers, so we can cast the 'encoded_page' array to
+	 * a plain page array and free them
+	 */
+	free_pages_and_swap_cache((struct page **)pages, nr);
+}
+
 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
 {
 	struct mmu_gather_batch *batch;
 
 	for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
-		struct page **pages = batch->pages;
+		struct encoded_page **pages = batch->encoded_pages;
 
 		do {
 			/*
@@ -56,7 +84,7 @@ static void tlb_batch_pages_flush(struct mmu_gather *tlb)
 			 */
 			unsigned int nr = min(512U, batch->nr);
 
-			free_pages_and_swap_cache(pages, nr);
+			clean_and_free_pages_and_swap_cache(pages, nr);
 			pages += nr;
 			batch->nr -= nr;
 
@@ -77,7 +105,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb)
 	tlb->local.next = NULL;
 }
 
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size, bool dirty)
 {
 	struct mmu_gather_batch *batch;
 
@@ -92,7 +120,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 	 * Add the page and check if we are full. If so
 	 * force a flush.
 	 */
-	batch->pages[batch->nr++] = page;
+	batch->encoded_pages[batch->nr++] = encode_page(page, dirty);
 	if (batch->nr == batch->max) {
 		if (!tlb_next_batch(tlb))
 			return true;
-- 
2.37.1.289.g45aa1e5c72.dirty