[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240717200709.1552558-15-Liam.Howlett@oracle.com>
Date: Wed, 17 Jul 2024 16:07:02 -0400
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>
Cc: Suren Baghdasaryan <surenb@...gle.com>, Vlastimil Babka <vbabka@...e.cz>,
Lorenzo Stoakes <lstoakes@...il.com>,
Matthew Wilcox <willy@...radead.org>, sidhartha.kumar@...cle.com,
"Paul E . McKenney" <paulmck@...nel.org>,
Bert Karwatzki <spasswolf@....de>, Jiri Olsa <olsajiri@...il.com>,
linux-kernel@...r.kernel.org, Kees Cook <kees@...nel.org>,
Jeff Xu <jeffxu@...omium.org>,
"Liam R. Howlett" <Liam.Howlett@...cle.com>
Subject: [PATCH v5 14/21] mm/mmap: Avoid zeroing vma tree in mmap_region()
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
Instead of zeroing the vma tree and then overwriting the area, let the
area be overwritten and then clean up the gathered vmas using
vms_complete_munmap_vmas().
If a driver is mapping over an existing vma, then clear the ptes before
the call_mmap() invocation. If the vma has a vm_ops->close(), then call
the close() function. This is done using the vms_clear_ptes() and
vms_close_vmas() helpers. This has the side effect of needing to call
open() on the vmas if the mmap_region() fails later on.
Temporarily keep track of the number of pages that will be removed and
reduce the charged amount.
This commit drops the validate_mm() call in the vma_expand() function.
It is necessary to drop the validate as it would fail since the mm
map_count would be incorrect during a vma expansion, prior to the
cleanup from vms_complete_munmap_vmas().
Clean up the error handling of vms_gather_munmap_vmas() by calling
the verification within the function.
Note that before this change, a MAP_FIXED could fail and leave a gap in
the vma tree. With this change, a MAP_FIXED failure will fail without
creating a gap, leave *a* vma in the area (may have been split), and
attempt to restore the vmas to an operational state (re-attached and
vm_ops->open()'ed if they were vm_ops->close()'d).
Signed-off-by: Liam R. Howlett <Liam.Howlett@...cle.com>
---
mm/internal.h | 2 +
mm/mmap.c | 119 +++++++++++++++++++++++++++++++-------------------
2 files changed, 76 insertions(+), 45 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index ec8441362c28..5bd60cb9fcbb 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1503,6 +1503,8 @@ struct vma_munmap_struct {
unsigned long stack_vm;
unsigned long data_vm;
bool unlock; /* Unlock after the munmap */
+ bool clear_ptes; /* If there are outstanding PTE to be cleared */
+ bool closed; /* vma->vm_ops->close() called already */
};
void __meminit __init_single_page(struct page *page, unsigned long pfn,
diff --git a/mm/mmap.c b/mm/mmap.c
index 20da0d039c95..0b7aa2c46cec 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -170,10 +170,11 @@ void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
/*
* Close a vm structure and free it.
*/
-static void remove_vma(struct vm_area_struct *vma, bool unreachable)
+static
+void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
{
might_sleep();
- if (vma->vm_ops && vma->vm_ops->close)
+ if (!closed && vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (vma->vm_file)
fput(vma->vm_file);
@@ -401,17 +402,21 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
}
static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
+ unsigned long addr, unsigned long end,
+ unsigned long *nr_accounted)
{
VMA_ITERATOR(vmi, mm, addr);
struct vm_area_struct *vma;
unsigned long nr_pages = 0;
+ *nr_accounted = 0;
for_each_vma_range(vmi, vma, end) {
unsigned long vm_start = max(addr, vma->vm_start);
unsigned long vm_end = min(end, vma->vm_end);
nr_pages += PHYS_PFN(vm_end - vm_start);
+ if (vma->vm_flags & VM_ACCOUNT)
+ *nr_accounted += PHYS_PFN(vm_end - vm_start);
}
return nr_pages;
@@ -527,6 +532,8 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
vms->unmap_start = FIRST_USER_ADDRESS;
vms->unmap_end = USER_PGTABLES_CEILING;
+ vms->clear_ptes = false; /* No PTEs to clear yet */
+ vms->closed = false;
}
/*
@@ -735,7 +742,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
vma_iter_store(vmi, vma);
vma_complete(&vp, vmi, vma->vm_mm);
- validate_mm(vma->vm_mm);
return 0;
nomem:
@@ -2597,23 +2603,31 @@ struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
*
* Reattach any detached vmas and free up the maple tree used to track the vmas.
*/
-static inline void abort_munmap_vmas(struct ma_state *mas_detach)
+static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed)
{
struct vm_area_struct *vma;
mas_set(mas_detach, 0);
- mas_for_each(mas_detach, vma, ULONG_MAX)
+ mas_for_each(mas_detach, vma, ULONG_MAX) {
+ if (closed && vma->vm_ops && vma->vm_ops->close &&
+ vma->vm_ops->open)
+ vma->vm_ops->open(vma);
+
vma_mark_detached(vma, false);
+ }
__mt_destroy(mas_detach->tree);
}
-static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
+static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
struct ma_state *mas_detach, bool mm_wr_locked)
{
struct mmu_gather tlb;
+ if (!vms->clear_ptes)
+ return;
+
/*
* We can free page tables without write-locking mmap_lock because VMAs
* were isolated before we downgraded mmap_lock.
@@ -2629,6 +2643,23 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
vms->unmap_end, mm_wr_locked);
tlb_finish_mmu(&tlb);
+ vms->clear_ptes = false;
+}
+
+static inline void
+vms_close_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach)
+{
+ struct vm_area_struct *vma;
+
+ if (!vms->vma_count)
+ return;
+
+ mas_set(mas_detach, 0);
+ mas_for_each(mas_detach, vma, ULONG_MAX)
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+
+ vms->closed = true;
}
/*
@@ -2652,7 +2683,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
if (vms->unlock)
mmap_write_downgrade(mm);
- vms_complete_pte_clear(vms, mas_detach, !vms->unlock);
+ vms_clear_ptes(vms, mas_detach, !vms->unlock);
/* Update high watermark before we lower total_vm */
update_hiwater_vm(mm);
/* Stat accounting */
@@ -2663,7 +2694,7 @@ static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
/* Remove and clean up vmas */
mas_set(mas_detach, 0);
mas_for_each(mas_detach, vma, ULONG_MAX)
- remove_vma(vma, false);
+ remove_vma(vma, /* unreachable = */ false, vms->closed);
vm_unacct_memory(vms->nr_accounted);
validate_mm(mm);
@@ -2804,14 +2835,18 @@ static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
while (vma_iter_addr(vms->vmi) > vms->start)
vma_iter_prev_range(vms->vmi);
+ /* There are now PTEs that need to be cleared */
+ vms->clear_ptes = true;
+
return 0;
userfaultfd_error:
munmap_gather_failed:
end_split_failed:
- abort_munmap_vmas(mas_detach);
+ abort_munmap_vmas(mas_detach, /* closed = */ false);
start_split_failed:
map_count_exceeded:
+ validate_mm(vms->mm);
return error;
}
@@ -2855,9 +2890,9 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
return 0;
clear_tree_failed:
- abort_munmap_vmas(&mas_detach);
-gather_failed:
+ abort_munmap_vmas(&mas_detach, /* closed = */ false);
validate_mm(mm);
+gather_failed:
return error;
}
@@ -2945,24 +2980,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long merge_start = addr, merge_end = end;
bool writable_file_mapping = false;
pgoff_t vm_pgoff;
- int error;
+ int error = -ENOMEM;
VMA_ITERATOR(vmi, mm, addr);
+ unsigned long nr_pages, nr_accounted;
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, end);
-
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
- }
+ nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
+ /*
+ * Check against address space limit.
+ * MAP_FIXED may remove pages of mappings that intersects with requested
+ * mapping. Account for the pages it would unmap.
+ */
+ if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
+ return -ENOMEM;
if (unlikely(!can_modify_mm(mm, addr, end)))
return -EPERM;
@@ -2979,14 +3009,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
/* Prepare to unmap any existing mapping in the area */
if (vms_gather_munmap_vmas(&vms, &mas_detach))
- goto gather_failed;
-
- /* Remove any existing mappings from the vma tree */
- if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
- goto clear_tree_failed;
+ return -ENOMEM;
- /* Unmap any existing mapping in the area */
- vms_complete_munmap_vmas(&vms, &mas_detach);
next = vms.next;
prev = vms.prev;
vma = NULL;
@@ -3002,8 +3026,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*/
if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
+ charged -= nr_accounted;
if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
+ goto abort_munmap;
+ vms.nr_accounted = 0;
vm_flags |= VM_ACCOUNT;
}
@@ -3052,10 +3078,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* not unmapped, but the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
+ if (!vma)
goto unacct_error;
- }
vma_iter_config(&vmi, addr, end);
vma_set_range(vma, addr, end, pgoff);
@@ -3064,6 +3088,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (file) {
vma->vm_file = get_file(file);
+ /* call_mmap() may map PTE, so ensure there are no existing PTEs */
+ vms_clear_ptes(&vms, &mas_detach, /* mm_wr_locked = */ true);
+ vms_close_vmas(&vms, &mas_detach);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
@@ -3154,6 +3181,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
expanded:
perf_event_mmap(vma);
+ /* Unmap any existing mapping in the area */
+ if (vms.nr_pages)
+ vms_complete_munmap_vmas(&vms, &mas_detach);
+
vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
@@ -3201,14 +3232,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
unacct_error:
if (charged)
vm_unacct_memory(charged);
- validate_mm(mm);
- return error;
-clear_tree_failed:
- abort_munmap_vmas(&mas_detach);
-gather_failed:
+abort_munmap:
+ if (vms.nr_pages)
+ abort_munmap_vmas(&mas_detach, vms.closed);
validate_mm(mm);
- return -ENOMEM;
+ return error;
}
static int __vm_munmap(unsigned long start, size_t len, bool unlock)
@@ -3549,7 +3578,7 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, true);
+ remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
count++;
cond_resched();
vma = vma_next(&vmi);
--
2.43.0
Powered by blists - more mailing lists