Message-ID: <2iq3asat5h5krrwf5vm5ewhmbx3f6xyvmbpcgqfugjmsyxmdqb@twqoaancgzqv>
Date: Wed, 23 Oct 2024 11:02:54 -0400
From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
To: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
Vlastimil Babka <vbabka@...e.cz>, Jann Horn <jannh@...gle.com>,
linux-kernel@...r.kernel.org, linux-mm@...ck.org,
Linus Torvalds <torvalds@...ux-foundation.org>,
Peter Xu <peterx@...hat.com>
Subject: Re: [PATCH hotfix 6.12 4/8] mm: resolve faulty mmap_region() error
path behaviour
* Lorenzo Stoakes <lorenzo.stoakes@...cle.com> [241022 16:41]:
> The mmap_region() function is somewhat terrifying, with spaghetti-like
> control flow and numerous means by which issues can arise, and by which
> incomplete state, memory leaks and other unpleasantness can occur.
>
> A large amount of the complexity arises from trying to handle errors late
> in the process of mapping a VMA, which forms the basis of recently observed
> issues with resource leaks and observable inconsistent state.
>
> Taking advantage of previous patches in this series we move a number of
> checks earlier in the code, simplifying things by moving the core of the
> logic into a static internal function __mmap_region().
>
> Doing this allows us to perform a number of checks up front before we do
> any real work, to unwind the writable mapping count unconditionally as
> required, and to perform CONFIG_DEBUG_VM_MAPLE_TREE validation
> unconditionally as well.
>
> We move a number of things here:
>
> 1. We preallocate memory for the iterator before we call the file-backed
> memory hook, allowing us to exit early and avoid having to perform
> complicated and error-prone close/free logic. We carefully free
> iterator state on both success and error paths.
>
> 2. The enclosing mmap_region() function handles the mapping_map_writable()
> logic early. Previously the logic had the mapping_map_writable() at the
> point of mapping a newly allocated file-backed VMA, and a matching
> mapping_unmap_writable() on success and error paths.
>
> We now do this unconditionally if this is a file-backed, shared writable
> mapping. Should a driver change the flags to eliminate VM_MAYWRITE,
> doing so does not invalidate the seal check we have just performed, and
> in any case we always decrement the counter in the wrapper.
>
> We perform a debug assert to ensure a driver does not attempt to do the
> opposite (a hypothetical example follows this list).
>
> 3. We also move arch_validate_flags() up into the mmap_region()
> function. This is only relevant on arm64 and sparc64, and the check is
> only meaningful for SPARC with ADI enabled. We explicitly add a warning
> for this arch if a driver invalidates this check, though the code ought
> eventually to be fixed to eliminate the need for this.
>
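> For instance, a hypothetical driver mmap hook like the following -
> invented purely for illustration, not taken from any real driver - is
> exactly what the assert in (2) would catch:
>
> 	static int bad_driver_mmap(struct file *file, struct vm_area_struct *vma)
> 	{
> 		/* Adds writability the caller did not grant - not permitted. */
> 		vm_flags_set(vma, VM_MAYWRITE);
> 		return 0;
> 	}
>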
> With all of these measures in place, we no longer need to explicitly close
> the VMA on error paths, as we place all checks which might fail prior to a
> call to any driver mmap hook.
>
> This eliminates an entire class of errors, and makes the code both
> easier to reason about and more robust.
>
> Reported-by: Jann Horn <jannh@...gle.com>
> Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails")
> Cc: stable <stable@...nel.org>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@...cle.com>
> ---
> mm/mmap.c | 120 ++++++++++++++++++++++++++++++------------------------
> 1 file changed, 66 insertions(+), 54 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 66edf0ebba94..7d02b47a1895 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1361,20 +1361,18 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
> return do_vmi_munmap(&vmi, mm, start, len, uf, false);
> }
>
> -unsigned long mmap_region(struct file *file, unsigned long addr,
> +static unsigned long __mmap_region(struct file *file, unsigned long addr,
> unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> struct list_head *uf)
> {
> struct mm_struct *mm = current->mm;
> struct vm_area_struct *vma = NULL;
> pgoff_t pglen = PHYS_PFN(len);
> - struct vm_area_struct *merge;
> unsigned long charged = 0;
> struct vma_munmap_struct vms;
> struct ma_state mas_detach;
> struct maple_tree mt_detach;
> unsigned long end = addr + len;
> - bool writable_file_mapping = false;
> int error;
> VMA_ITERATOR(vmi, mm, addr);
> VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
> @@ -1448,28 +1446,26 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> vm_flags_init(vma, vm_flags);
> vma->vm_page_prot = vm_get_page_prot(vm_flags);
>
> + if (vma_iter_prealloc(&vmi, vma)) {
> + error = -ENOMEM;
> + goto free_vma;
> + }
> +
> if (file) {
> vma->vm_file = get_file(file);
> error = mmap_file(file, vma);
> if (error)
> - goto unmap_and_free_vma;
> -
> - if (vma_is_shared_maywrite(vma)) {
> - error = mapping_map_writable(file->f_mapping);
> - if (error)
> - goto close_and_free_vma;
> -
> - writable_file_mapping = true;
> - }
> + goto unmap_and_free_file_vma;
>
> + /* Drivers cannot alter the address of the VMA. */
> + WARN_ON_ONCE(addr != vma->vm_start);
> /*
> - * Expansion is handled above, merging is handled below.
> - * Drivers should not alter the address of the VMA.
> + * Drivers should not permit writability when previously it was
> + * disallowed.
> */
> - if (WARN_ON((addr != vma->vm_start))) {
> - error = -EINVAL;
> - goto close_and_free_vma;
> - }
> + VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
> + !(vm_flags & VM_MAYWRITE) &&
> + (vma->vm_flags & VM_MAYWRITE));
>
> vma_iter_config(&vmi, addr, end);
> /*
> @@ -1477,6 +1473,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> * vma again as we may succeed this time.
> */
> if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> + struct vm_area_struct *merge;
> +
> vmg.flags = vma->vm_flags;
> /* If this fails, state is reset ready for a reattempt. */
> merge = vma_merge_new_range(&vmg);
> @@ -1491,10 +1489,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> */
> fput(vma->vm_file);
> vm_area_free(vma);
> + vma_iter_free(&vmi);
> vma = merge;
Nit: Might be worth moving the vma_iter_free() down (if not removed) so
that the vma pointer remains sane. Just in case more stuff gets stuffed
between the two calls.
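i.e. something like (untested):

	fput(vma->vm_file);
	vm_area_free(vma);
	vma = merge;
	vma_iter_free(&vmi);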
> /* Update vm_flags to pick up the change. */
> vm_flags = vma->vm_flags;
> - goto unmap_writable;
> + goto file_expanded;
> }
> vma_iter_config(&vmi, addr, end);
> }
> @@ -1503,26 +1502,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> } else if (vm_flags & VM_SHARED) {
> error = shmem_zero_setup(vma);
> if (error)
> - goto free_vma;
> + goto free_iter_vma;
> } else {
> vma_set_anonymous(vma);
> }
>
> - if (map_deny_write_exec(vma->vm_flags, vma->vm_flags)) {
> - error = -EACCES;
> - goto close_and_free_vma;
> - }
> -
> - /* Allow architectures to sanity-check the vm_flags */
> - if (!arch_validate_flags(vma->vm_flags)) {
> - error = -EINVAL;
> - goto close_and_free_vma;
> - }
> -
> - if (vma_iter_prealloc(&vmi, vma)) {
> - error = -ENOMEM;
> - goto close_and_free_vma;
> - }
> +#ifdef CONFIG_SPARC64
> + /* TODO: Fix SPARC ADI! */
> + WARN_ON_ONCE(!arch_validate_flags(vm_flags));
> +#endif
>
> /* Lock the VMA since it is modified after insertion into VMA tree */
> vma_start_write(vma);
> @@ -1536,10 +1524,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> */
> khugepaged_enter_vma(vma, vma->vm_flags);
>
> - /* Once vma denies write, undo our temporary denial count */
> -unmap_writable:
> - if (writable_file_mapping)
> - mapping_unmap_writable(file->f_mapping);
> +file_expanded:
> file = vma->vm_file;
> ksm_add_vma(vma);
> expanded:
> @@ -1572,23 +1557,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
> vma_set_page_prot(vma);
>
> - validate_mm(mm);
> return addr;
>
> -close_and_free_vma:
> - vma_close(vma);
> -
> - if (file || vma->vm_file) {
> -unmap_and_free_vma:
> - fput(vma->vm_file);
> - vma->vm_file = NULL;
> +unmap_and_free_file_vma:
> + fput(vma->vm_file);
> + vma->vm_file = NULL;
>
> - vma_iter_set(&vmi, vma->vm_end);
> - /* Undo any partial mapping done by a device driver. */
> - unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
> - }
> - if (writable_file_mapping)
> - mapping_unmap_writable(file->f_mapping);
> + vma_iter_set(&vmi, vma->vm_end);
> + /* Undo any partial mapping done by a device driver. */
> + unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
> +free_iter_vma:
> + vma_iter_free(&vmi);
> free_vma:
> vm_area_free(vma);
> unacct_error:
> @@ -1598,10 +1577,43 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
> abort_munmap:
> vms_abort_munmap_vmas(&vms, &mas_detach);
> gather_failed:
> - validate_mm(mm);
> return error;
> }
>
> +unsigned long mmap_region(struct file *file, unsigned long addr,
> + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
> + struct list_head *uf)
> +{
> + unsigned long ret;
> + bool writable_file_mapping = false;
> +
> + /* Allow architectures to sanity-check the vm_flags. */
> + if (!arch_validate_flags(vm_flags))
> + return -EINVAL;
> +
> + /* Check to see if MDWE is applicable. */
> + if (map_deny_write_exec(vm_flags, vm_flags))
> + return -EACCES;
> +
> + /* Map writable and ensure this isn't a sealed memfd. */
> + if (file && is_shared_maywrite(vm_flags)) {
> + int error = mapping_map_writable(file->f_mapping);
> +
> + if (error)
> + return error;
> + writable_file_mapping = true;
> + }
> +
> + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf);
> +
> + /* Clear our write mapping regardless of error. */
> + if (writable_file_mapping)
> + mapping_unmap_writable(file->f_mapping);
> +
> + validate_mm(current->mm);
> + return ret;
> +}
> +
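For anyone wondering why the unconditional decrement above is safe
regardless of what the driver later does to the VMA's flags: the pair is
just an atomic counter on the address_space, roughly (paraphrasing
include/linux/fs.h from memory):

	static inline int mapping_map_writable(struct address_space *mapping)
	{
		return atomic_inc_unless_negative(&mapping->i_mmap_writable) ?
			0 : -EPERM;
	}

	static inline void mapping_unmap_writable(struct address_space *mapping)
	{
		atomic_dec(&mapping->i_mmap_writable);
	}

So the -EPERM case (counter held negative) is the sealed-memfd rejection
the comment above refers to, and a matched inc/dec always balances.
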
> static int __vm_munmap(unsigned long start, size_t len, bool unlock)
> {
> int ret;
> --
> 2.47.0