[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <990f7ab6-ba06-4564-8d4c-baa5a3f3d8b0@lucifer.local>
Date: Fri, 25 Oct 2024 11:19:29 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: Vlastimil Babka <vbabka@...e.cz>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
"Liam R . Howlett" <Liam.Howlett@...cle.com>,
Jann Horn <jannh@...gle.com>, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, Linus Torvalds <torvalds@...ux-foundation.org>,
Peter Xu <peterx@...hat.com>
Subject: Re: [PATCH v2 7/8] mm: refactor __mmap_region()
On Fri, Oct 25, 2024 at 10:35:20AM +0200, Vlastimil Babka wrote:
> On 10/23/24 22:38, Lorenzo Stoakes wrote:
> > We have seen bugs and resource leaks arise from the complexity of the
> > __mmap_region() function. This, and the generally deeply fragile error
> > handling logic and complexity which makes understanding the function
> > difficult make it highly desirable to refactor it into something readable.
> >
> > Achieve this by separating the function into smaller logical parts which
> > are easier to understand and follow, and which importantly very
> > significantly simplify the error handling.
> >
> > Note that we now call vms_abort_munmap_vmas() in more error paths than we
> > used to, however in cases where no abort need occur, vms->nr_pages will be
> > equal to zero and we simply exit this function without doing more than we
> > would have done previously.
> >
> > Importantly, the invocation of the driver mmap hook via mmap_file() now has
> > very simple and obvious handling (this was previously the most problematic
> > part of the mmap() operation).
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
>
> Reviewed-by: Vlastimil Babka <vbabka@...e.cz>
>
> Some nits:
>
> > +static int __mmap_prepare(struct mmap_state *map, struct vma_merge_struct *vmg,
> > + struct list_head *uf)
> > {
> > - struct mm_struct *mm = current->mm;
> > - struct vm_area_struct *vma = NULL;
> > - pgoff_t pglen = PHYS_PFN(len);
> > - unsigned long charged = 0;
> > - struct vma_munmap_struct vms;
> > - struct ma_state mas_detach;
> > - struct maple_tree mt_detach;
> > - unsigned long end = addr + len;
> > int error;
> > - VMA_ITERATOR(vmi, mm, addr);
> > - VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
> > -
> > - vmg.file = file;
> > - /* Find the first overlapping VMA */
> > - vma = vma_find(&vmi, end);
> > - init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false);
> > - if (vma) {
> > - mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
> > - mt_on_stack(mt_detach);
> > - mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
> > + struct vma_iterator *vmi = map->vmi;
> > + struct vma_munmap_struct *vms = &map->vms;
> > +
> > + /* Find the first overlapping VMA and initialise unmap state. */
> > + vms->vma = vma_find(vmi, map->end);
> > + init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf,
> > + /* unlock = */ false);
> > +
> > + /* OK, we have overlapping VMAs - prepare to unmap them. */
> > + if (vms->vma) {
> > + mt_init_flags(&map->mt_detach,
> > + vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
> > + mt_on_stack(map->mt_detach);
> > + mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0);
> > /* Prepare to unmap any existing mapping in the area */
> > - error = vms_gather_munmap_vmas(&vms, &mas_detach);
> > - if (error)
> > - goto gather_failed;
> > + error = vms_gather_munmap_vmas(vms, &map->mas_detach);
> > + if (error) {
> > + /* On error VMAs will already have been reattached. */
> > + vms->nr_pages = 0;
> > + return error;
> > + }
> >
> > - vmg.next = vms.next;
> > - vmg.prev = vms.prev;
> > - vma = NULL;
> > + map->next = vms->next;
> > + map->prev = vms->prev;
> > } else {
> > - vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev);
> > + map->next = vma_iter_next_rewind(vmi, &map->prev);
> > }
> >
> > + /* Set up vmg for merge attempt. */
> > + vmg->file = map->file;
> > + vmg->prev = map->prev;
> > + vmg->next = map->next;
>
> It seems arbitrary to do this here. vmg->flags are set in __mmap_region().
> Why not all of this? We wouldn't need to pass vmg to __mmap_prepare() at
> all? Do any of the values chenge between here and returning and vmg needs to
> capture them as they are now? AFAICS not.
Yeah I sort of felt we are 'preparing' things.
This _was_ necessary before when we were ickily leaning on the vmg to store
state and not mutate it which is not ideal.
So yeah I think probably it's better to just put _all_ the merge stuff in
__mmap_region() rather than just some.
This aligns with the need to strip out the 'reset' logic from
vma_merge_new_range().
>
> > +
> > /* Check against address space limit. */
> > - if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) {
> > - error = -ENOMEM;
> > - goto abort_munmap;
> > - }
> > + if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages))
> > + return -ENOMEM;
> >
> > - /*
> > - * Private writable mapping: check memory availability
> > - */
> > - if (accountable_mapping(file, vm_flags)) {
> > - charged = pglen;
> > - charged -= vms.nr_accounted;
> > - if (charged) {
> > - error = security_vm_enough_memory_mm(mm, charged);
> > + /* Private writable mapping: check memory availability. */
> > + if (accountable_mapping(map->file, map->flags)) {
> > + map->charged = map->pglen;
> > + map->charged -= vms->nr_accounted;
> > + if (map->charged) {
> > + error = security_vm_enough_memory_mm(map->mm, map->charged);
> > if (error)
> > - goto abort_munmap;
> > + return error;
> > }
> >
> > - vms.nr_accounted = 0;
> > - vm_flags |= VM_ACCOUNT;
> > - vmg.flags = vm_flags;
> > + vms->nr_accounted = 0;
> > + map->flags |= VM_ACCOUNT;
> > }
> >
> > /*
> > - * clear PTEs while the vma is still in the tree so that rmap
> > + * Clear PTEs while the vma is still in the tree so that rmap
> > * cannot race with the freeing later in the truncate scenario.
> > * This is also needed for mmap_file(), which is why vm_ops
> > * close function is called.
> > */
> > - vms_clean_up_area(&vms, &mas_detach);
> > - vma = vma_merge_new_range(&vmg);
> > - if (vma)
> > - goto expanded;
> > + vms_clean_up_area(vms, &map->mas_detach);
> > +
> > + return 0;
> > +}
> > +
>
> <snip>
>
> > +static int __mmap_new_vma(struct mmap_state *map, struct vma_merge_struct *vmg,
> > + struct vm_area_struct **vmap)
> > +{
> > + struct vma_iterator *vmi = map->vmi;
> > + int error = 0;
> > + bool merged = false;
> > + struct vm_area_struct *vma;
> > +
> > /*
> > * Determine the object being mapped and call the appropriate
> > * specific mapper. the address has already been validated, but
> > * not unmapped, but the maps are removed from the list.
> > */
> > - vma = vm_area_alloc(mm);
> > - if (!vma) {
> > - error = -ENOMEM;
> > - goto unacct_error;
> > - }
> > + vma = vm_area_alloc(map->mm);
> > + if (!vma)
> > + return -ENOMEM;
> >
> > - vma_iter_config(&vmi, addr, end);
> > - vma_set_range(vma, addr, end, pgoff);
> > - vm_flags_init(vma, vm_flags);
> > - vma->vm_page_prot = vm_get_page_prot(vm_flags);
> > + vma_iter_config(vmi, map->addr, map->end);
> > + vma_set_range(vma, map->addr, map->end, map->pgoff);
> > + vm_flags_init(vma, map->flags);
> > + vma->vm_page_prot = vm_get_page_prot(map->flags);
> >
> > - if (vma_iter_prealloc(&vmi, vma)) {
> > + if (vma_iter_prealloc(vmi, vma)) {
> > error = -ENOMEM;
> > goto free_vma;
> > }
> >
> > - if (file) {
> > - vma->vm_file = get_file(file);
> > - error = mmap_file(file, vma);
> > - if (error)
> > - goto unmap_and_free_file_vma;
> > -
> > - /* Drivers cannot alter the address of the VMA. */
> > - WARN_ON_ONCE(addr != vma->vm_start);
> > - /*
> > - * Drivers should not permit writability when previously it was
> > - * disallowed.
> > - */
> > - VM_WARN_ON_ONCE(vm_flags != vma->vm_flags &&
> > - !(vm_flags & VM_MAYWRITE) &&
> > - (vma->vm_flags & VM_MAYWRITE));
> > -
> > - vma_iter_config(&vmi, addr, end);
> > - /*
> > - * If vm_flags changed after mmap_file(), we should try merge
> > - * vma again as we may succeed this time.
> > - */
> > - if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
> > - struct vm_area_struct *merge;
> > -
> > - vmg.flags = vma->vm_flags;
> > - /* If this fails, state is reset ready for a reattempt. */
> > - merge = vma_merge_new_range(&vmg);
> > -
> > - if (merge) {
> > - /*
> > - * ->mmap() can change vma->vm_file and fput
> > - * the original file. So fput the vma->vm_file
> > - * here or we would add an extra fput for file
> > - * and cause general protection fault
> > - * ultimately.
> > - */
> > - fput(vma->vm_file);
> > - vm_area_free(vma);
> > - vma = merge;
> > - /* Update vm_flags to pick up the change. */
> > - vm_flags = vma->vm_flags;
> > - goto file_expanded;
> > - }
> > - vma_iter_config(&vmi, addr, end);
> > - }
> > -
> > - vm_flags = vma->vm_flags;
> > - } else if (vm_flags & VM_SHARED) {
> > + if (map->file)
> > + error = __mmap_new_file_vma(map, vmg, &vma, &merged);
> > + else if (map->flags & VM_SHARED)
> > error = shmem_zero_setup(vma);
> > - if (error)
> > - goto free_iter_vma;
> > - } else {
> > + else
> > vma_set_anonymous(vma);
> > - }
> > +
> > + if (error)
> > + goto free_iter_vma;
> > +
> > + if (merged)
> > + goto file_expanded;
> >
> > #ifdef CONFIG_SPARC64
> > /* TODO: Fix SPARC ADI! */
> > - WARN_ON_ONCE(!arch_validate_flags(vm_flags));
> > + WARN_ON_ONCE(!arch_validate_flags(map->flags));
> > #endif
> >
> > /* Lock the VMA since it is modified after insertion into VMA tree */
> > vma_start_write(vma);
> > - vma_iter_store(&vmi, vma);
> > - mm->map_count++;
> > + vma_iter_store(vmi, vma);
> > + map->mm->map_count++;
> > vma_link_file(vma);
> >
> > /*
> > * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
> > * call covers the non-merge case.
> > */
> > - khugepaged_enter_vma(vma, vma->vm_flags);
> > + khugepaged_enter_vma(vma, map->flags);
> >
> > file_expanded:
> > - file = vma->vm_file;
> > ksm_add_vma(vma);
>
> I'm wondering if this could go to the "non file expanded" region above. If
> we expanded a vma, it was either in ksm and stays in ksm, or it was not
> eligible and that couldn't have changed by expanding?
We could change this, but the next commit removes the need for this :)
>
> > -expanded:
> > + *vmap = vma;
> > + return 0;
> > +
> > +free_iter_vma:
> > + vma_iter_free(vmi);
> > +free_vma:
> > + vm_area_free(vma);
> > + return error;
> > +}
> > +
Powered by blists - more mailing lists