Message-ID: <22b6346d-b8a0-4d89-b886-1ac2c9ce6ba4@lucifer.local>
Date: Wed, 21 Aug 2024 12:02:06 +0100
From: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>
To: "Liam R. Howlett" <Liam.Howlett@...cle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>, linux-mm@...ck.org,
        linux-kernel@...r.kernel.org, Suren Baghdasaryan <surenb@...gle.com>,
        Matthew Wilcox <willy@...radead.org>, Vlastimil Babka <vbabka@...e.cz>,
        sidhartha.kumar@...cle.com, Bert Karwatzki <spasswolf@....de>,
        Jiri Olsa <olsajiri@...il.com>, Kees Cook <kees@...nel.org>,
        "Paul E . McKenney" <paulmck@...nel.org>
Subject: Re: [PATCH v6 14/20] mm/mmap: Avoid zeroing vma tree in mmap_region()

On Tue, Aug 20, 2024 at 07:57:23PM GMT, Liam R. Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
>
> Instead of zeroing the vma tree and then overwriting the area, let the
> area be overwritten and then clean up the gathered vmas using
> vms_complete_munmap_vmas().
>
> To ensure locking is downgraded correctly, the mm is set regardless of
> whether MAP_FIXED is in use (NULL vma).
>
> If a driver is mapping over an existing vma, then clear the ptes before
> the call_mmap() invocation.  This is done using the vms_clean_up_area()
> helper.  If there is a close vm_ops, that must also be called to ensure
> any cleanup is done before mapping over the area.  This also means that
> calling open has been added to the abort path of an unmap operation, for
> now.

Might be worth explicitly expanding this to say that this isn't a permanent
solution, given the asymmetric vm_ops->open() / ->close() semantics.
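
To make the hazard concrete, a sketch of a hypothetical driver (the foo_*
names here are invented) whose hooks aren't inverses; for such a driver,
->close() followed by ->open() on the abort path leaves the reattached VMA
without its private state:

	/* Hypothetical asymmetric vm_ops: ->open() does not undo ->close() */
	static void foo_vm_open(struct vm_area_struct *vma)
	{
		/* No-op: assumes it only runs on fork/split, where the
		 * per-VMA state is shared with the original VMA. */
	}

	static void foo_vm_close(struct vm_area_struct *vma)
	{
		/* Frees per-VMA state, which ->open() does not recreate. */
		kfree(vma->vm_private_data);
		vma->vm_private_data = NULL;
	}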

>
> Temporarily keep track of the number of pages that will be removed and
> reduce the charged amount.
>
> This also drops the validate_mm() call in the vma_expand() function.
> It is necessary to drop the validation because it would otherwise fail:
> the mm map_count is incorrect during a vma expansion, prior to the
> cleanup from vms_complete_munmap_vmas().
>
> Clean up the error handling of vms_gather_munmap_vmas() by calling the
> verification within the function.
>
> Signed-off-by: Liam R. Howlett <Liam.Howlett@...cle.com>

Broadly looks good, some nits and questions below, but generally:

Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@...cle.com>

> ---
>  mm/mmap.c | 62 ++++++++++++++++++++++++++-----------------------------
>  mm/vma.c  | 54 +++++++++++++++++++++++++++++++++++++-----------
>  mm/vma.h  | 22 ++++++++++++++------
>  3 files changed, 87 insertions(+), 51 deletions(-)
>
> diff --git a/mm/mmap.c b/mm/mmap.c
> index 71b2bad717b6..6550d9470d3a 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -1373,23 +1373,19 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	unsigned long merge_start = addr, merge_end = end;
>  	bool writable_file_mapping = false;
>  	pgoff_t vm_pgoff;
> -	int error;
> +	int error = -ENOMEM;
>  	VMA_ITERATOR(vmi, mm, addr);
> +	unsigned long nr_pages, nr_accounted;
>
> -	/* Check against address space limit. */
> -	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
> -		unsigned long nr_pages;
> +	nr_pages = count_vma_pages_range(mm, addr, end, &nr_accounted);
>
> -		/*
> -		 * MAP_FIXED may remove pages of mappings that intersects with
> -		 * requested mapping. Account for the pages it would unmap.
> -		 */
> -		nr_pages = count_vma_pages_range(mm, addr, end);
> -
> -		if (!may_expand_vm(mm, vm_flags,
> -					(len >> PAGE_SHIFT) - nr_pages))
> -			return -ENOMEM;
> -	}
> +	/*
> +	 * Check against address space limit.
> +	 * MAP_FIXED may remove pages of mappings that intersects with requested
> +	 * mapping. Account for the pages it would unmap.
> +	 */
> +	if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages))
> +		return -ENOMEM;
>
>  	/* Find the first overlapping VMA */
>  	vma = vma_find(&vmi, end);
> @@ -1400,14 +1396,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  		mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
>  		/* Prepare to unmap any existing mapping in the area */
>  		if (vms_gather_munmap_vmas(&vms, &mas_detach))
> -			goto gather_failed;
> -
> -		/* Remove any existing mappings from the vma tree */
> -		if (vma_iter_clear_gfp(&vmi, addr, end, GFP_KERNEL))
> -			goto clear_tree_failed;
> +			return -ENOMEM;
>
> -		/* Unmap any existing mapping in the area */
> -		vms_complete_munmap_vmas(&vms, &mas_detach);
>  		next = vms.next;
>  		prev = vms.prev;
>  		vma = NULL;
> @@ -1423,8 +1413,10 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	 */
>  	if (accountable_mapping(file, vm_flags)) {
>  		charged = len >> PAGE_SHIFT;
> +		charged -= nr_accounted;
>  		if (security_vm_enough_memory_mm(mm, charged))
> -			return -ENOMEM;
> +			goto abort_munmap;
> +		vms.nr_accounted = 0;
>  		vm_flags |= VM_ACCOUNT;
>  	}
>
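
If I'm reading the accounting correctly, the point is to charge only the net
new pages, since the overlapped VM_ACCOUNT pages are already charged, e.g.
(my numbers, just to illustrate):

	/*
	 * MAP_FIXED of 16 pages over 10 already-accounted pages:
	 *   charged          = 16 - 10 = 6  (charge only the net new pages)
	 *   vms.nr_accounted = 0            (the 10 overlapped pages keep
	 *                                    their charge, so completing the
	 *                                    munmap must not unaccount them)
	 */
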
> @@ -1473,10 +1465,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	 * not unmapped, but the maps are removed from the list.
>  	 */
>  	vma = vm_area_alloc(mm);
> -	if (!vma) {
> -		error = -ENOMEM;
> +	if (!vma)
>  		goto unacct_error;
> -	}
>
>  	vma_iter_config(&vmi, addr, end);
>  	vma_set_range(vma, addr, end, pgoff);
> @@ -1485,6 +1475,11 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>
>  	if (file) {
>  		vma->vm_file = get_file(file);
> +		/*
> +		 * call_mmap() may map PTE, so ensure there are no existing PTEs
> +		 * call the vm_ops close function if one exists.

Super-nit, but maybe add an 'and' here.
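
i.e.:

		/*
		 * call_mmap() may map PTE, so ensure there are no existing
		 * PTEs and call the vm_ops close function if one exists.
		 */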

> +		 */
> +		vms_clean_up_area(&vms, &mas_detach, true);

I hate that we have to do this. These kinds of hooks are the devil's work...

>  		error = call_mmap(file, vma);
>  		if (error)
>  			goto unmap_and_free_vma;
> @@ -1575,6 +1570,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  expanded:
>  	perf_event_mmap(vma);
>
> +	/* Unmap any existing mapping in the area */
> +	vms_complete_munmap_vmas(&vms, &mas_detach);
> +
>  	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
>  	if (vm_flags & VM_LOCKED) {
>  		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
> @@ -1603,7 +1601,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  	return addr;
>
>  close_and_free_vma:
> -	if (file && vma->vm_ops && vma->vm_ops->close)
> +	if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close)
>  		vma->vm_ops->close(vma);
>
>  	if (file || vma->vm_file) {
> @@ -1622,14 +1620,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
>  unacct_error:
>  	if (charged)
>  		vm_unacct_memory(charged);
> -	validate_mm(mm);
> -	return error;
>
> -clear_tree_failed:
> -	abort_munmap_vmas(&mas_detach);
> -gather_failed:
> +abort_munmap:
> +	if (vms.nr_pages)
> +		abort_munmap_vmas(&mas_detach, vms.closed_vm_ops);
>  	validate_mm(mm);
> -	return -ENOMEM;
> +	return error;
>  }
>
>  static int __vm_munmap(unsigned long start, size_t len, bool unlock)
> @@ -1959,7 +1955,7 @@ void exit_mmap(struct mm_struct *mm)
>  	do {
>  		if (vma->vm_flags & VM_ACCOUNT)
>  			nr_accounted += vma_pages(vma);
> -		remove_vma(vma, true);
> +		remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
>  		count++;
>  		cond_resched();
>  		vma = vma_next(&vmi);
> diff --git a/mm/vma.c b/mm/vma.c
> index 7104c2c080bb..5b33f7460ab7 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -136,10 +136,10 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
>  /*
>   * Close a vm structure and free it.
>   */
> -void remove_vma(struct vm_area_struct *vma, bool unreachable)
> +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
>  {
>  	might_sleep();
> -	if (vma->vm_ops && vma->vm_ops->close)
> +	if (!closed && vma->vm_ops && vma->vm_ops->close)
>  		vma->vm_ops->close(vma);
>  	if (vma->vm_file)
>  		fput(vma->vm_file);
> @@ -521,7 +521,6 @@ int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	vma_iter_store(vmi, vma);
>
>  	vma_complete(&vp, vmi, vma->vm_mm);
> -	validate_mm(vma->vm_mm);
>  	return 0;
>
>  nomem:
> @@ -645,11 +644,14 @@ void vma_complete(struct vma_prepare *vp,
>  		uprobe_mmap(vp->insert);
>  }
>
> -static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
> -		struct ma_state *mas_detach, bool mm_wr_locked)
> +static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
> +		    struct ma_state *mas_detach, bool mm_wr_locked)
>  {
>  	struct mmu_gather tlb;
>
> +	if (!vms->clear_ptes) /* Nothing to do */
> +		return;
> +
>  	/*
>  	 * We can free page tables without write-locking mmap_lock because VMAs
>  	 * were isolated before we downgraded mmap_lock.
> @@ -658,11 +660,31 @@ static void vms_complete_pte_clear(struct vma_munmap_struct *vms,
>  	lru_add_drain();
>  	tlb_gather_mmu(&tlb, vms->mm);
>  	update_hiwater_rss(vms->mm);
> -	unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, vms->vma_count, mm_wr_locked);
> +	unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
> +		   vms->vma_count, mm_wr_locked);
> +
>  	mas_set(mas_detach, 1);
>  	/* start and end may be different if there is no prev or next vma. */
> -	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start, vms->unmap_end, mm_wr_locked);
> +	free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
> +		      vms->unmap_end, mm_wr_locked);
>  	tlb_finish_mmu(&tlb);
> +	vms->clear_ptes = false;
> +}
> +
> +void vms_clean_up_area(struct vma_munmap_struct *vms,
> +		struct ma_state *mas_detach, bool mm_wr_locked)

The only invocation of this function has mm_wr_locked set; is this
parameter necessary?
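
If so, it could presumably be dropped and hard-coded at the vms_clear_ptes()
call, something like (a sketch only, against this patch's helpers):

	void vms_clean_up_area(struct vma_munmap_struct *vms,
			struct ma_state *mas_detach)
	{
		...
		vms_clear_ptes(vms, mas_detach, /* mm_wr_locked = */ true);
		...
	}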

> +{
> +	struct vm_area_struct *vma;
> +
> +	if (!vms->nr_pages)
> +		return;
> +
> +	vms_clear_ptes(vms, mas_detach, mm_wr_locked);
> +	mas_set(mas_detach, 0);
> +	mas_for_each(mas_detach, vma, ULONG_MAX)
> +		if (vma->vm_ops && vma->vm_ops->close)
> +			vma->vm_ops->close(vma);
> +	vms->closed_vm_ops = true;
>  }
>
>  /*
> @@ -686,7 +708,10 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
>  	if (vms->unlock)
>  		mmap_write_downgrade(mm);
>
> -	vms_complete_pte_clear(vms, mas_detach, !vms->unlock);
> +	if (!vms->nr_pages)
> +		return;
> +
> +	vms_clear_ptes(vms, mas_detach, !vms->unlock);
>  	/* Update high watermark before we lower total_vm */
>  	update_hiwater_vm(mm);
>  	/* Stat accounting */
> @@ -697,7 +722,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
>  	/* Remove and clean up vmas */
>  	mas_set(mas_detach, 0);
>  	mas_for_each(mas_detach, vma, ULONG_MAX)
> -		remove_vma(vma, false);
> +		remove_vma(vma, /* = */ false, vms->closed_vm_ops);
>
>  	vm_unacct_memory(vms->nr_accounted);
>  	validate_mm(mm);
> @@ -849,13 +874,14 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
>  	while (vma_iter_addr(vms->vmi) > vms->start)
>  		vma_iter_prev_range(vms->vmi);
>
> +	vms->clear_ptes = true;
>  	return 0;
>
>  userfaultfd_error:
>  munmap_gather_failed:
>  end_split_failed:
>  modify_vma_failed:
> -	abort_munmap_vmas(mas_detach);
> +	abort_munmap_vmas(mas_detach, /* closed = */ false);
>  start_split_failed:
>  map_count_exceeded:
>  	return error;
> @@ -900,7 +926,7 @@ int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
>  	return 0;
>
>  clear_tree_failed:
> -	abort_munmap_vmas(&mas_detach);
> +	abort_munmap_vmas(&mas_detach, /* closed = */ false);
>  gather_failed:
>  	validate_mm(mm);
>  	return error;
> @@ -1618,17 +1644,21 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
>  }
>
>  unsigned long count_vma_pages_range(struct mm_struct *mm,
> -				    unsigned long addr, unsigned long end)
> +		unsigned long addr, unsigned long end,
> +		unsigned long *nr_accounted)
>  {
>  	VMA_ITERATOR(vmi, mm, addr);
>  	struct vm_area_struct *vma;
>  	unsigned long nr_pages = 0;
>
> +	*nr_accounted = 0;
>  	for_each_vma_range(vmi, vma, end) {
>  		unsigned long vm_start = max(addr, vma->vm_start);
>  		unsigned long vm_end = min(end, vma->vm_end);
>
>  		nr_pages += PHYS_PFN(vm_end - vm_start);
> +		if (vma->vm_flags & VM_ACCOUNT)
> +			*nr_accounted += PHYS_PFN(vm_end - vm_start);

Nitty, but maybe:

		...
		unsigned long pages = PHYS_PFN(vm_end - vm_start);

		nr_pages += pages;
		if (vma->vm_flags & VM_ACCOUNT)
			*nr_accounted += pages;

>  	}
>
>  	return nr_pages;
> diff --git a/mm/vma.h b/mm/vma.h
> index 6028fdf79257..756dd42a6ec4 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -48,6 +48,8 @@ struct vma_munmap_struct {
>  	unsigned long stack_vm;
>  	unsigned long data_vm;
>  	bool unlock;                    /* Unlock after the munmap */
> +	bool clear_ptes;                /* If there are outstanding PTE to be cleared */
> +	bool closed_vm_ops;		/* call_mmap() was encountered, so vmas may be closed */
>  };
>
>  #ifdef CONFIG_DEBUG_VM_MAPLE_TREE
> @@ -95,14 +97,13 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
>  		unsigned long start, unsigned long end, struct list_head *uf,
>  		bool unlock)
>  {
> +	vms->mm = current->mm;

I'm guessing there are no circumstances under which we'd be looking at a
remote mm_struct?

This does rather beg the question of why we bother storing the field at all
when we could just grab it from current->mm. Perhaps because the cache line
for the start of vms will already be populated, while the one containing
current's ->mm may not be?

>  	vms->vmi = vmi;
>  	vms->vma = vma;
>  	if (vma) {
> -		vms->mm = vma->vm_mm;
>  		vms->start = start;
>  		vms->end = end;
>  	} else {
> -		vms->mm = NULL;

I guess there's also no drawback to an otherwise empty vms having a
populated mm?

>  		vms->start = vms->end = 0;
>  	}
>  	vms->unlock = unlock;
> @@ -112,6 +113,8 @@ static inline void init_vma_munmap(struct vma_munmap_struct *vms,
>  	vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
>  	vms->unmap_start = FIRST_USER_ADDRESS;
>  	vms->unmap_end = USER_PGTABLES_CEILING;
> +	vms->clear_ptes = false;
> +	vms->closed_vm_ops = false;
>  }
>
>  int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
> @@ -120,18 +123,24 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
>  void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
>  		struct ma_state *mas_detach);
>
> +void vms_clean_up_area(struct vma_munmap_struct *vms,
> +		struct ma_state *mas_detach, bool mm_wr_locked);
> +
>  /*
>   * abort_munmap_vmas - Undo any munmap work and free resources
>   *
>   * Reattach any detached vmas and free up the maple tree used to track the vmas.
>   */
> -static inline void abort_munmap_vmas(struct ma_state *mas_detach)
> +static inline void abort_munmap_vmas(struct ma_state *mas_detach, bool closed)
>  {
>  	struct vm_area_struct *vma;
>
>  	mas_set(mas_detach, 0);
> -	mas_for_each(mas_detach, vma, ULONG_MAX)
> +	mas_for_each(mas_detach, vma, ULONG_MAX) {
>  		vma_mark_detached(vma, false);
> +		if (closed && vma->vm_ops && vma->vm_ops->open)
> +			vma->vm_ops->open(vma);
> +	}

Hang on, I thought we eliminated this approach? OK, I see you change this in
the next commit.

Not necessarily a huge fan of having a commit in the tree that's broken for
(hideous, asymmetric) drivers + such, but I guess it's okay given we address
it immediately and it helps document the thinking process + split up the
code.

>
>  	__mt_destroy(mas_detach->tree);
>  }
> @@ -145,7 +154,7 @@ int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
>  		  unsigned long start, size_t len, struct list_head *uf,
>  		  bool unlock);
>
> -void remove_vma(struct vm_area_struct *vma, bool unreachable);
> +void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);
>
>  void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
>  		struct vm_area_struct *prev, struct vm_area_struct *next);
> @@ -259,7 +268,8 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
>  int mm_take_all_locks(struct mm_struct *mm);
>  void mm_drop_all_locks(struct mm_struct *mm);
>  unsigned long count_vma_pages_range(struct mm_struct *mm,
> -				    unsigned long addr, unsigned long end);
> +				    unsigned long addr, unsigned long end,
> +				    unsigned long *nr_accounted);
>
>  static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
>  {
> --
> 2.43.0
>
