Date:   Thu, 20 Jan 2022 18:41:29 +0100
From:   Vlastimil Babka <vbabka@...e.cz>
To:     Liam Howlett <liam.howlett@...cle.com>,
        "maple-tree@...ts.infradead.org" <maple-tree@...ts.infradead.org>,
        "linux-mm@...ck.org" <linux-mm@...ck.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Andrew Morton <akpm@...ux-foundation.org>
Cc:     Song Liu <songliubraving@...com>,
        Davidlohr Bueso <dave@...olabs.net>,
        "Paul E . McKenney" <paulmck@...nel.org>,
        Matthew Wilcox <willy@...radead.org>,
        Laurent Dufour <ldufour@...ux.ibm.com>,
        David Rientjes <rientjes@...gle.com>,
        Axel Rasmussen <axelrasmussen@...gle.com>,
        Suren Baghdasaryan <surenb@...gle.com>,
        Rik van Riel <riel@...riel.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Michel Lespinasse <walken.cr@...il.com>,
        Jerome Glisse <jglisse@...hat.com>,
        Minchan Kim <minchan@...gle.com>,
        Joel Fernandes <joelaf@...gle.com>,
        Rom Lemarchand <romlem@...gle.com>
Subject: Re: [PATCH v4 65/66] mm: Remove the vma linked list

On 12/1/21 15:30, Liam Howlett wrote:
> From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
> 
> Replace any vm_next use with vma_find().
> 
> Update free_pgtables(), unmap_vmas(), and zap_page_range() to use the
> maple tree.

> Use the new free_pgtables() and unmap_vmas() in do_mas_align_munmap().
> At the same time, alter the loop to be more compact.
> 
> Now that free_pgtables() and unmap_vmas() take a maple tree as an
> argument, rearrange do_mas_align_munmap() to use the new table to hold
> the lock

table or tree?

> Remove __vma_link_list() and __vma_unlink_list() as they are exclusively
> used to update the linked list
> 
> Rework validation of tree as it was depending on the linked list.
> 
> Signed-off-by: Liam R. Howlett <Liam.Howlett@...cle.com>

git grep shows that some usages of 'vm_next' and 'vm_prev' remain after
this patch, including in some exotic arch code.
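Something like "git grep -nw -e vm_next -e vm_prev -- arch/" turns them up.
For such a leftover walker, I'd expect the conversion to look roughly like
this (a sketch only; do_something() stands in for whatever the arch code
does per vma):

	struct vm_area_struct *vma;
	MA_STATE(mas, &mm->mm_mt, 0, 0);

	/* was: for (vma = mm->mmap; vma; vma = vma->vm_next) */
	mas_for_each(&mas, vma, ULONG_MAX)
		do_something(vma);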

> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -398,12 +398,21 @@ void free_pgd_range(struct mmu_gather *tlb,
>  	} while (pgd++, addr = next, addr != end);
>  }
>  
> -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
> -		unsigned long floor, unsigned long ceiling)
> +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
> +		   struct vm_area_struct *vma, unsigned long floor,
> +		   unsigned long ceiling)
>  {
> -	while (vma) {
> -		struct vm_area_struct *next = vma->vm_next;
> +	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
> +
> +	do {
>  		unsigned long addr = vma->vm_start;
> +		struct vm_area_struct *next;
> +
> +		/*
> +		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
> +		 * be 0.  This will underflow and is okay.
> +		 */
> +		next = mas_find(&mas, ceiling - 1);
>  
>  		/*
>  		 * Hide vma from rmap and truncate_pagecache before freeing
> @@ -422,7 +431,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
>  			       && !is_vm_hugetlb_page(next)) {
>  				vma = next;
> -				next = vma->vm_next;
> +				next = mas_find(&mas, ceiling - 1);
>  				unlink_anon_vmas(vma);
>  				unlink_file_vma(vma);
>  			}
> @@ -430,7 +439,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
>  				floor, next ? next->vm_start : ceiling);
>  		}
>  		vma = next;
> -	}
> +	} while (vma);
>  }
>  
>  void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
> @@ -1602,17 +1611,19 @@ static void unmap_single_vma(struct mmu_gather *tlb,
>   * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
>   * drops the lock and schedules.
>   */
> -void unmap_vmas(struct mmu_gather *tlb,
> +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
>  		struct vm_area_struct *vma, unsigned long start_addr,
>  		unsigned long end_addr)
>  {
>  	struct mmu_notifier_range range;
> +	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
>  
>  	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
>  				start_addr, end_addr);
>  	mmu_notifier_invalidate_range_start(&range);
> -	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
> +	do {
>  		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
> +	} while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
>  	mmu_notifier_invalidate_range_end(&range);
>  }
>  
> @@ -1627,8 +1638,11 @@ void unmap_vmas(struct mmu_gather *tlb,
>  void zap_page_range(struct vm_area_struct *vma, unsigned long start,
>  		unsigned long size)
>  {
> +	struct maple_tree *mt = &vma->vm_mm->mm_mt;

Well, looks like that's also an option to avoid a new parameter :)
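i.e. roughly this (a sketch only - presumably the explicit mt parameter
exists so that do_mas_align_munmap() can pass a tree other than mm_mt?):

	void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma,
			unsigned long start_addr, unsigned long end_addr)
	{
		struct mmu_notifier_range range;
		/* derive the tree from the vma, no new parameter needed */
		MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);

		mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma,
					vma->vm_mm, start_addr, end_addr);
		mmu_notifier_invalidate_range_start(&range);
		do {
			unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
		} while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
		mmu_notifier_invalidate_range_end(&range);
	}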

> +	unsigned long end = start + size;
>  	struct mmu_notifier_range range;
>  	struct mmu_gather tlb;
> +	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
>  
>  	lru_add_drain();
>  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> @@ -1636,8 +1650,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
>  	tlb_gather_mmu(&tlb, vma->vm_mm);
>  	update_hiwater_rss(vma->vm_mm);
>  	mmu_notifier_invalidate_range_start(&range);
> -	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
> +	do {
>  		unmap_single_vma(&tlb, vma, start, range.end, NULL);
> +	} while ((vma = mas_find(&mas, end - 1)) != NULL);
>  	mmu_notifier_invalidate_range_end(&range);
>  	tlb_finish_mmu(&tlb);
>  }
> diff --git a/mm/mmap.c b/mm/mmap.c
> index dde74e0b195d..e13c6ef76697 100644
> --- a/mm/mmap.c
> +++ b/mm/mmap.c
> @@ -74,9 +74,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
>  static bool ignore_rlimit_data;
>  core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
>  
> -static void unmap_region(struct mm_struct *mm,
> +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
>  		struct vm_area_struct *vma, struct vm_area_struct *prev,
> -		unsigned long start, unsigned long end);
> +		struct vm_area_struct *next, unsigned long start,
> +		unsigned long end);
>  
>  /* description of effects of mapping type and prot in current implementation.
>   * this is due to the limited x86 page protection hardware.  The expected
> @@ -173,10 +174,8 @@ void unlink_file_vma(struct vm_area_struct *vma)
>  /*
>   * Close a vm structure and free it, returning the next.

No longer returning the next.
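Maybe simply:

	/*
	 * Close a vm structure and free it.
	 */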

>   */
> -static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
> +static void remove_vma(struct vm_area_struct *vma)
>  {
> -	struct vm_area_struct *next = vma->vm_next;
> -
>  	might_sleep();
>  	if (vma->vm_ops && vma->vm_ops->close)
>  		vma->vm_ops->close(vma);

<snip>

>   */
>  struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
>  {
> +	MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
>  	struct anon_vma *anon_vma = NULL;
> +	struct vm_area_struct *prev, *next;
>  
>  	/* Try next first. */
> -	if (vma->vm_next) {
> -		anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
> +	next = mas_walk(&mas);
> +	if (next) {
> +		anon_vma = reusable_anon_vma(next, vma, next);
>  		if (anon_vma)
>  			return anon_vma;
>  	}
>  
> +	prev = mas_prev(&mas, 0);
> +	VM_BUG_ON_VMA(prev != vma, vma);
> +	prev = mas_prev(&mas, 0);
>  	/* Try prev next. */
> -	if (vma->vm_prev)
> -		anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
> +	if (prev)
> +		anon_vma = reusable_anon_vma(prev, prev, vma);
>  
>  	/*
>  	 * We might reach here with anon_vma == NULL if we can't find
> @@ -1906,10 +1825,10 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
>  					     unsigned long start_addr,
>  					     unsigned long end_addr)
>  {
> -	MA_STATE(mas, &mm->mm_mt, start_addr, start_addr);
> +	unsigned long index = start_addr;
>  
>  	mmap_assert_locked(mm);
> -	return mas_find(&mas, end_addr - 1);
> +	return mt_find(&mm->mm_mt, &index, end_addr - 1);

Why is this now changed again?
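(For context, both forms should return the same vma here; if I read
lib/maple_tree.c right, the difference is that mas_find() resumes from a
caller-held ma_state, while mt_find() takes the RCU read lock itself and
advances *index past the entry it returns:

	void *mas_find(struct ma_state *mas, unsigned long max);
	void *mt_find(struct maple_tree *mt, unsigned long *index,
		      unsigned long max);

So presumably not a bug, just surprising to switch back within the series.)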

>  }
>  EXPORT_SYMBOL(find_vma_intersection);
>  
> @@ -1923,8 +1842,10 @@ EXPORT_SYMBOL(find_vma_intersection);
>   */
>  inline struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
>  {
> -	// Note find_vma_intersection will decrease 0 to underflow to ULONG_MAX
> -	return find_vma_intersection(mm, addr, 0);
> +	unsigned long index = addr;
> +
> +	mmap_assert_locked(mm);
> +	return mt_find(&mm->mm_mt, &index, ULONG_MAX);

And here.

>  }
>  EXPORT_SYMBOL(find_vma);
>  
> @@ -2026,7 +1947,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>  	if (gap_addr < address || gap_addr > TASK_SIZE)
>  		gap_addr = TASK_SIZE;
>  
> -	next = vma->vm_next;
> +	next = vma_find(mm, vma->vm_end);
>  	if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
>  		if (!(next->vm_flags & VM_GROWSUP))
>  			return -ENOMEM;
> @@ -2072,8 +1993,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
>  				vma->vm_end = address;
>  				vma_store(mm, vma);
>  				anon_vma_interval_tree_post_update_vma(vma);
> -				if (!vma->vm_next)
> -					mm->highest_vm_end = vm_end_gap(vma);
>  				spin_unlock(&mm->page_table_lock);
>  
>  				perf_event_mmap(vma);
> @@ -2100,7 +2019,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
>  		return -EPERM;
>  
>  	/* Enforce stack_guard_gap */
> -	prev = vma->vm_prev;
> +	find_vma_prev(mm, vma->vm_start, &prev);
>  	/* Check that both stack segments have the same anon_vma? */
>  	if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
>  			vma_is_accessible(prev)) {
> @@ -2235,20 +2154,22 @@ EXPORT_SYMBOL_GPL(find_extend_vma);
>   *
>   * Called with the mm semaphore held.

Above this, the comment still talks about the vma list; update it?

>   */
> -static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
> +static inline void remove_mt(struct mm_struct *mm, struct maple_tree *detached)
>  {
>  	unsigned long nr_accounted = 0;
> +	unsigned long index = 0;
> +	struct vm_area_struct *vma;
>  
>  	/* Update high watermark before we lower total_vm */
>  	update_hiwater_vm(mm);
> -	do {
> +	mt_for_each(detached, vma, index, ULONG_MAX) {
>  		long nrpages = vma_pages(vma);
>  
>  		if (vma->vm_flags & VM_ACCOUNT)
>  			nr_accounted += nrpages;
>  		vm_stat_account(mm, vma->vm_flags, -nrpages);
> -		vma = remove_vma(vma);
> -	} while (vma);
> +		remove_vma(vma);
> +	}
>  	vm_unacct_memory(nr_accounted);
>  	validate_mm(mm);
>  }
