lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Date:   Wed, 26 Jan 2022 20:29:10 +0000
From:   Liam Howlett <liam.howlett@...cle.com>
To:     Vlastimil Babka <vbabka@...e.cz>
CC:     "maple-tree@...ts.infradead.org" <maple-tree@...ts.infradead.org>,
        "linux-mm@...ck.org" <linux-mm@...ck.org>,
        "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
        Andrew Morton <akpm@...ux-foundation.org>,
        Song Liu <songliubraving@...com>,
        Davidlohr Bueso <dave@...olabs.net>,
        "Paul E . McKenney" <paulmck@...nel.org>,
        Matthew Wilcox <willy@...radead.org>,
        Laurent Dufour <ldufour@...ux.ibm.com>,
        David Rientjes <rientjes@...gle.com>,
        Axel Rasmussen <axelrasmussen@...gle.com>,
        Suren Baghdasaryan <surenb@...gle.com>,
        Rik van Riel <riel@...riel.com>,
        Peter Zijlstra <peterz@...radead.org>,
        Michel Lespinasse <walken.cr@...il.com>,
        Jerome Glisse <jglisse@...hat.com>,
        Minchan Kim <minchan@...gle.com>,
        Joel Fernandes <joelaf@...gle.com>,
        Rom Lemarchand <romlem@...gle.com>
Subject: Re: [PATCH v4 65/66] mm: Remove the vma linked list

* Vlastimil Babka <vbabka@...e.cz> [220120 12:41]:
> On 12/1/21 15:30, Liam Howlett wrote:
> > From: "Liam R. Howlett" <Liam.Howlett@...cle.com>
> > 
> > Replace any vm_next use with vma_find().
> > 
> > Update free_pgtables(), unmap_vmas(), and zap_page_range() to use the
> > maple tree.
> 
> > Use the new free_pgtables() and unmap_vmas() in do_mas_align_munmap().
> > At the same time, alter the loop to be more compact.
> > 
> > Now that free_pgtables() and unmap_vmas() take a maple tree as an
> > argument, rearrange do_mas_align_munmap() to use the new table to hold
> > the lock
> 
> table or tree?

tree, thanks.

> 
> > Remove __vma_link_list() and __vma_unlink_list() as they are exclusively
> > used to update the linked list
> > 
> > Rework validation of tree as it was depending on the linked list.
> > 
> > Signed-off-by: Liam R. Howlett <Liam.Howlett@...cle.com>
> 
> git grep shows that some usages of 'vm_next' and 'vm_prev' remain after this
> patch, including some exotic arch code.

I must have missed them being added during the development cycle of
maple tree.. except parisc; parisc has a block of code left in an #if 0
so it's not lost - Good thing it's in CVS now so it's safe :)

Thanks, riscv will require a new patch.

damon test code will require a new patch - I will add this to the damon
conversion patch.

> 
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -398,12 +398,21 @@ void free_pgd_range(struct mmu_gather *tlb,
> >  	} while (pgd++, addr = next, addr != end);
> >  }
> >  
> > -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > -		unsigned long floor, unsigned long ceiling)
> > +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt,
> > +		   struct vm_area_struct *vma, unsigned long floor,
> > +		   unsigned long ceiling)
> >  {
> > -	while (vma) {
> > -		struct vm_area_struct *next = vma->vm_next;
> > +	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
> > +
> > +	do {
> >  		unsigned long addr = vma->vm_start;
> > +		struct vm_area_struct *next;
> > +
> > +		/*
> > +		 * Note: USER_PGTABLES_CEILING may be passed as ceiling and may
> > +		 * be 0.  This will underflow and is okay.
> > +		 */
> > +		next = mas_find(&mas, ceiling - 1);
> >  
> >  		/*
> >  		 * Hide vma from rmap and truncate_pagecache before freeing
> > @@ -422,7 +431,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
> >  			       && !is_vm_hugetlb_page(next)) {
> >  				vma = next;
> > -				next = vma->vm_next;
> > +				next = mas_find(&mas, ceiling - 1);
> >  				unlink_anon_vmas(vma);
> >  				unlink_file_vma(vma);
> >  			}
> > @@ -430,7 +439,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
> >  				floor, next ? next->vm_start : ceiling);
> >  		}
> >  		vma = next;
> > -	}
> > +	} while (vma);
> >  }
> >  
> >  void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
> > @@ -1602,17 +1611,19 @@ static void unmap_single_vma(struct mmu_gather *tlb,
> >   * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
> >   * drops the lock and schedules.
> >   */
> > -void unmap_vmas(struct mmu_gather *tlb,
> > +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
> >  		struct vm_area_struct *vma, unsigned long start_addr,
> >  		unsigned long end_addr)
> >  {
> >  	struct mmu_notifier_range range;
> > +	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
> >  
> >  	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
> >  				start_addr, end_addr);
> >  	mmu_notifier_invalidate_range_start(&range);
> > -	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
> > +	do {
> >  		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
> > +	} while ((vma = mas_find(&mas, end_addr - 1)) != NULL);
> >  	mmu_notifier_invalidate_range_end(&range);
> >  }
> >  
> > @@ -1627,8 +1638,11 @@ void unmap_vmas(struct mmu_gather *tlb,
> >  void zap_page_range(struct vm_area_struct *vma, unsigned long start,
> >  		unsigned long size)
> >  {
> > +	struct maple_tree *mt = &vma->vm_mm->mm_mt;
> 
> Well looks like that's also an option to avoid a new parameter :)
> 
> > +	unsigned long end = start + size;
> >  	struct mmu_notifier_range range;
> >  	struct mmu_gather tlb;
> > +	MA_STATE(mas, mt, vma->vm_end, vma->vm_end);
> >  
> >  	lru_add_drain();
> >  	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
> > @@ -1636,8 +1650,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
> >  	tlb_gather_mmu(&tlb, vma->vm_mm);
> >  	update_hiwater_rss(vma->vm_mm);
> >  	mmu_notifier_invalidate_range_start(&range);
> > -	for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
> > +	do {
> >  		unmap_single_vma(&tlb, vma, start, range.end, NULL);
> > +	} while ((vma = mas_find(&mas, end - 1)) != NULL);
> >  	mmu_notifier_invalidate_range_end(&range);
> >  	tlb_finish_mmu(&tlb);
> >  }
> > diff --git a/mm/mmap.c b/mm/mmap.c
> > index dde74e0b195d..e13c6ef76697 100644
> > --- a/mm/mmap.c
> > +++ b/mm/mmap.c
> > @@ -74,9 +74,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
> >  static bool ignore_rlimit_data;
> >  core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
> >  
> > -static void unmap_region(struct mm_struct *mm,
> > +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt,
> >  		struct vm_area_struct *vma, struct vm_area_struct *prev,
> > -		unsigned long start, unsigned long end);
> > +		struct vm_area_struct *next, unsigned long start,
> > +		unsigned long end);
> >  
> >  /* description of effects of mapping type and prot in current implementation.
> >   * this is due to the limited x86 page protection hardware.  The expected
> > @@ -173,10 +174,8 @@ void unlink_file_vma(struct vm_area_struct *vma)
> >  /*
> >   * Close a vm structure and free it, returning the next.
> 
> No longer returning the next.

ack

> 
> >   */
> > -static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
> > +static void remove_vma(struct vm_area_struct *vma)
> >  {
> > -	struct vm_area_struct *next = vma->vm_next;
> > -
> >  	might_sleep();
> >  	if (vma->vm_ops && vma->vm_ops->close)
> >  		vma->vm_ops->close(vma);
> 
> <snip>
> 
> >   */
> >  struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
> >  {
> > +	MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end);
> >  	struct anon_vma *anon_vma = NULL;
> > +	struct vm_area_struct *prev, *next;
> >  
> >  	/* Try next first. */
> > -	if (vma->vm_next) {
> > -		anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next);
> > +	next = mas_walk(&mas);
> > +	if (next) {
> > +		anon_vma = reusable_anon_vma(next, vma, next);
> >  		if (anon_vma)
> >  			return anon_vma;
> >  	}
> >  
> > +	prev = mas_prev(&mas, 0);
> > +	VM_BUG_ON_VMA(prev != vma, vma);
> > +	prev = mas_prev(&mas, 0);
> >  	/* Try prev next. */
> > -	if (vma->vm_prev)
> > -		anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma);
> > +	if (prev)
> > +		anon_vma = reusable_anon_vma(prev, prev, vma);
> >  
> >  	/*
> >  	 * We might reach here with anon_vma == NULL if we can't find
> > @@ -1906,10 +1825,10 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
> >  					     unsigned long start_addr,
> >  					     unsigned long end_addr)
> >  {
> > -	MA_STATE(mas, &mm->mm_mt, start_addr, start_addr);
> > +	unsigned long index = start_addr;
> >  
> >  	mmap_assert_locked(mm);
> > -	return mas_find(&mas, end_addr - 1);
> > +	return mt_find(&mm->mm_mt, &index, end_addr - 1);
> 
> Why is this now changed again?

I found this with one of your previous comments, I have a fix.

> 
> >  }
> >  EXPORT_SYMBOL(find_vma_intersection);
> >  
> > @@ -1923,8 +1842,10 @@ EXPORT_SYMBOL(find_vma_intersection);
> >   */
> >  inline struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
> >  {
> > -	// Note find_vma_intersection will decrease 0 to underflow to ULONG_MAX
> > -	return find_vma_intersection(mm, addr, 0);
> > +	unsigned long index = addr;
> > +
> > +	mmap_assert_locked(mm);
> > +	return mt_find(&mm->mm_mt, &index, ULONG_MAX);
> 
> And here.

Ditto.

> 
> >  }
> >  EXPORT_SYMBOL(find_vma);
> >  
> > @@ -2026,7 +1947,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> >  	if (gap_addr < address || gap_addr > TASK_SIZE)
> >  		gap_addr = TASK_SIZE;
> >  
> > -	next = vma->vm_next;
> > +	next = vma_find(mm, vma->vm_end);
> >  	if (next && next->vm_start < gap_addr && vma_is_accessible(next)) {
> >  		if (!(next->vm_flags & VM_GROWSUP))
> >  			return -ENOMEM;
> > @@ -2072,8 +1993,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
> >  				vma->vm_end = address;
> >  				vma_store(mm, vma);
> >  				anon_vma_interval_tree_post_update_vma(vma);
> > -				if (!vma->vm_next)
> > -					mm->highest_vm_end = vm_end_gap(vma);
> >  				spin_unlock(&mm->page_table_lock);
> >  
> >  				perf_event_mmap(vma);
> > @@ -2100,7 +2019,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address)
> >  		return -EPERM;
> >  
> >  	/* Enforce stack_guard_gap */
> > -	prev = vma->vm_prev;
> > +	find_vma_prev(mm, vma->vm_start, &prev);
> >  	/* Check that both stack segments have the same anon_vma? */
> >  	if (prev && !(prev->vm_flags & VM_GROWSDOWN) &&
> >  			vma_is_accessible(prev)) {
> > @@ -2235,20 +2154,22 @@ EXPORT_SYMBOL_GPL(find_extend_vma);
> >   *
> >   * Called with the mm semaphore held.
> 
> Above this, the comment talks about vma list, update?

I will update the comment.

> 
> >   */
> > -static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
> > +static inline void remove_mt(struct mm_struct *mm, struct maple_tree *detached)
> >  {
> >  	unsigned long nr_accounted = 0;
> > +	unsigned long index = 0;
> > +	struct vm_area_struct *vma;
> >  
> >  	/* Update high watermark before we lower total_vm */
> >  	update_hiwater_vm(mm);
> > -	do {
> > +	mt_for_each(detached, vma, index, ULONG_MAX) {
> >  		long nrpages = vma_pages(vma);
> >  
> >  		if (vma->vm_flags & VM_ACCOUNT)
> >  			nr_accounted += nrpages;
> >  		vm_stat_account(mm, vma->vm_flags, -nrpages);
> > -		vma = remove_vma(vma);
> > -	} while (vma);
> > +		remove_vma(vma);
> > +	}
> >  	vm_unacct_memory(nr_accounted);
> >  	validate_mm(mm);
> >  }

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ