linux-kernel - Re: [PATCH] ksm: use range-walk function to jump over holes in scan_get_next_rmap

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <qrf6mertexxf3hcazexkv6jxokdg5jnsbuixih62w2peq3ooyl@u3mhuopdjrfc>
Date: Tue, 14 Oct 2025 10:36:49 -0300
From: Pedro Demarchi Gomes <pedrodemargomes@...il.com>
To: David Hildenbrand <david@...hat.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>, craftfever@...ena.io, 
	Xu Xin <xu.xin16@....com.cn>, Chengming Zhou <chengming.zhou@...ux.dev>, linux-mm@...ck.org, 
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH] ksm: use range-walk function to jump over holes in
 scan_get_next_rmap_item

On Tue, Oct 14, 2025 at 11:26:06AM +0200, David Hildenbrand wrote:
> On 14.10.25 07:58, Pedro Demarchi Gomes wrote:
> > Currently, scan_get_next_rmap_item() walks every page address in a VMA
> > to locate mergeable pages. This becomes highly inefficient when scanning
> > large virtual memory areas that contain mostly unmapped regions.
> > 
> > This patch replaces the per-address lookup with a range walk using
> > walk_page_range(). The range walker allows KSM to skip over entire
> > unmapped holes in a VMA, avoiding unnecessary lookups.
> > 
> > To evaluate this change, I created a test that maps a 1 TB virtual area
> > where only the first and last 10 MB are populated with identical data.
> > With this patch applied, KSM scanned and merged the region approximately
> > seven times faster.
> > 
> > This problem was previously discussed in [1].
> > 
> > [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> > 
> > Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@...il.com>
> > ---
> >   mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
> >   1 file changed, 79 insertions(+), 57 deletions(-)
> > 
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 3aed0478fdce..584fd987e8ae 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
> >   	return true;
> >   }
> > +struct ksm_walk_private {
> > +	struct page *page;
> > +	struct ksm_rmap_item *rmap_item;
> > +	struct ksm_mm_slot *mm_slot;
> > +};
> > +
> > +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> > +{
> > +	struct vm_area_struct *vma = walk->vma;
> > +
> > +	if (!vma || !(vma->vm_flags & VM_MERGEABLE))
> 
> The anon_vma check should go in here as well.
> 
> How can we possibly get !vma?
> 
> > +		return 1;
> > +	return 0;
> > +}
> > +
> > +static int ksm_pte_entry(pte_t *pte, unsigned long addr,
> > +			    unsigned long end, struct mm_walk *walk)
> > +{
> > +	struct mm_struct *mm = walk->mm;
> > +	struct vm_area_struct *vma = walk->vma;
> > +	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> > +	struct ksm_mm_slot *mm_slot = private->mm_slot;
> > +	pte_t ptent = ptep_get(pte);
> > +	struct page *page = pfn_to_online_page(pte_pfn(ptent));
> 
> Oh no.
> 
> vm_normal_page()
> 
> > +	struct ksm_rmap_item *rmap_item;
> > +	struct folio *folio;
> > +
> > +	ksm_scan.address = addr;
> > +
> > +	if (ksm_test_exit(mm))
> > +		return 1;
> > +
> > +	if (!page)
> > +		return 0;
> > +
> > +	folio = page_folio(page);
> > +	if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> > +		return 0;
> > +
> > +	folio_get(folio);
> > +
> > +	flush_anon_page(vma, page, ksm_scan.address);
> > +	flush_dcache_page(page);
> > +	rmap_item = get_next_rmap_item(mm_slot,
> > +		ksm_scan.rmap_list, ksm_scan.address);
> > +	if (rmap_item) {
> > +		ksm_scan.rmap_list =
> > +				&rmap_item->rmap_list;
> > +
> > +		if (should_skip_rmap_item(folio, rmap_item)) {
> > +			folio_put(folio);
> > +			return 0;
> > +		}
> > +		ksm_scan.address = end;
> > +		private->page = page;
> > +	} else
> > +		folio_put(folio);
> > +
> 
> You're under PTL, get_next_rmap_item() will perform an allocation, so that
> won't work.
> 
> Observe how the original code worked around that by performing all magic
> outside of the PTL (folio_walk_end()).
> 
> When you switch to .pmd_entry() (see below) you will be able to handle it.
> 
> What you could also try doing is returing page+folio and letting the caller
> deal with everything starting at the flush_anon_page().
> 
> > +	private->rmap_item = rmap_item;
> > +	return 1;
> > +}
> > +
> > +struct mm_walk_ops walk_ops = {
> > +	.pte_entry = ksm_pte_entry,
> > +	.test_walk = ksm_walk_test,
> > +	.walk_lock = PGWALK_RDLOCK,
> > +};
> 
> It's more complicated: you'd be remapping each PMD to be mapped by PTEs
> first, which is not what we want. You'll have to handle pmd_entry instead of
> pte_entry.
> 
> > +
> >   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> >   {
> >   	struct mm_struct *mm;
> >   	struct ksm_mm_slot *mm_slot;
> >   	struct mm_slot *slot;
> > -	struct vm_area_struct *vma;
> > -	struct ksm_rmap_item *rmap_item;
> > -	struct vma_iterator vmi;
> > -	int nid;
> > +	int nid, ret;
> >   	if (list_empty(&ksm_mm_head.slot.mm_node))
> >   		return NULL;
> > @@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> >   	slot = &mm_slot->slot;
> >   	mm = slot->mm;
> > -	vma_iter_init(&vmi, mm, ksm_scan.address);
> >   	mmap_read_lock(mm);
> >   	if (ksm_test_exit(mm))
> >   		goto no_vmas;
> > -	for_each_vma(vmi, vma) {
> > -		if (!(vma->vm_flags & VM_MERGEABLE))
> > -			continue;
> > -		if (ksm_scan.address < vma->vm_start)
> > -			ksm_scan.address = vma->vm_start;
> > -		if (!vma->anon_vma)
> > -			ksm_scan.address = vma->vm_end;
> > -
> > -		while (ksm_scan.address < vma->vm_end) {
> > -			struct page *tmp_page = NULL;
> > -			struct folio_walk fw;
> > -			struct folio *folio;
> > -
> > -			if (ksm_test_exit(mm))
> > -				break;
> > -
> > -			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> > -			if (folio) {
> > -				if (!folio_is_zone_device(folio) &&
> > -				     folio_test_anon(folio)) {
> > -					folio_get(folio);
> > -					tmp_page = fw.page;
> > -				}
> > -				folio_walk_end(&fw, vma);
> > -			}
> > -
> > -			if (tmp_page) {
> > -				flush_anon_page(vma, tmp_page, ksm_scan.address);
> > -				flush_dcache_page(tmp_page);
> > -				rmap_item = get_next_rmap_item(mm_slot,
> > -					ksm_scan.rmap_list, ksm_scan.address);
> > -				if (rmap_item) {
> > -					ksm_scan.rmap_list =
> > -							&rmap_item->rmap_list;
> > -
> > -					if (should_skip_rmap_item(folio, rmap_item)) {
> > -						folio_put(folio);
> > -						goto next_page;
> > -					}
> > -
> > -					ksm_scan.address += PAGE_SIZE;
> > -					*page = tmp_page;
> > -				} else {
> > -					folio_put(folio);
> > -				}
> > -				mmap_read_unlock(mm);
> > -				return rmap_item;
> > -			}
> > -next_page:
> > -			ksm_scan.address += PAGE_SIZE;
> > -			cond_resched();
> 
> You're dropping all cond_resched(), which will be a problem.
> 
> > -		}
> > +	struct ksm_walk_private walk_private = {
> > +		.page = NULL,
> > +		.rmap_item = NULL,
> > +		.mm_slot = ksm_scan.mm_slot
> > +	};
> 
> empty line missing
> 
> > +	ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> > +	*page = walk_private.page;
> > +	if (ret) {
> > +		mmap_read_unlock(mm);
> > +		return walk_private.rmap_item;
> >   	}
> >   	if (ksm_test_exit(mm)) {
> 
> 
> -- 
> Cheers
> 
> David / dhildenb
> 

Thanks for the explanations, I will send a v2 shortly.