Message-ID: <90ed950a-c3bb-46d5-91f9-338f5ca15af6@redhat.com>
Date: Tue, 14 Oct 2025 11:26:06 +0200
From: David Hildenbrand <david@...hat.com>
To: Pedro Demarchi Gomes <pedrodemargomes@...il.com>,
 Andrew Morton <akpm@...ux-foundation.org>, craftfever@...ena.io
Cc: Xu Xin <xu.xin16@....com.cn>, Chengming Zhou <chengming.zhou@...ux.dev>,
 linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] ksm: use range-walk function to jump over holes in
 scan_get_next_rmap_item

On 14.10.25 07:58, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
> 
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> 
> To evaluate this change, I created a test that maps a 1 TB virtual area
> where only the first and last 10 MB are populated with identical data.
> With this patch applied, KSM scanned and merged the region approximately
> seven times faster.
> 
> This problem was previously discussed in [1].
> 
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> 
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@...il.com>
> ---
>   mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
>   1 file changed, 79 insertions(+), 57 deletions(-)
> 
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 3aed0478fdce..584fd987e8ae 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
>   	return true;
>   }
>   
> +struct ksm_walk_private {
> +	struct page *page;
> +	struct ksm_rmap_item *rmap_item;
> +	struct ksm_mm_slot *mm_slot;
> +};
> +
> +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> +{
> +	struct vm_area_struct *vma = walk->vma;
> +
> +	if (!vma || !(vma->vm_flags & VM_MERGEABLE))

The anon_vma check should go in here as well.

How can we possibly get !vma?
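
Something like the following (untested); test_walk only runs with
walk->vma set, so the NULL check can go away and the anon_vma check can
be folded in:

static int ksm_walk_test(unsigned long addr, unsigned long next,
			 struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;

	/* Skip VMAs that are not mergeable or have no anon_vma yet. */
	if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
		return 1;
	return 0;
}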

> +		return 1;
> +	return 0;
> +}
> +
> +static int ksm_pte_entry(pte_t *pte, unsigned long addr,
> +			    unsigned long end, struct mm_walk *walk)
> +{
> +	struct mm_struct *mm = walk->mm;
> +	struct vm_area_struct *vma = walk->vma;
> +	struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> +	struct ksm_mm_slot *mm_slot = private->mm_slot;
> +	pte_t ptent = ptep_get(pte);
> +	struct page *page = pfn_to_online_page(pte_pfn(ptent));

Oh no.

Please use vm_normal_page() to translate the PTE into the mapped page;
going through the raw PFN is not the right approach here.
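
That is, something like (untested; the non-present check replaces what
folio_walk_start() used to do for you):

	pte_t ptent = ptep_get(pte);
	struct page *page;

	if (!pte_present(ptent))
		return 0;

	page = vm_normal_page(vma, addr, ptent);
	if (!page)
		return 0;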

> +	struct ksm_rmap_item *rmap_item;
> +	struct folio *folio;
> +
> +	ksm_scan.address = addr;
> +
> +	if (ksm_test_exit(mm))
> +		return 1;
> +
> +	if (!page)
> +		return 0;
> +
> +	folio = page_folio(page);
> +	if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> +		return 0;
> +
> +	folio_get(folio);
> +
> +	flush_anon_page(vma, page, ksm_scan.address);
> +	flush_dcache_page(page);
> +	rmap_item = get_next_rmap_item(mm_slot,
> +		ksm_scan.rmap_list, ksm_scan.address);
> +	if (rmap_item) {
> +		ksm_scan.rmap_list =
> +				&rmap_item->rmap_list;
> +
> +		if (should_skip_rmap_item(folio, rmap_item)) {
> +			folio_put(folio);
> +			return 0;
> +		}
> +		ksm_scan.address = end;
> +		private->page = page;
> +	} else
> +		folio_put(folio);
> +

You're under the PTL here, and get_next_rmap_item() will perform an
allocation (which may sleep), so that won't work.

Observe how the original code worked around that by performing all of
that magic only after dropping the PTL (folio_walk_end()).

When you switch to .pmd_entry() (see below), you will be able to handle it.

What you could also try doing is returning page+folio and letting the
caller deal with everything starting at the flush_anon_page().
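
Roughly (untested; assumes you add a folio pointer to ksm_walk_private):

	/* Under the PTL: only pin the folio and report it back. */
	folio_get(folio);
	private->page = page;
	private->folio = folio;	/* new field */
	return 1;	/* stop the walk */

The caller can then do the flush_anon_page()/flush_dcache_page()/
get_next_rmap_item() part after walk_page_range() returns, still under
the mmap read lock but no longer under the PTL.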

> +	private->rmap_item = rmap_item;
> +	return 1;
> +}
> +
> +struct mm_walk_ops walk_ops = {
> +	.pte_entry = ksm_pte_entry,
> +	.test_walk = ksm_walk_test,
> +	.walk_lock = PGWALK_RDLOCK,
> +};

It's more complicated than that: with only a .pte_entry handler, the
walker would split each PMD-mapped THP so it gets mapped by PTEs first,
which is not what we want. You'll have to handle .pmd_entry instead of
.pte_entry.
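
A rough (untested) skeleton of what that could look like, with the PTE
loop done by the handler itself since there is no .pte_entry:

static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
			 unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;
	spinlock_t *ptl;
	pte_t *start_pte, *pte;

	ptl = pmd_trans_huge_lock(pmd, vma);
	if (ptl) {
		/* PMD-mapped THP: handle the folio here instead of splitting. */
		spin_unlock(ptl);
		return 0;
	}

	start_pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (!start_pte)
		return 0;
	for (pte = start_pte; addr < end; pte++, addr += PAGE_SIZE) {
		/* per-PTE work; nothing that can sleep while holding the PTL */
	}
	pte_unmap_unlock(start_pte, ptl);
	cond_resched();
	return 0;
}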

> +
>   static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   {
>   	struct mm_struct *mm;
>   	struct ksm_mm_slot *mm_slot;
>   	struct mm_slot *slot;
> -	struct vm_area_struct *vma;
> -	struct ksm_rmap_item *rmap_item;
> -	struct vma_iterator vmi;
> -	int nid;
> +	int nid, ret;
>   
>   	if (list_empty(&ksm_mm_head.slot.mm_node))
>   		return NULL;
> @@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>   
>   	slot = &mm_slot->slot;
>   	mm = slot->mm;
> -	vma_iter_init(&vmi, mm, ksm_scan.address);
>   
>   	mmap_read_lock(mm);
>   	if (ksm_test_exit(mm))
>   		goto no_vmas;
>   
> -	for_each_vma(vmi, vma) {
> -		if (!(vma->vm_flags & VM_MERGEABLE))
> -			continue;
> -		if (ksm_scan.address < vma->vm_start)
> -			ksm_scan.address = vma->vm_start;
> -		if (!vma->anon_vma)
> -			ksm_scan.address = vma->vm_end;
> -
> -		while (ksm_scan.address < vma->vm_end) {
> -			struct page *tmp_page = NULL;
> -			struct folio_walk fw;
> -			struct folio *folio;
> -
> -			if (ksm_test_exit(mm))
> -				break;
> -
> -			folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> -			if (folio) {
> -				if (!folio_is_zone_device(folio) &&
> -				     folio_test_anon(folio)) {
> -					folio_get(folio);
> -					tmp_page = fw.page;
> -				}
> -				folio_walk_end(&fw, vma);
> -			}
> -
> -			if (tmp_page) {
> -				flush_anon_page(vma, tmp_page, ksm_scan.address);
> -				flush_dcache_page(tmp_page);
> -				rmap_item = get_next_rmap_item(mm_slot,
> -					ksm_scan.rmap_list, ksm_scan.address);
> -				if (rmap_item) {
> -					ksm_scan.rmap_list =
> -							&rmap_item->rmap_list;
> -
> -					if (should_skip_rmap_item(folio, rmap_item)) {
> -						folio_put(folio);
> -						goto next_page;
> -					}
> -
> -					ksm_scan.address += PAGE_SIZE;
> -					*page = tmp_page;
> -				} else {
> -					folio_put(folio);
> -				}
> -				mmap_read_unlock(mm);
> -				return rmap_item;
> -			}
> -next_page:
> -			ksm_scan.address += PAGE_SIZE;
> -			cond_resched();

You're dropping all cond_resched(), which will be a problem.
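
With a .pmd_entry-based walk, a natural spot would be once per PMD,
right after dropping the PTL, e.g. (as in the sketch above):

	pte_unmap_unlock(start_pte, ptl);
	cond_resched();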

> -		}
> +	struct ksm_walk_private walk_private = {
> +		.page = NULL,
> +		.rmap_item = NULL,
> +		.mm_slot = ksm_scan.mm_slot
> +	};

Empty line missing here, after the declarations.

> +	ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> +	*page = walk_private.page;
> +	if (ret) {
> +		mmap_read_unlock(mm);
> +		return walk_private.rmap_item;
>   	}
>   
>   	if (ksm_test_exit(mm)) {


-- 
Cheers

David / dhildenb

