Message-ID: <d06f4c9d-cf63-4bc4-859d-8256e16dd68d@airmail.cc>
Date: Thu, 23 Oct 2025 09:49:00 +0000
From: craftfever <craftfever@...mail.cc>
To: Pedro Demarchi Gomes <pedrodemargomes@...il.com>,
David Hildenbrand <david@...hat.com>,
Andrew Morton <akpm@...ux-foundation.org>
Cc: Xu Xin <xu.xin16@....com.cn>, Chengming Zhou <chengming.zhou@...ux.dev>,
linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v5] ksm: use range-walk function to jump over holes in
scan_get_next_rmap_item
Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions, causing
> ksmd to use a large amount of CPU without deduplicating many pages.
>
> This patch replaces the per-address lookup with a range walk using
> walk_page_range_vma(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
> This problem was previously discussed in [1].
>
> Consider the following test program which creates a 32 TiB mapping in
> the virtual address space but only populates a single page:
>
> #include <unistd.h>
> #include <stdio.h>
> #include <sys/mman.h>
>
> /* 32 TiB */
> const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;
>
> int main() {
>         char *area = mmap(NULL, size, PROT_READ | PROT_WRITE,
>                           MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
>
>         if (area == MAP_FAILED) {
>                 perror("mmap() failed\n");
>                 return -1;
>         }
>
>         /* Populate a single page such that we get an anon_vma. */
>         *area = 0;
>
>         /* Enable KSM. */
>         madvise(area, size, MADV_MERGEABLE);
>         pause();
>         return 0;
> }
>
> $ ./ksm-sparse &
> $ echo 1 > /sys/kernel/mm/ksm/run
>
> Without this patch, ksmd uses 100% of a CPU for a long time (more than
> 1 hour on my test machine) scanning the entire 32 TiB virtual address
> space, which contains only one mapped page. This leaves ksmd effectively
> stuck, unable to deduplicate anything of value.
> With this patch, ksmd walks only the one mapped page and skips the rest
> of the 32 TiB virtual address space, making the scan fast and cheap in
> CPU time.
>
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>
> ---
> v5:
> - Improve patch description
>
> v4: https://lore.kernel.org/linux-mm/20251022153059.22763-1-pedrodemargomes@gmail.com/
> - Make minimal changes to replace folio_walk by walk_page_range_vma
>
> v3: https://lore.kernel.org/all/20251016012236.4189-1-pedrodemargomes@gmail.com/
> - Treat THPs in ksm_pmd_entry
> - Update ksm_scan.address outside walk_page_range
> - Change goto to while loop
>
> v2: https://lore.kernel.org/all/20251014151126.87589-1-pedrodemargomes@gmail.com/
> - Use pmd_entry to walk page range
> - Use cond_resched inside pmd_entry()
> - walk_page_range returns page+folio
>
> v1: https://lore.kernel.org/all/20251014055828.124522-1-pedrodemargomes@gmail.com/
>
> Reported-by: craftfever <craftfever@...mail.cc>
> Closes: https://lkml.kernel.org/r/020cf8de6e773bb78ba7614ef250129f11a63781@murena.io
> Suggested-by: David Hildenbrand <david@...hat.com>
> Co-developed-by: David Hildenbrand <david@...hat.com>
> Signed-off-by: David Hildenbrand <david@...hat.com>
> Fixes: 31dbd01f3143 ("ksm: Kernel SamePage Merging")
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@...il.com>
> ---
> mm/ksm.c | 113 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 104 insertions(+), 9 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 3aed0478fdce..4f672f4f2140 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
> return true;
> }
>
> +struct ksm_next_page_arg {
> + struct folio *folio;
> + struct page *page;
> + unsigned long addr;
> +};
> +
> +static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
> + struct mm_walk *walk)
> +{
> + struct ksm_next_page_arg *private = walk->private;
> + struct vm_area_struct *vma = walk->vma;
> + pte_t *start_ptep = NULL, *ptep, pte;
> + struct mm_struct *mm = walk->mm;
> + struct folio *folio;
> + struct page *page;
> + spinlock_t *ptl;
> + pmd_t pmd;
> +
> + if (ksm_test_exit(mm))
> + return 0;
> +
> + cond_resched();
> +
> + pmd = pmdp_get_lockless(pmdp);
> + if (!pmd_present(pmd))
> + return 0;
> +
> + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
> + ptl = pmd_lock(mm, pmdp);
> + pmd = pmdp_get(pmdp);
> +
> + if (!pmd_present(pmd)) {
> + goto not_found_unlock;
> + } else if (pmd_leaf(pmd)) {
> + page = vm_normal_page_pmd(vma, addr, pmd);
> + if (!page)
> + goto not_found_unlock;
> + folio = page_folio(page);
> +
> + if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> + goto not_found_unlock;
> +
> + page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
> + goto found_unlock;
> + }
> + spin_unlock(ptl);
> + }
> +
> + start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
> + if (!start_ptep)
> + return 0;
> +
> + for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
> + pte = ptep_get(ptep);
> +
> + if (!pte_present(pte))
> + continue;
> +
> + page = vm_normal_page(vma, addr, pte);
> + if (!page)
> + continue;
> + folio = page_folio(page);
> +
> + if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> + continue;
> + goto found_unlock;
> + }
> +
> +not_found_unlock:
> + spin_unlock(ptl);
> + if (start_ptep)
> + pte_unmap(start_ptep);
> + return 0;
> +found_unlock:
> + folio_get(folio);
> + spin_unlock(ptl);
> + if (start_ptep)
> + pte_unmap(start_ptep);
> + private->page = page;
> + private->folio = folio;
> + private->addr = addr;
> + return 1;
> +}
> +
> +static struct mm_walk_ops ksm_next_page_ops = {
> + .pmd_entry = ksm_next_page_pmd_entry,
> + .walk_lock = PGWALK_RDLOCK,
> +};
> +
> static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> {
> struct mm_struct *mm;
> @@ -2542,21 +2631,27 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> ksm_scan.address = vma->vm_end;
>
> while (ksm_scan.address < vma->vm_end) {
> + struct ksm_next_page_arg ksm_next_page_arg;
> struct page *tmp_page = NULL;
> - struct folio_walk fw;
> struct folio *folio;
>
> if (ksm_test_exit(mm))
> break;
>
> - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> - if (folio) {
> - if (!folio_is_zone_device(folio) &&
> - folio_test_anon(folio)) {
> - folio_get(folio);
> - tmp_page = fw.page;
> - }
> - folio_walk_end(&fw, vma);
> + int found;
> +
> + found = walk_page_range_vma(vma, ksm_scan.address,
> + vma->vm_end,
> + &ksm_next_page_ops,
> + &ksm_next_page_arg);
> +
> + if (found > 0) {
> + folio = ksm_next_page_arg.folio;
> + tmp_page = ksm_next_page_arg.page;
> + ksm_scan.address = ksm_next_page_arg.addr;
> + } else {
> + VM_WARN_ON_ONCE(found < 0);
> + ksm_scan.address = vma->vm_end - PAGE_SIZE;
> }
>
> if (tmp_page) {

Thank you, it works magnificently, very pleasant. Very low CPU consumption
and effective, no crashes. Is this the final version?
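
In case it helps anyone else testing this, below is a rough userspace
sketch (not part of the patch) of one way to observe the new behaviour:
it places two identical pages at opposite ends of a sparse 32 TiB
MADV_MERGEABLE mapping and polls /sys/kernel/mm/ksm/full_scans and
pages_sharing. With the range walk, ksmd should reach and merge both
pages after a few scan cycles instead of crawling through the hole in
between. It assumes KSM has already been started
("echo 1 > /sys/kernel/mm/ksm/run") with its default tunables; the
30-second polling window is an arbitrary choice.

/*
 * Rough check, not part of the patch: put two identical pages at opposite
 * ends of a sparse 32 TiB MADV_MERGEABLE mapping and watch the KSM sysfs
 * counters.  Assumes KSM is already running
 * ("echo 1 > /sys/kernel/mm/ksm/run") with its default settings.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

static long read_ksm_stat(const char *name)
{
        char path[128];
        long val = -1;
        FILE *f;

        snprintf(path, sizeof(path), "/sys/kernel/mm/ksm/%s", name);
        f = fopen(path, "r");
        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        const size_t size = 32ul * 1024 * 1024 * 1024 * 1024;  /* 32 TiB */
        long psz = sysconf(_SC_PAGESIZE);
        char *area;
        int i;

        area = mmap(NULL, size, PROT_READ | PROT_WRITE,
                    MAP_NORESERVE | MAP_PRIVATE | MAP_ANON, -1, 0);
        if (area == MAP_FAILED) {
                perror("mmap");
                return 1;
        }

        /* Two identical (non-zero) pages, one at each end of the mapping. */
        memset(area, 0x5a, psz);
        memset(area + size - psz, 0x5a, psz);

        if (madvise(area, size, MADV_MERGEABLE)) {
                perror("madvise");
                return 1;
        }

        /* pages_sharing going up means ksmd reached both ends of the VMA. */
        for (i = 0; i < 30; i++) {
                printf("full_scans=%ld pages_sharing=%ld\n",
                       read_ksm_stat("full_scans"),
                       read_ksm_stat("pages_sharing"));
                sleep(1);
        }
        return 0;
}

Note that pages_sharing is a system-wide counter, so on a busy machine
other workloads can move it as well; running this on an otherwise idle
KSM setup gives a cleaner signal.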