Message-ID: <90ed950a-c3bb-46d5-91f9-338f5ca15af6@redhat.com>
Date: Tue, 14 Oct 2025 11:26:06 +0200
From: David Hildenbrand <david@...hat.com>
To: Pedro Demarchi Gomes <pedrodemargomes@...il.com>,
Andrew Morton <akpm@...ux-foundation.org>, craftfever@...ena.io
Cc: Xu Xin <xu.xin16@....com.cn>, Chengming Zhou <chengming.zhou@...ux.dev>,
linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] ksm: use range-walk function to jump over holes in
scan_get_next_rmap_item
On 14.10.25 07:58, Pedro Demarchi Gomes wrote:
> Currently, scan_get_next_rmap_item() walks every page address in a VMA
> to locate mergeable pages. This becomes highly inefficient when scanning
> large virtual memory areas that contain mostly unmapped regions.
>
> This patch replaces the per-address lookup with a range walk using
> walk_page_range(). The range walker allows KSM to skip over entire
> unmapped holes in a VMA, avoiding unnecessary lookups.
>
> To evaluate this change, I created a test that maps a 1 TB virtual area
> where only the first and last 10 MB are populated with identical data.
> With this patch applied, KSM scanned and merged the region approximately
> seven times faster.
>
> This problem was previously discussed in [1].
>
> [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
>
> Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@...il.com>
> ---
> mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
> 1 file changed, 79 insertions(+), 57 deletions(-)
>
> diff --git a/mm/ksm.c b/mm/ksm.c
> index 3aed0478fdce..584fd987e8ae 100644
> --- a/mm/ksm.c
> +++ b/mm/ksm.c
> @@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
> return true;
> }
>
> +struct ksm_walk_private {
> + struct page *page;
> + struct ksm_rmap_item *rmap_item;
> + struct ksm_mm_slot *mm_slot;
> +};
> +
> +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> +{
> + struct vm_area_struct *vma = walk->vma;
> +
> + if (!vma || !(vma->vm_flags & VM_MERGEABLE))
The anon_vma check should go in here as well.
How can we possibly get !vma?
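Something like this (untested) should do:

        static int ksm_walk_test(unsigned long addr, unsigned long next,
                                 struct mm_walk *walk)
        {
                struct vm_area_struct *vma = walk->vma;

                /* Skip VMAs KSM cannot merge and VMAs without an anon_vma yet. */
                if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
                        return 1;
                return 0;
        }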
> + return 1;
> + return 0;
> +}
> +
> +static int ksm_pte_entry(pte_t *pte, unsigned long addr,
> + unsigned long end, struct mm_walk *walk)
> +{
> + struct mm_struct *mm = walk->mm;
> + struct vm_area_struct *vma = walk->vma;
> + struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> + struct ksm_mm_slot *mm_slot = private->mm_slot;
> + pte_t ptent = ptep_get(pte);
> + struct page *page = pfn_to_online_page(pte_pfn(ptent));
Oh no. pfn_to_online_page() is not how you get from a PTE to the page it
maps; use vm_normal_page() instead.
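That is, something like (untested):

        struct page *page = vm_normal_page(vma, addr, ptent);

        if (!page)
                return 0;

vm_normal_page() also gives you NULL for the mappings (zeropage, PFN
mappings, ...) that we must not touch here.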
> + struct ksm_rmap_item *rmap_item;
> + struct folio *folio;
> +
> + ksm_scan.address = addr;
> +
> + if (ksm_test_exit(mm))
> + return 1;
> +
> + if (!page)
> + return 0;
> +
> + folio = page_folio(page);
> + if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> + return 0;
> +
> + folio_get(folio);
> +
> + flush_anon_page(vma, page, ksm_scan.address);
> + flush_dcache_page(page);
> + rmap_item = get_next_rmap_item(mm_slot,
> + ksm_scan.rmap_list, ksm_scan.address);
> + if (rmap_item) {
> + ksm_scan.rmap_list =
> + &rmap_item->rmap_list;
> +
> + if (should_skip_rmap_item(folio, rmap_item)) {
> + folio_put(folio);
> + return 0;
> + }
> + ksm_scan.address = end;
> + private->page = page;
> + } else
> + folio_put(folio);
> +
You're under PTL, get_next_rmap_item() will perform an allocation, so
that won't work.
Observe how the original code worked around that by performing all magic
outside of the PTL (folio_walk_end()).
When you switch to .pmd_entry() (see below) you will be able to handle it.
What you could also try doing is returning page+folio and letting the
caller deal with everything starting at the flush_anon_page().
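A rough sketch of that variant (untested; it assumes ksm_walk_private
grows folio and vma members):

        /* in the entry handler, still under the PTL: only take a reference */
        folio_get(folio);
        private->page = page;
        private->folio = folio;
        private->vma = vma;
        return 1;       /* stop the walk; ksm_scan.address was already set above */

and then, back in scan_get_next_rmap_item() after walk_page_range() has
returned, do the flush_anon_page()/flush_dcache_page() +
get_next_rmap_item() dance without any page table lock held, dropping the
folio reference again if no rmap_item could be allocated.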
> + private->rmap_item = rmap_item;
> + return 1;
> +}
> +
> +struct mm_walk_ops walk_ops = {
> + .pte_entry = ksm_pte_entry,
> + .test_walk = ksm_walk_test,
> + .walk_lock = PGWALK_RDLOCK,
> +};
It's more complicated: with only a .pte_entry handler the page walker will
split each PMD-mapped THP so that it is mapped by PTEs first, which is not
what we want. You'll have to handle pmd_entry instead of pte_entry.
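Skeleton of what I have in mind (completely untested):

        static int ksm_pmd_entry(pmd_t *pmd, unsigned long addr,
                                 unsigned long end, struct mm_walk *walk)
        {
                struct vm_area_struct *vma = walk->vma;
                pte_t *start_pte, *pte;
                spinlock_t *ptl;

                ptl = pmd_trans_huge_lock(pmd, vma);
                if (ptl) {
                        /* Handle the PMD-mapped THP here without splitting it. */
                        spin_unlock(ptl);
                        return 0;
                }

                start_pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
                if (!start_pte)
                        return 0;
                for (pte = start_pte; addr < end; pte++, addr += PAGE_SIZE) {
                        /* ... look at ptep_get(pte), remember a candidate ... */
                }
                pte_unmap_unlock(start_pte, ptl);
                cond_resched();
                return 0;
        }

With a .pmd_entry handler like that (and no .pte_entry) the walker won't
split the THPs for you, and you also get a natural spot for the
cond_resched().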
> +
> static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> {
> struct mm_struct *mm;
> struct ksm_mm_slot *mm_slot;
> struct mm_slot *slot;
> - struct vm_area_struct *vma;
> - struct ksm_rmap_item *rmap_item;
> - struct vma_iterator vmi;
> - int nid;
> + int nid, ret;
>
> if (list_empty(&ksm_mm_head.slot.mm_node))
> return NULL;
> @@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
>
> slot = &mm_slot->slot;
> mm = slot->mm;
> - vma_iter_init(&vmi, mm, ksm_scan.address);
>
> mmap_read_lock(mm);
> if (ksm_test_exit(mm))
> goto no_vmas;
>
> - for_each_vma(vmi, vma) {
> - if (!(vma->vm_flags & VM_MERGEABLE))
> - continue;
> - if (ksm_scan.address < vma->vm_start)
> - ksm_scan.address = vma->vm_start;
> - if (!vma->anon_vma)
> - ksm_scan.address = vma->vm_end;
> -
> - while (ksm_scan.address < vma->vm_end) {
> - struct page *tmp_page = NULL;
> - struct folio_walk fw;
> - struct folio *folio;
> -
> - if (ksm_test_exit(mm))
> - break;
> -
> - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> - if (folio) {
> - if (!folio_is_zone_device(folio) &&
> - folio_test_anon(folio)) {
> - folio_get(folio);
> - tmp_page = fw.page;
> - }
> - folio_walk_end(&fw, vma);
> - }
> -
> - if (tmp_page) {
> - flush_anon_page(vma, tmp_page, ksm_scan.address);
> - flush_dcache_page(tmp_page);
> - rmap_item = get_next_rmap_item(mm_slot,
> - ksm_scan.rmap_list, ksm_scan.address);
> - if (rmap_item) {
> - ksm_scan.rmap_list =
> - &rmap_item->rmap_list;
> -
> - if (should_skip_rmap_item(folio, rmap_item)) {
> - folio_put(folio);
> - goto next_page;
> - }
> -
> - ksm_scan.address += PAGE_SIZE;
> - *page = tmp_page;
> - } else {
> - folio_put(folio);
> - }
> - mmap_read_unlock(mm);
> - return rmap_item;
> - }
> -next_page:
> - ksm_scan.address += PAGE_SIZE;
> - cond_resched();
You're dropping all of the cond_resched() calls, which will be a problem.
> - }
> + struct ksm_walk_private walk_private = {
> + .page = NULL,
> + .rmap_item = NULL,
> + .mm_slot = ksm_scan.mm_slot
> + };
Empty line missing between the declarations and the code.
> + ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> + *page = walk_private.page;
> + if (ret) {
> + mmap_read_unlock(mm);
> + return walk_private.rmap_item;
> }
>
> if (ksm_test_exit(mm)) {
--
Cheers
David / dhildenb