[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <qrf6mertexxf3hcazexkv6jxokdg5jnsbuixih62w2peq3ooyl@u3mhuopdjrfc>
Date: Tue, 14 Oct 2025 10:36:49 -0300
From: Pedro Demarchi Gomes <pedrodemargomes@...il.com>
To: David Hildenbrand <david@...hat.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>, craftfever@...ena.io,
Xu Xin <xu.xin16@....com.cn>, Chengming Zhou <chengming.zhou@...ux.dev>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH] ksm: use range-walk function to jump over holes in
scan_get_next_rmap_item
On Tue, Oct 14, 2025 at 11:26:06AM +0200, David Hildenbrand wrote:
> On 14.10.25 07:58, Pedro Demarchi Gomes wrote:
> > Currently, scan_get_next_rmap_item() walks every page address in a VMA
> > to locate mergeable pages. This becomes highly inefficient when scanning
> > large virtual memory areas that contain mostly unmapped regions.
> >
> > This patch replaces the per-address lookup with a range walk using
> > walk_page_range(). The range walker allows KSM to skip over entire
> > unmapped holes in a VMA, avoiding unnecessary lookups.
> >
> > To evaluate this change, I created a test that maps a 1 TB virtual area
> > where only the first and last 10 MB are populated with identical data.
> > With this patch applied, KSM scanned and merged the region approximately
> > seven times faster.
> >
> > This problem was previously discussed in [1].
> >
> > [1] https://lore.kernel.org/linux-mm/423de7a3-1c62-4e72-8e79-19a6413e420c@redhat.com/
> >
> > Signed-off-by: Pedro Demarchi Gomes <pedrodemargomes@...il.com>
> > ---
> > mm/ksm.c | 136 ++++++++++++++++++++++++++++++++-----------------------
> > 1 file changed, 79 insertions(+), 57 deletions(-)
> >
> > diff --git a/mm/ksm.c b/mm/ksm.c
> > index 3aed0478fdce..584fd987e8ae 100644
> > --- a/mm/ksm.c
> > +++ b/mm/ksm.c
> > @@ -2455,15 +2455,80 @@ static bool should_skip_rmap_item(struct folio *folio,
> > return true;
> > }
> > +struct ksm_walk_private {
> > + struct page *page;
> > + struct ksm_rmap_item *rmap_item;
> > + struct ksm_mm_slot *mm_slot;
> > +};
> > +
> > +static int ksm_walk_test(unsigned long addr, unsigned long next, struct mm_walk *walk)
> > +{
> > + struct vm_area_struct *vma = walk->vma;
> > +
> > + if (!vma || !(vma->vm_flags & VM_MERGEABLE))
>
> The anon_vma check should go in here as well.
>
> How can we possibly get !vma?
>
> > + return 1;
> > + return 0;
> > +}
> > +
> > +static int ksm_pte_entry(pte_t *pte, unsigned long addr,
> > + unsigned long end, struct mm_walk *walk)
> > +{
> > + struct mm_struct *mm = walk->mm;
> > + struct vm_area_struct *vma = walk->vma;
> > + struct ksm_walk_private *private = (struct ksm_walk_private *) walk->private;
> > + struct ksm_mm_slot *mm_slot = private->mm_slot;
> > + pte_t ptent = ptep_get(pte);
> > + struct page *page = pfn_to_online_page(pte_pfn(ptent));
>
> Oh no.
>
> vm_normal_page()
>
> > + struct ksm_rmap_item *rmap_item;
> > + struct folio *folio;
> > +
> > + ksm_scan.address = addr;
> > +
> > + if (ksm_test_exit(mm))
> > + return 1;
> > +
> > + if (!page)
> > + return 0;
> > +
> > + folio = page_folio(page);
> > + if (folio_is_zone_device(folio) || !folio_test_anon(folio))
> > + return 0;
> > +
> > + folio_get(folio);
> > +
> > + flush_anon_page(vma, page, ksm_scan.address);
> > + flush_dcache_page(page);
> > + rmap_item = get_next_rmap_item(mm_slot,
> > + ksm_scan.rmap_list, ksm_scan.address);
> > + if (rmap_item) {
> > + ksm_scan.rmap_list =
> > + &rmap_item->rmap_list;
> > +
> > + if (should_skip_rmap_item(folio, rmap_item)) {
> > + folio_put(folio);
> > + return 0;
> > + }
> > + ksm_scan.address = end;
> > + private->page = page;
> > + } else
> > + folio_put(folio);
> > +
>
> You're under PTL, get_next_rmap_item() will perform an allocation, so that
> won't work.
>
> Observe how the original code worked around that by performing all magic
> outside of the PTL (folio_walk_end()).
>
> When you switch to .pmd_entry() (see below) you will be able to handle it.
>
> What you could also try doing is returing page+folio and letting the caller
> deal with everything starting at the flush_anon_page().
>
> > + private->rmap_item = rmap_item;
> > + return 1;
> > +}
> > +
> > +struct mm_walk_ops walk_ops = {
> > + .pte_entry = ksm_pte_entry,
> > + .test_walk = ksm_walk_test,
> > + .walk_lock = PGWALK_RDLOCK,
> > +};
>
> It's more complicated: you'd be remapping each PMD to be mapped by PTEs
> first, which is not what we want. You'll have to handle pmd_entry instead of
> pte_entry.
>
> > +
> > static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> > {
> > struct mm_struct *mm;
> > struct ksm_mm_slot *mm_slot;
> > struct mm_slot *slot;
> > - struct vm_area_struct *vma;
> > - struct ksm_rmap_item *rmap_item;
> > - struct vma_iterator vmi;
> > - int nid;
> > + int nid, ret;
> > if (list_empty(&ksm_mm_head.slot.mm_node))
> > return NULL;
> > @@ -2527,64 +2592,21 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
> > slot = &mm_slot->slot;
> > mm = slot->mm;
> > - vma_iter_init(&vmi, mm, ksm_scan.address);
> > mmap_read_lock(mm);
> > if (ksm_test_exit(mm))
> > goto no_vmas;
> > - for_each_vma(vmi, vma) {
> > - if (!(vma->vm_flags & VM_MERGEABLE))
> > - continue;
> > - if (ksm_scan.address < vma->vm_start)
> > - ksm_scan.address = vma->vm_start;
> > - if (!vma->anon_vma)
> > - ksm_scan.address = vma->vm_end;
> > -
> > - while (ksm_scan.address < vma->vm_end) {
> > - struct page *tmp_page = NULL;
> > - struct folio_walk fw;
> > - struct folio *folio;
> > -
> > - if (ksm_test_exit(mm))
> > - break;
> > -
> > - folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
> > - if (folio) {
> > - if (!folio_is_zone_device(folio) &&
> > - folio_test_anon(folio)) {
> > - folio_get(folio);
> > - tmp_page = fw.page;
> > - }
> > - folio_walk_end(&fw, vma);
> > - }
> > -
> > - if (tmp_page) {
> > - flush_anon_page(vma, tmp_page, ksm_scan.address);
> > - flush_dcache_page(tmp_page);
> > - rmap_item = get_next_rmap_item(mm_slot,
> > - ksm_scan.rmap_list, ksm_scan.address);
> > - if (rmap_item) {
> > - ksm_scan.rmap_list =
> > - &rmap_item->rmap_list;
> > -
> > - if (should_skip_rmap_item(folio, rmap_item)) {
> > - folio_put(folio);
> > - goto next_page;
> > - }
> > -
> > - ksm_scan.address += PAGE_SIZE;
> > - *page = tmp_page;
> > - } else {
> > - folio_put(folio);
> > - }
> > - mmap_read_unlock(mm);
> > - return rmap_item;
> > - }
> > -next_page:
> > - ksm_scan.address += PAGE_SIZE;
> > - cond_resched();
>
> You're dropping all cond_resched(), which will be a problem.
>
> > - }
> > + struct ksm_walk_private walk_private = {
> > + .page = NULL,
> > + .rmap_item = NULL,
> > + .mm_slot = ksm_scan.mm_slot
> > + };
>
> empty line missing
>
> > + ret = walk_page_range(mm, ksm_scan.address, -1, &walk_ops, (void *) &walk_private);
> > + *page = walk_private.page;
> > + if (ret) {
> > + mmap_read_unlock(mm);
> > + return walk_private.rmap_item;
> > }
> > if (ksm_test_exit(mm)) {
>
>
> --
> Cheers
>
> David / dhildenb
>
Thanks for the explanations, I will send a v2 shortly.
Powered by blists - more mailing lists