linux-kernel - Re: [PATCH 2/3] userfaultfd: UFFDIO

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <CAJuCfpHFOLLDC4z9RWMwJdFr3pkPQYH9_XFK4GNrPX9=dO9VpA@mail.gmail.com>
Date:   Tue, 19 Sep 2023 16:40:59 -0700
From:   Suren Baghdasaryan <surenb@...gle.com>
To:     Jann Horn <jannh@...gle.com>
Cc:     akpm@...ux-foundation.org, viro@...iv.linux.org.uk,
        brauner@...nel.org, shuah@...nel.org, aarcange@...hat.com,
        lokeshgidra@...gle.com, peterx@...hat.com, david@...hat.com,
        hughd@...gle.com, mhocko@...e.com, axelrasmussen@...gle.com,
        rppt@...nel.org, willy@...radead.org, Liam.Howlett@...cle.com,
        zhangpeng362@...wei.com, bgeffon@...gle.com,
        kaleshsingh@...gle.com, ngeoffray@...gle.com, jdduke@...gle.com,
        linux-mm@...ck.org, linux-fsdevel@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-kselftest@...r.kernel.org,
        kernel-team@...roid.com
Subject: Re: [PATCH 2/3] userfaultfd: UFFDIO_REMAP uABI

On Tue, Sep 19, 2023 at 4:08 PM Suren Baghdasaryan <surenb@...gle.com> wrote:
>
> On Thu, Sep 14, 2023 at 7:28 PM Jann Horn <jannh@...gle.com> wrote:
> >
> > On Thu, Sep 14, 2023 at 5:26 PM Suren Baghdasaryan <surenb@...gle.com> wrote:
> > >
> > > From: Andrea Arcangeli <aarcange@...hat.com>
> > >
> > > This implements the uABI of UFFDIO_REMAP.
> > >
> > > Notably one mode bitflag is also forwarded (and in turn known) by the
> > > lowlevel remap_pages method.
> > >
> > > Signed-off-by: Andrea Arcangeli <aarcange@...hat.com>
> > > Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>
> > > ---
> > >  fs/userfaultfd.c                 |  49 +++
> > >  include/linux/rmap.h             |   5 +
> > >  include/linux/userfaultfd_k.h    |  17 +
> > >  include/uapi/linux/userfaultfd.h |  22 ++
> > >  mm/huge_memory.c                 | 118 +++++++
> > >  mm/khugepaged.c                  |   3 +
> > >  mm/userfaultfd.c                 | 586 +++++++++++++++++++++++++++++++
> > >  7 files changed, 800 insertions(+)
> > >
> > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
> > > index 56eaae9dac1a..7bf64e7541c1 100644
> > > --- a/fs/userfaultfd.c
> > > +++ b/fs/userfaultfd.c
> > > @@ -2027,6 +2027,52 @@ static inline unsigned int uffd_ctx_features(__u64 user_features)
> > >         return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
> > >  }
> > >
> > > +static int userfaultfd_remap(struct userfaultfd_ctx *ctx,
> > > +                            unsigned long arg)
> > > +{
> > > +       __s64 ret;
> > > +       struct uffdio_remap uffdio_remap;
> > > +       struct uffdio_remap __user *user_uffdio_remap;
> > > +       struct userfaultfd_wake_range range;
> > > +
> > > +       user_uffdio_remap = (struct uffdio_remap __user *) arg;
> > > +
> > > +       ret = -EFAULT;
> > > +       if (copy_from_user(&uffdio_remap, user_uffdio_remap,
> > > +                          /* don't copy "remap" last field */
> > > +                          sizeof(uffdio_remap)-sizeof(__s64)))
> > > +               goto out;
> > > +
> > > +       ret = validate_range(ctx->mm, uffdio_remap.dst, uffdio_remap.len);
> > > +       if (ret)
> > > +               goto out;
> > > +       ret = validate_range(current->mm, uffdio_remap.src, uffdio_remap.len);
> > > +       if (ret)
> > > +               goto out;
> > > +       ret = -EINVAL;
> > > +       if (uffdio_remap.mode & ~(UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES|
> > > +                                 UFFDIO_REMAP_MODE_DONTWAKE))
> > > +               goto out;
> >
>
> Sorry for the delay with the answers...
>
> > Do you not need mmget_not_zero(ctx->mm) to make sure the MM can't be
> > concurrently torn down while remap_pages() is running, similar to what
> > the other userfaultfd ioctl handlers do?
>
> Yes, I do need that. Will add.
>
>
> >
> > > +       ret = remap_pages(ctx->mm, current->mm,
> > > +                         uffdio_remap.dst, uffdio_remap.src,
> > > +                         uffdio_remap.len, uffdio_remap.mode);
> > > +       if (unlikely(put_user(ret, &user_uffdio_remap->remap)))
> > > +               return -EFAULT;
> > > +       if (ret < 0)
> > > +               goto out;
> > > +       /* len == 0 would wake all */
> > > +       BUG_ON(!ret);
> > > +       range.len = ret;
> > > +       if (!(uffdio_remap.mode & UFFDIO_REMAP_MODE_DONTWAKE)) {
> > > +               range.start = uffdio_remap.dst;
> > > +               wake_userfault(ctx, &range);
> > > +       }
> > > +       ret = range.len == uffdio_remap.len ? 0 : -EAGAIN;
> > > +out:
> > > +       return ret;
> > > +}
> > [...]
> > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> > > index 064fbd90822b..c7a9880a1f6a 100644
> > > --- a/mm/huge_memory.c
> > > +++ b/mm/huge_memory.c
> > > @@ -1932,6 +1932,124 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
> > >         return ret;
> > >  }
> > >
> > > +#ifdef CONFIG_USERFAULTFD
> > > +/*
> > > + * The PT lock for src_pmd and the mmap_lock for reading are held by
> > > + * the caller, but it must return after releasing the
> > > + * page_table_lock. We're guaranteed the src_pmd is a pmd_trans_huge
> > > + * until the PT lock of the src_pmd is released. Just move the page
> > > + * from src_pmd to dst_pmd if possible. Return zero if succeeded in
> > > + * moving the page, -EAGAIN if it needs to be repeated by the caller,
> > > + * or other errors in case of failure.
> > > + */
> > > +int remap_pages_huge_pmd(struct mm_struct *dst_mm,
> > > +                        struct mm_struct *src_mm,
> > > +                        pmd_t *dst_pmd, pmd_t *src_pmd,
> > > +                        pmd_t dst_pmdval,
> > > +                        struct vm_area_struct *dst_vma,
> > > +                        struct vm_area_struct *src_vma,
> > > +                        unsigned long dst_addr,
> > > +                        unsigned long src_addr)
> > > +{
> > > +       pmd_t _dst_pmd, src_pmdval;
> > > +       struct page *src_page;
> > > +       struct anon_vma *src_anon_vma, *dst_anon_vma;
> > > +       spinlock_t *src_ptl, *dst_ptl;
> > > +       pgtable_t pgtable;
> > > +       struct mmu_notifier_range range;
> > > +
> > > +       src_pmdval = *src_pmd;
> > > +       src_ptl = pmd_lockptr(src_mm, src_pmd);
> > > +
> > > +       BUG_ON(!pmd_trans_huge(src_pmdval));
> > > +       BUG_ON(!pmd_none(dst_pmdval));
> >
> > Why can we assert that pmd_none(dst_pmdval) is true here? Can we not
> > have concurrent faults (or userfaultfd operations) populating that
> > PMD?
>
> IIUC dst_pmdval is a copy of the value from dst_pmd, so that local
> copy should not change even if some concurrent operation changes
> dst_pmd. We can assert that it's pmd_none because we checked for that
> before calling remap_pages_huge_pmd. Later on we check if dst_pmd
> changed from under us (see pmd_same(*dst_pmd, dst_pmdval) check) and
> retry if that happened.
>
> >
> > > +       BUG_ON(!spin_is_locked(src_ptl));
> > > +       mmap_assert_locked(src_mm);
> > > +       mmap_assert_locked(dst_mm);
> > > +       BUG_ON(src_addr & ~HPAGE_PMD_MASK);
> > > +       BUG_ON(dst_addr & ~HPAGE_PMD_MASK);
> > > +
> > > +       src_page = pmd_page(src_pmdval);
> > > +       BUG_ON(!PageHead(src_page));
> > > +       BUG_ON(!PageAnon(src_page));
> > > +       if (unlikely(page_mapcount(src_page) != 1)) {
> > > +               spin_unlock(src_ptl);
> > > +               return -EBUSY;
> > > +       }
> > > +
> > > +       get_page(src_page);
> > > +       spin_unlock(src_ptl);
> > > +
> > > +       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm, src_addr,
> > > +                               src_addr + HPAGE_PMD_SIZE);
> > > +       mmu_notifier_invalidate_range_start(&range);
> > > +
> > > +       /* block all concurrent rmap walks */
> > > +       lock_page(src_page);
> > > +
> > > +       /*
> > > +        * split_huge_page walks the anon_vma chain without the page
> > > +        * lock. Serialize against it with the anon_vma lock, the page
> > > +        * lock is not enough.
> > > +        */
> > > +       src_anon_vma = folio_get_anon_vma(page_folio(src_page));
> > > +       if (!src_anon_vma) {
> > > +               unlock_page(src_page);
> > > +               put_page(src_page);
> > > +               mmu_notifier_invalidate_range_end(&range);
> > > +               return -EAGAIN;
> > > +       }
> > > +       anon_vma_lock_write(src_anon_vma);
> > > +
> > > +       dst_ptl = pmd_lockptr(dst_mm, dst_pmd);
> > > +       double_pt_lock(src_ptl, dst_ptl);
> > > +       if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
> > > +                    !pmd_same(*dst_pmd, dst_pmdval) ||
> > > +                    page_mapcount(src_page) != 1)) {
> > > +               double_pt_unlock(src_ptl, dst_ptl);
> > > +               anon_vma_unlock_write(src_anon_vma);
> > > +               put_anon_vma(src_anon_vma);
> > > +               unlock_page(src_page);
> > > +               put_page(src_page);
> > > +               mmu_notifier_invalidate_range_end(&range);
> > > +               return -EAGAIN;
> > > +       }
> > > +
> > > +       BUG_ON(!PageHead(src_page));
> > > +       BUG_ON(!PageAnon(src_page));
> > > +       /* the PT lock is enough to keep the page pinned now */
> > > +       put_page(src_page);
> > > +
> > > +       dst_anon_vma = (void *) dst_vma->anon_vma + PAGE_MAPPING_ANON;
> > > +       WRITE_ONCE(src_page->mapping, (struct address_space *) dst_anon_vma);
> > > +       WRITE_ONCE(src_page->index, linear_page_index(dst_vma, dst_addr));
> > > +
> > > +       if (!pmd_same(pmdp_huge_clear_flush(src_vma, src_addr, src_pmd),
> > > +                     src_pmdval))
> > > +               BUG_ON(1);
> >
> > I'm not sure we can assert that the PMDs are exactly equal; the CPU
> > might have changed the A/D bits under us?
>
> Yes. I wonder if I can simply remove the BUG_ON here like this:
>
> src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
>
> Technically we don't use src_pmdval after this but for the possible
> future use that would keep things correct. If A/D bits changed from
> under us we will still copy correct values into dst_pmd.
>
> >
> > > +       _dst_pmd = mk_huge_pmd(src_page, dst_vma->vm_page_prot);
> > > +       _dst_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
> > > +       set_pmd_at(dst_mm, dst_addr, dst_pmd, _dst_pmd);
> > > +
> > > +       pgtable = pgtable_trans_huge_withdraw(src_mm, src_pmd);
> > > +       pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
> >
> > Are we allowed to move page tables between mm_structs on all
> > architectures? The first example I found that looks a bit dodgy,
> > looking through various architectures' pte_alloc_one(), is s390's
> > page_table_alloc() which looks like page tables are tied to per-MM
> > lists sometimes.
> > If that's not allowed, we might have to allocate a new deposit table
> > and free the old one or something like that.
>
> Hmm. Yeah, looks like in the case of !CONFIG_PGSTE the table can be
> linked to mm->context.pgtable_list, so can't be moved to another mm. I
> guess I'll have to keep a pgtable allocated, ready to be deposited and
> free the old one. Maybe it's worth having an arch-specific function
> indicating whether moving a pgtable between MMs is supported? Or do it
> separately as an optimization. WDYT?
>
> >
> > > +       if (dst_mm != src_mm) {
> > > +               add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > > +               add_mm_counter(src_mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> > > +       }
> > > +       double_pt_unlock(src_ptl, dst_ptl);
> > > +
> > > +       anon_vma_unlock_write(src_anon_vma);
> > > +       put_anon_vma(src_anon_vma);
> > > +
> > > +       /* unblock rmap walks */
> > > +       unlock_page(src_page);
> > > +
> > > +       mmu_notifier_invalidate_range_end(&range);
> > > +       return 0;
> > > +}
> > > +#endif /* CONFIG_USERFAULTFD */
> > > +
> > >  /*
> > >   * Returns page table lock pointer if a given pmd maps a thp, NULL otherwise.
> > >   *
> > [...]
> > > diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> > > index 96d9eae5c7cc..0cca60dfa8f8 100644
> > > --- a/mm/userfaultfd.c
> > > +++ b/mm/userfaultfd.c
> > [...]
> > > +ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > > +                   unsigned long dst_start, unsigned long src_start,
> > > +                   unsigned long len, __u64 mode)
> > > +{
> > [...]
> > > +
> > > +       if (pgprot_val(src_vma->vm_page_prot) !=
> > > +           pgprot_val(dst_vma->vm_page_prot))
> > > +               goto out;
> >
> > Does this check intentionally allow moving pages from a
> > PROT_READ|PROT_WRITE anonymous private VMA into a PROT_READ anonymous
> > private VMA (on architectures like x86 and arm64 where CoW memory has
> > the same protection flags as read-only memory), but forbid moving them
> > from a PROT_READ|PROT_EXEC VMA into a PROT_READ VMA? I think this
> > check needs at least a comment to explain what's going on here.
>
> The check is simply to ensure the VMAs have the same access
> permissions to prevent page copies that should have different
> permissions. The fact that x86 and arm64 have the same protection bits
> for R/O and COW memory is a "side-effect" IMHO. I'm not sure what
> comment would be good here but I'm open to suggestions.
>
> >
> > > +       /* only allow remapping if both are mlocked or both aren't */
> > > +       if ((src_vma->vm_flags & VM_LOCKED) ^ (dst_vma->vm_flags & VM_LOCKED))
> > > +               goto out;
> > > +
> > > +       /*
> > > +        * Be strict and only allow remap_pages if either the src or
> > > +        * dst range is registered in the userfaultfd to prevent
> > > +        * userland errors going unnoticed. As far as the VM
> > > +        * consistency is concerned, it would be perfectly safe to
> > > +        * remove this check, but there's no useful usage for
> > > +        * remap_pages ouside of userfaultfd registered ranges. This
> > > +        * is after all why it is an ioctl belonging to the
> > > +        * userfaultfd and not a syscall.
> > > +        *
> > > +        * Allow both vmas to be registered in the userfaultfd, just
> > > +        * in case somebody finds a way to make such a case useful.
> > > +        * Normally only one of the two vmas would be registered in
> > > +        * the userfaultfd.
> > > +        */
> > > +       if (!dst_vma->vm_userfaultfd_ctx.ctx &&
> > > +           !src_vma->vm_userfaultfd_ctx.ctx)
> > > +               goto out;
> > > +
> > > +       /*
> > > +        * FIXME: only allow remapping across anonymous vmas,
> > > +        * tmpfs should be added.
> > > +        */
> > > +       if (src_vma->vm_ops || dst_vma->vm_ops)
> > > +               goto out;
> >
> > I don't think it's okay to check for anonymous VMAs by checking
> > ->vm_ops. There are some weird drivers whose ->mmap helpers don't set
> > ->vm_ops and instead just shove all the necessary PTEs into the VMA
> > right on ->mmap, so I think they end up with ->vm_ops==NULL. For
> > example, kcov_mmap() looks that way. I'm not sure how common this is.
> >
> > Though, uuuuuh, I guess if that's true, the existing
> > vma_is_anonymous() is broken, since that also just checks ->vm_ops?
> > I'm not sure what the consequences of that would be... Either way,
> > vma_is_anonymous() might be the better way to check for anonymous VMAs
> > here, and someone should figure out whether vma_is_anonymous() needs
> > to be fixed.
>
> Absolutely, vma_is_anonymous() should have been used here. Will fix it.
>
> >
> > > +       /*
> > > +        * Ensure the dst_vma has a anon_vma or this page
> > > +        * would get a NULL anon_vma when moved in the
> > > +        * dst_vma.
> > > +        */
> > > +       err = -ENOMEM;
> > > +       if (unlikely(anon_vma_prepare(dst_vma)))
> > > +               goto out;
> > > +
> > > +       for (src_addr = src_start, dst_addr = dst_start;
> > > +            src_addr < src_start + len;) {
> > > +               spinlock_t *ptl;
> > > +               pmd_t dst_pmdval;
> > > +
> > > +               BUG_ON(dst_addr >= dst_start + len);
> > > +               src_pmd = mm_find_pmd(src_mm, src_addr);
> >
> > (this would blow up pretty badly if we could have transparent huge PUD
> > in the region but I think that's limited to file VMAs so it's fine as
> > it currently is)
>
> Should I add a comment here as a warning if in the future we decide to
> implement support for file-backed pages?
>
> >
> > > +               if (unlikely(!src_pmd)) {
> > > +                       if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {
> > > +                               err = -ENOENT;
> > > +                               break;
> > > +                       }
> > > +                       src_pmd = mm_alloc_pmd(src_mm, src_addr);
> > > +                       if (unlikely(!src_pmd)) {
> > > +                               err = -ENOMEM;
> > > +                               break;
> > > +                       }
> > > +               }
> > > +               dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
> > > +               if (unlikely(!dst_pmd)) {
> > > +                       err = -ENOMEM;
> > > +                       break;
> > > +               }
> > > +
> > > +               dst_pmdval = pmdp_get_lockless(dst_pmd);
> > > +               /*
> > > +                * If the dst_pmd is mapped as THP don't
> > > +                * override it and just be strict.
> > > +                */
> > > +               if (unlikely(pmd_trans_huge(dst_pmdval))) {
> > > +                       err = -EEXIST;
> > > +                       break;
> > > +               }
> >
> > This check is racy because the dst_pmd can still change at this point,
> > from previously pointing to a zeroed PMD to now pointing to a
> > hugepage, right? And we rely on remap_pages_pte() and
> > remap_pages_huge_pmd() to recheck for that?
> > If yes, maybe add a comment noting this and explaining why we want this check.
>
> Yes, you are right. remap_pages_huge_pmd() does check
> pmd_same(*dst_pmd, dst_pmdval) after doing double_pt_lock(), so it
> will retry and exit on this check if dst_pmd got mapped as THP from
> under us. remap_pages_pte() OTOH simply does
> BUG_ON(pmd_trans_huge(*dst_pmd)), which is wrong. Instead it should
> also check if PMD value changed from under us and retry in such a
> case. I'll fix that and will add a comment here about this racy check
> and the retry logic.

Actually, remap_pages_pte() also indirectly handles the case when
dst_pmd changes from under us with this code:

dst_pte = pte_offset_map_nolock(dst_mm, dst_pmd, dst_addr, &dst_ptl);
/* If an huge pmd materialized from under us fail */
if (unlikely(!dst_pte)) {
        err = -EFAULT;
        goto out;
}

so, it would fail if someone concurrently mapped a THP at that
location. Seems like this race is handled, so the only thing left here
is to add a comment clarifying this.

>
> >
> > > +               ptl = pmd_trans_huge_lock(src_pmd, src_vma);
> > > +               if (ptl) {
> > > +                       /*
> > > +                        * Check if we can move the pmd without
> > > +                        * splitting it. First check the address
> > > +                        * alignment to be the same in src/dst.  These
> > > +                        * checks don't actually need the PT lock but
> > > +                        * it's good to do it here to optimize this
> > > +                        * block away at build time if
> > > +                        * CONFIG_TRANSPARENT_HUGEPAGE is not set.
> > > +                        */
> > > +                       if (thp_aligned == -1)
> > > +                               thp_aligned = ((src_addr & ~HPAGE_PMD_MASK) ==
> > > +                                              (dst_addr & ~HPAGE_PMD_MASK));
> > > +                       if (!thp_aligned || (src_addr & ~HPAGE_PMD_MASK) ||
> >
> > This seems overly complicated, the only case when you can move a huge
> > PMD is if both addresses are hugepage-aligned and you have enough
> > length for one hugepage:
> >
> > (src_addr & ~HPAGE_PMD_MASK) == 0 && (dst_addr & ~HPAGE_PMD_MASK) == 0
> > && (src_start + len - src_addr >= HPAGE_PMD_SIZE).
>
> Ack.
>
> >
> > > +                           !pmd_none(dst_pmdval) ||
> > > +                           src_start + len - src_addr < HPAGE_PMD_SIZE) {
> > > +                               spin_unlock(ptl);
> > > +                               /* Fall through */
> > > +                               split_huge_pmd(src_vma, src_pmd, src_addr);
> > > +                       } else {
> > > +                               err = remap_pages_huge_pmd(dst_mm,
> > > +                                                          src_mm,
> > > +                                                          dst_pmd,
> > > +                                                          src_pmd,
> > > +                                                          dst_pmdval,
> > > +                                                          dst_vma,
> > > +                                                          src_vma,
> > > +                                                          dst_addr,
> > > +                                                          src_addr);
> > > +                               cond_resched();
> > > +
> > > +                               if (!err) {
> > > +                                       dst_addr += HPAGE_PMD_SIZE;
> > > +                                       src_addr += HPAGE_PMD_SIZE;
> > > +                                       moved += HPAGE_PMD_SIZE;
> > > +                               }
> > > +
> > > +                               if ((!err || err == -EAGAIN) &&
> > > +                                   fatal_signal_pending(current))
> > > +                                       err = -EINTR;
> > > +
> > > +                               if (err && err != -EAGAIN)
> > > +                                       break;
> > > +
> > > +                               continue;
> > > +                       }
> > > +               }
> > > +
> > > +               if (pmd_none(*src_pmd)) {
> > > +                       if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {
> > > +                               err = -ENOENT;
> > > +                               break;
> > > +                       }
> > > +                       if (unlikely(__pte_alloc(src_mm, src_pmd))) {
> > > +                               err = -ENOMEM;
> > > +                               break;
> > > +                       }
> > > +               }
> > > +
> > > +               if (unlikely(pmd_none(dst_pmdval)) &&
> > > +                   unlikely(__pte_alloc(dst_mm, dst_pmd))) {
> >
> > Maybe just use pte_alloc() here?
>
> Ack.
>
> >
> > > +                       err = -ENOMEM;
> > > +                       break;
> > > +               }
> > > +
> > > +               err = remap_pages_pte(dst_mm, src_mm,
> > > +                                     dst_pmd, src_pmd,
> > > +                                     dst_vma, src_vma,
> > > +                                     dst_addr, src_addr,
> > > +                                     mode);
> > > +
> > > +               cond_resched();
> > > +
> > > +               if (!err) {
> > > +                       dst_addr += PAGE_SIZE;
> > > +                       src_addr += PAGE_SIZE;
> > > +                       moved += PAGE_SIZE;
> > > +               }
> > > +
> > > +               if ((!err || err == -EAGAIN) &&
> > > +                   fatal_signal_pending(current))
> > > +                       err = -EINTR;
> > > +
> > > +               if (err && err != -EAGAIN)
> > > +                       break;
> > > +       }
> > > +
> > > +out:
> > > +       mmap_read_unlock(dst_mm);
> > > +       if (dst_mm != src_mm)
> > > +               mmap_read_unlock(src_mm);
> > > +       BUG_ON(moved < 0);
> > > +       BUG_ON(err > 0);
> > > +       BUG_ON(!moved && !err);
> > > +       return moved ? moved : err;
> > > +}
> >
> > Maybe you could try whether this function would look simpler with a
> > shape roughly like:
> >
> > for (src_addr = ...; src_addr < ...;) {
> >   unsigned long step_size;
> >
> >   if (hugepage case) {
> >     if (have to split) {
> >       split it;
> >       continue;
> >     }
> >     step_size = HPAGE_PMD_SIZE;
> >     ...
> >   } else {
> >     ... 4k case ...
> >     step_size = PAGE_SIZE;
> >   }
> >   ...
> >   cond_resched();
> >   if (!err) {
> >     dst_addr += step_size;
> >     src_addr += step_size;
> >     moved += step_size;
> >   }
>
> Yeah, that looks simpler. I'll try that, thanks!
>
> >   ...
> > }