Date:   Wed, 27 Sep 2023 11:07:48 -0700
From:   Suren Baghdasaryan <surenb@...gle.com>
To:     Jann Horn <jannh@...gle.com>
Cc:     akpm@...ux-foundation.org, viro@...iv.linux.org.uk,
        brauner@...nel.org, shuah@...nel.org, aarcange@...hat.com,
        lokeshgidra@...gle.com, peterx@...hat.com, david@...hat.com,
        hughd@...gle.com, mhocko@...e.com, axelrasmussen@...gle.com,
        rppt@...nel.org, willy@...radead.org, Liam.Howlett@...cle.com,
        zhangpeng362@...wei.com, bgeffon@...gle.com,
        kaleshsingh@...gle.com, ngeoffray@...gle.com, jdduke@...gle.com,
        linux-mm@...ck.org, linux-fsdevel@...r.kernel.org,
        linux-kernel@...r.kernel.org, linux-kselftest@...r.kernel.org,
        kernel-team@...roid.com
Subject: Re: [PATCH v2 2/3] userfaultfd: UFFDIO_REMAP uABI

On Wed, Sep 27, 2023 at 5:47 AM Jann Horn <jannh@...gle.com> wrote:
>
> On Sat, Sep 23, 2023 at 3:31 AM Suren Baghdasaryan <surenb@...gle.com> wrote:
> > From: Andrea Arcangeli <aarcange@...hat.com>
> >
> > This implements the uABI of UFFDIO_REMAP.
> >
> > Notably, one mode bitflag is also forwarded to (and in turn known by) the
> > low-level remap_pages method.
> >
> > Signed-off-by: Andrea Arcangeli <aarcange@...hat.com>
> > Signed-off-by: Suren Baghdasaryan <surenb@...gle.com>
> [...]
> > +int remap_pages_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > +                        pmd_t *dst_pmd, pmd_t *src_pmd, pmd_t dst_pmdval,
> > +                        struct vm_area_struct *dst_vma,
> > +                        struct vm_area_struct *src_vma,
> > +                        unsigned long dst_addr, unsigned long src_addr)
> > +{
> > +       pmd_t _dst_pmd, src_pmdval;
> > +       struct page *src_page;
> > +       struct folio *src_folio;
> > +       struct anon_vma *src_anon_vma, *dst_anon_vma;
> > +       spinlock_t *src_ptl, *dst_ptl;
> > +       pgtable_t src_pgtable, dst_pgtable;
> > +       struct mmu_notifier_range range;
> > +       int err = 0;
> > +
> > +       src_pmdval = *src_pmd;
> > +       src_ptl = pmd_lockptr(src_mm, src_pmd);
> > +
> > +       BUG_ON(!spin_is_locked(src_ptl));
> > +       mmap_assert_locked(src_mm);
> > +       mmap_assert_locked(dst_mm);
> > +
> > +       BUG_ON(!pmd_trans_huge(src_pmdval));
> > +       BUG_ON(!pmd_none(dst_pmdval));
> > +       BUG_ON(src_addr & ~HPAGE_PMD_MASK);
> > +       BUG_ON(dst_addr & ~HPAGE_PMD_MASK);
> > +
> > +       src_page = pmd_page(src_pmdval);
> > +       if (unlikely(!PageAnonExclusive(src_page))) {
> > +               spin_unlock(src_ptl);
> > +               return -EBUSY;
> > +       }
> > +
> > +       src_folio = page_folio(src_page);
> > +       folio_get(src_folio);
> > +       spin_unlock(src_ptl);
> > +
> > +       /* preallocate dst_pgtable if needed */
> > +       if (dst_mm != src_mm) {
> > +               dst_pgtable = pte_alloc_one(dst_mm);
> > +               if (unlikely(!dst_pgtable)) {
> > +                       err = -ENOMEM;
> > +                       goto put_folio;
> > +               }
> > +       } else {
> > +               dst_pgtable = NULL;
> > +       }
> > +
> > +       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm, src_addr,
> > +                               src_addr + HPAGE_PMD_SIZE);
> > +       mmu_notifier_invalidate_range_start(&range);
> > +
> > +       /* block all concurrent rmap walks */
> > +       folio_lock(src_folio);
> > +
> > +       /*
> > +        * split_huge_page walks the anon_vma chain without the page
> > +        * lock. Serialize against it with the anon_vma lock, the page
> > +        * lock is not enough.
> > +        */
> > +       src_anon_vma = folio_get_anon_vma(src_folio);
> > +       if (!src_anon_vma) {
> > +               err = -EAGAIN;
> > +               goto unlock_folio;
> > +       }
> > +       anon_vma_lock_write(src_anon_vma);
> > +
> > +       dst_ptl = pmd_lockptr(dst_mm, dst_pmd);
> > +       double_pt_lock(src_ptl, dst_ptl);
> > +       if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
> > +                    !pmd_same(*dst_pmd, dst_pmdval) ||
> > +                    folio_mapcount(src_folio) != 1)) {
>
> I think this is also supposed to be PageAnonExclusive()?

Yes. Will fix.
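E.g. the recheck under the double PT lock could then look roughly like
this (untested sketch, checking exclusivity instead of the raw mapcount):

        double_pt_lock(src_ptl, dst_ptl);
        if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
                     !pmd_same(*dst_pmd, dst_pmdval) ||
                     !PageAnonExclusive(&src_folio->page))) {
                double_pt_unlock(src_ptl, dst_ptl);
                err = -EAGAIN;
                goto put_anon_vma;
        }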

>
> > +               double_pt_unlock(src_ptl, dst_ptl);
> > +               err = -EAGAIN;
> > +               goto put_anon_vma;
> > +       }
> > +
> > +       BUG_ON(!folio_test_head(src_folio));
> > +       BUG_ON(!folio_test_anon(src_folio));
> > +
> > +       dst_anon_vma = (void *)dst_vma->anon_vma + PAGE_MAPPING_ANON;
> > +       WRITE_ONCE(src_folio->mapping, (struct address_space *) dst_anon_vma);
> > +       WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));
> > +
> > +       src_pmdval = pmdp_huge_clear_flush(src_vma, src_addr, src_pmd);
> > +       _dst_pmd = mk_huge_pmd(&src_folio->page, dst_vma->vm_page_prot);
> > +       _dst_pmd = maybe_pmd_mkwrite(pmd_mkdirty(_dst_pmd), dst_vma);
> > +       set_pmd_at(dst_mm, dst_addr, dst_pmd, _dst_pmd);
> > +
> > +       src_pgtable = pgtable_trans_huge_withdraw(src_mm, src_pmd);
> > +       if (dst_pgtable) {
> > +               pgtable_trans_huge_deposit(dst_mm, dst_pmd, dst_pgtable);
> > +               pte_free(src_mm, src_pgtable);
> > +               dst_pgtable = NULL;
> > +
> > +               mm_inc_nr_ptes(dst_mm);
> > +               mm_dec_nr_ptes(src_mm);
> > +               add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> > +               add_mm_counter(src_mm, MM_ANONPAGES, -HPAGE_PMD_NR);
> > +       } else {
> > +               pgtable_trans_huge_deposit(dst_mm, dst_pmd, src_pgtable);
> > +       }
> > +       double_pt_unlock(src_ptl, dst_ptl);
> > +
> > +put_anon_vma:
> > +       anon_vma_unlock_write(src_anon_vma);
> > +       put_anon_vma(src_anon_vma);
> > +unlock_folio:
> > +       /* unblock rmap walks */
> > +       folio_unlock(src_folio);
> > +       mmu_notifier_invalidate_range_end(&range);
> > +       if (dst_pgtable)
> > +               pte_free(dst_mm, dst_pgtable);
> > +put_folio:
> > +       folio_put(src_folio);
> > +
> > +       return err;
> > +}
> > +#endif /* CONFIG_USERFAULTFD */
> [...]
> > +static int remap_anon_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > +                         struct vm_area_struct *dst_vma,
> > +                         struct vm_area_struct *src_vma,
> > +                         unsigned long dst_addr, unsigned long src_addr,
> > +                         pte_t *dst_pte, pte_t *src_pte,
> > +                         pte_t orig_dst_pte, pte_t orig_src_pte,
> > +                         spinlock_t *dst_ptl, spinlock_t *src_ptl,
> > +                         struct folio *src_folio)
> > +{
> > +       struct anon_vma *dst_anon_vma;
> > +
> > +       double_pt_lock(dst_ptl, src_ptl);
> > +
> > +       if (!pte_same(*src_pte, orig_src_pte) ||
> > +           !pte_same(*dst_pte, orig_dst_pte) ||
> > +           folio_test_large(src_folio) ||
> > +           folio_estimated_sharers(src_folio) != 1) {
> > +               double_pt_unlock(dst_ptl, src_ptl);
> > +               return -EAGAIN;
> > +       }
> > +
> > +       BUG_ON(!folio_test_anon(src_folio));
> > +
> > +       dst_anon_vma = (void *)dst_vma->anon_vma + PAGE_MAPPING_ANON;
> > +       WRITE_ONCE(src_folio->mapping,
> > +                  (struct address_space *) dst_anon_vma);
> > +       WRITE_ONCE(src_folio->index, linear_page_index(dst_vma,
> > +                                                     dst_addr));
> > +
> > +       orig_src_pte = ptep_clear_flush(src_vma, src_addr, src_pte);
> > +       orig_dst_pte = mk_pte(&src_folio->page, dst_vma->vm_page_prot);
> > +       orig_dst_pte = maybe_mkwrite(pte_mkdirty(orig_dst_pte),
> > +                                    dst_vma);
>
> I think there's still a theoretical issue here that you could fix by
> checking for the AnonExclusive flag, similar to the huge page case.
>
> Consider the following scenario:
>
> 1. process P1 does a write fault in a private anonymous VMA, creating
> and mapping a new anonymous page A1
> 2. process P1 forks and creates two children P2 and P3. afterwards, A1
> is mapped in P1, P2 and P3 as a COW page, with mapcount 3.
> 3. process P1 removes its mapping of A1, dropping its mapcount to 2.
> 4. process P2 uses vmsplice() to grab a reference to A1 with get_user_pages()
> 5. process P2 removes its mapping of A1, dropping its mapcount to 1.
>
> If at this point P3 does a write fault on its mapping of A1, it will
> still trigger copy-on-write thanks to the AnonExclusive mechanism; and
> this is necessary to avoid P3 mapping A1 as writable and writing data
> into it that will become visible to P2, if P2 and P3 are in different
> security contexts.
>
> But if P3 instead moves its mapping of A1 to another address with
> remap_anon_pte() which only does a page mapcount check, the
> maybe_mkwrite() will directly make the mapping writable, circumventing
> the AnonExclusive mechanism.

I see. Thanks for the detailed explanation! I will add
PageAnonExclusive() check in this path to prevent this scenario.
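Something along these lines, folded into the existing recheck under the
double PT lock (untested sketch):

        if (!pte_same(*src_pte, orig_src_pte) ||
            !pte_same(*dst_pte, orig_dst_pte) ||
            folio_test_large(src_folio) ||
            folio_estimated_sharers(src_folio) != 1 ||
            /* don't map a still-shared COW page writable at dst_addr */
            !PageAnonExclusive(&src_folio->page)) {
                double_pt_unlock(dst_ptl, src_ptl);
                return -EAGAIN;
        }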

>
> > +       set_pte_at(dst_mm, dst_addr, dst_pte, orig_dst_pte);
> > +
> > +       if (dst_mm != src_mm) {
> > +               inc_mm_counter(dst_mm, MM_ANONPAGES);
> > +               dec_mm_counter(src_mm, MM_ANONPAGES);
> > +       }
> > +
> > +       double_pt_unlock(dst_ptl, src_ptl);
> > +
> > +       return 0;
> > +}
> > +
> > +static int remap_swap_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > +                         unsigned long dst_addr, unsigned long src_addr,
> > +                         pte_t *dst_pte, pte_t *src_pte,
> > +                         pte_t orig_dst_pte, pte_t orig_src_pte,
> > +                         spinlock_t *dst_ptl, spinlock_t *src_ptl)
> > +{
> > +       if (!pte_swp_exclusive(orig_src_pte))
> > +               return -EBUSY;
> > +
> > +       double_pt_lock(dst_ptl, src_ptl);
> > +
> > +       if (!pte_same(*src_pte, orig_src_pte) ||
> > +           !pte_same(*dst_pte, orig_dst_pte)) {
> > +               double_pt_unlock(dst_ptl, src_ptl);
> > +               return -EAGAIN;
> > +       }
> > +
> > +       orig_src_pte = ptep_get_and_clear(src_mm, src_addr, src_pte);
> > +       set_pte_at(dst_mm, dst_addr, dst_pte, orig_src_pte);
> > +
> > +       if (dst_mm != src_mm) {
> > +               inc_mm_counter(dst_mm, MM_ANONPAGES);
> > +               dec_mm_counter(src_mm, MM_ANONPAGES);
>
> I think this is the wrong counter. Looking at zap_pte_range(), in the
> "Genuine swap entry" case, we modify the MM_SWAPENTS counter, not
> MM_ANONPAGES.

Oops, my bad. Will fix.
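i.e. remap_swap_pte() should adjust MM_SWAPENTS instead, mirroring the
genuine swap entry case in zap_pte_range():

        if (dst_mm != src_mm) {
                inc_mm_counter(dst_mm, MM_SWAPENTS);
                dec_mm_counter(src_mm, MM_SWAPENTS);
        }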

>
> > +       }
> > +
> > +       double_pt_unlock(dst_ptl, src_ptl);
> > +
> > +       return 0;
> > +}
> > +
> > +/*
> > + * The mmap_lock for reading is held by the caller. Just move the page
> > + * from src_pmd to dst_pmd if possible, and return zero if succeeded
> > + * in moving the page.
> > + */
> > +static int remap_pages_pte(struct mm_struct *dst_mm,
> > +                          struct mm_struct *src_mm,
> > +                          pmd_t *dst_pmd,
> > +                          pmd_t *src_pmd,
> > +                          struct vm_area_struct *dst_vma,
> > +                          struct vm_area_struct *src_vma,
> > +                          unsigned long dst_addr,
> > +                          unsigned long src_addr,
> > +                          __u64 mode)
> > +{
> > +       swp_entry_t entry;
> > +       pte_t orig_src_pte, orig_dst_pte;
> > +       spinlock_t *src_ptl, *dst_ptl;
> > +       pte_t *src_pte = NULL;
> > +       pte_t *dst_pte = NULL;
> > +
> > +       struct folio *src_folio = NULL;
> > +       struct anon_vma *src_anon_vma = NULL;
> > +       struct mmu_notifier_range range;
> > +       int err = 0;
> > +
> > +       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, src_mm,
> > +                               src_addr, src_addr + PAGE_SIZE);
> > +       mmu_notifier_invalidate_range_start(&range);
> > +retry:
>
> This retry looks a bit dodgy. On this retry label, we restart almost
> the entire operation, including re-reading the source PTE; the only
> variables that carry state forward from the previous retry loop
> iteration are src_folio and src_anon_vma.
>
> > +       dst_pte = pte_offset_map_nolock(dst_mm, dst_pmd, dst_addr, &dst_ptl);
> > +
> > +       /* If a huge pmd materialized from under us, fail */
> > +       if (unlikely(!dst_pte)) {
> > +               err = -EFAULT;
> > +               goto out;
> > +       }
> [...]
> > +       spin_lock(dst_ptl);
> > +       orig_dst_pte = *dst_pte;
> > +       spin_unlock(dst_ptl);
> > +       if (!pte_none(orig_dst_pte)) {
> > +               err = -EEXIST;
> > +               goto out;
> > +       }
> > +
> > +       spin_lock(src_ptl);
> > +       orig_src_pte = *src_pte;
>
> Here we read an entirely new orig_src_pte value. Something like a
> concurrent MADV_DONTNEED+pagefault could have made the PTE point to a
> different page between loop iterations.
>
> > +       spin_unlock(src_ptl);
>
> I think you have to insert something like the following here:
>
>         if (src_folio && !pte_same(orig_src_pte, previous_src_pte)) {
>                 err = -EAGAIN;
>                 goto out;
>         }
>         previous_src_pte = orig_src_pte;

Yes, this definitely needs to be rechecked. Will fix.

>
> Otherwise:
>
> > +       if (pte_none(orig_src_pte)) {
> > +               if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES))
> > +                       err = -ENOENT;
> > +               else /* nothing to do to remap a hole */
> > +                       err = 0;
> > +               goto out;
> > +       }
> > +
> > +       if (pte_present(orig_src_pte)) {
> > +               /*
> > +                * Pin and lock both the source folio and anon_vma. Since we are
> > +                * in an RCU read section, we can't block, so on contention we
> > +                * have to unmap the ptes, take the lock and retry.
> > +                */
> > +               if (!src_folio) {
>
> If we already found a src_folio in the previous iteration (but the
> trylock failed), we keep using the same src_folio without rechecking
> if the current PTE still points to the same folio.
>
> > +                       struct folio *folio;
> > +
> > +                       /*
> > +                        * Pin the page while holding the lock to be sure the
> > +                        * page isn't freed under us
> > +                        */
> > +                       spin_lock(src_ptl);
> > +                       if (!pte_same(orig_src_pte, *src_pte)) {
> > +                               spin_unlock(src_ptl);
> > +                               err = -EAGAIN;
> > +                               goto out;
> > +                       }
> > +
> > +                       folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
> > +                       if (!folio || !folio_test_anon(folio) ||
> > +                           folio_test_large(folio) ||
> > +                           folio_estimated_sharers(folio) != 1) {
> > +                               spin_unlock(src_ptl);
> > +                               err = -EBUSY;
> > +                               goto out;
> > +                       }
> > +
> > +                       folio_get(folio);
> > +                       src_folio = folio;
> > +                       spin_unlock(src_ptl);
> > +
> > +                       /* block all concurrent rmap walks */
> > +                       if (!folio_trylock(src_folio)) {
> > +                               pte_unmap(src_pte);
> > +                               pte_unmap(dst_pte);
> > +                               src_pte = dst_pte = NULL;
> > +                               /* now we can block and wait */
> > +                               folio_lock(src_folio);
> > +                               goto retry;
> > +                       }
> > +               }
> > +
> > +               if (!src_anon_vma) {
>
> (And here, if we already saw a src_anon_vma but the trylock failed,
> we'll keep using that src_anon_vma.)

Ack. The check for previous_src_pte should handle that as well.
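Roughly, right after orig_src_pte is re-read on the retry path (untested
sketch; previous_src_pte would be a new local carried across retries):

        /*
         * If we are retrying with a cached src_folio and/or src_anon_vma,
         * bail out if the source PTE changed under us, since the cached
         * folio/anon_vma may no longer match what is mapped at src_addr.
         */
        if (src_folio && !pte_same(orig_src_pte, previous_src_pte)) {
                err = -EAGAIN;
                goto out;
        }
        previous_src_pte = orig_src_pte;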

>
> > +                       /*
> > +                        * folio_referenced walks the anon_vma chain
> > +                        * without the folio lock. Serialize against it with
> > +                        * the anon_vma lock, the folio lock is not enough.
> > +                        */
> > +                       src_anon_vma = folio_get_anon_vma(src_folio);
> > +                       if (!src_anon_vma) {
> > +                               /* page was unmapped from under us */
> > +                               err = -EAGAIN;
> > +                               goto out;
> > +                       }
> > +                       if (!anon_vma_trylock_write(src_anon_vma)) {
> > +                               pte_unmap(src_pte);
> > +                               pte_unmap(dst_pte);
> > +                               src_pte = dst_pte = NULL;
> > +                               /* now we can block and wait */
> > +                               anon_vma_lock_write(src_anon_vma);
> > +                               goto retry;
> > +                       }
> > +               }
>
> So at this point we have:
>
>  - the current src_pte
>  - some referenced+locked src_folio that used to be mapped exclusively
> at src_addr
>  - (the anon_vma associated with the src_folio)
>
> > +               err = remap_anon_pte(dst_mm, src_mm,  dst_vma, src_vma,
> > +                                    dst_addr, src_addr, dst_pte, src_pte,
> > +                                    orig_dst_pte, orig_src_pte,
> > +                                    dst_ptl, src_ptl, src_folio);
>
> And then this will, without touching folio mapcounts/refcounts, delete
> the current PTE at src_addr, and create a PTE at dst_addr pointing to
> the old src_folio, leading to incorrect refcounts/mapcounts?

I assume this still points to the missing previous_src_pte check
discussed in the previous comments. Is that correct or is there yet
another issue?

>
> > +       } else {
> [...]
> > +       }
> > +
> > +out:
> > +       if (src_anon_vma) {
> > +               anon_vma_unlock_write(src_anon_vma);
> > +               put_anon_vma(src_anon_vma);
> > +       }
> > +       if (src_folio) {
> > +               folio_unlock(src_folio);
> > +               folio_put(src_folio);
> > +       }
> > +       if (dst_pte)
> > +               pte_unmap(dst_pte);
> > +       if (src_pte)
> > +               pte_unmap(src_pte);
> > +       mmu_notifier_invalidate_range_end(&range);
> > +
> > +       return err;
> > +}
> [...]
> > +ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> > +                   unsigned long dst_start, unsigned long src_start,
> > +                   unsigned long len, __u64 mode)
> > +{
> > +       struct vm_area_struct *src_vma, *dst_vma;
> > +       unsigned long src_addr, dst_addr;
> > +       pmd_t *src_pmd, *dst_pmd;
> > +       long err = -EINVAL;
> > +       ssize_t moved = 0;
> > +
> > +       /*
> > +        * Sanitize the command parameters:
> > +        */
> > +       BUG_ON(src_start & ~PAGE_MASK);
> > +       BUG_ON(dst_start & ~PAGE_MASK);
> > +       BUG_ON(len & ~PAGE_MASK);
> > +
> > +       /* Does the address range wrap, or is the span zero-sized? */
> > +       BUG_ON(src_start + len <= src_start);
> > +       BUG_ON(dst_start + len <= dst_start);
> > +
> > +       /*
> > +        * Because these are read semaphores there's no risk of lock
> > +        * inversion.
> > +        */
> > +       mmap_read_lock(dst_mm);
> > +       if (dst_mm != src_mm)
> > +               mmap_read_lock(src_mm);
> > +
> > +       /*
> > +        * Make sure the vma is not shared, that the src and dst remap
> > +        * ranges are both valid and fully within a single existing
> > +        * vma.
> > +        */
> > +       src_vma = find_vma(src_mm, src_start);
> > +       if (!src_vma || (src_vma->vm_flags & VM_SHARED))
> > +               goto out;
> > +       if (src_start < src_vma->vm_start ||
> > +           src_start + len > src_vma->vm_end)
> > +               goto out;
> > +
> > +       dst_vma = find_vma(dst_mm, dst_start);
> > +       if (!dst_vma || (dst_vma->vm_flags & VM_SHARED))
> > +               goto out;
> > +       if (dst_start < dst_vma->vm_start ||
> > +           dst_start + len > dst_vma->vm_end)
> > +               goto out;
> > +
> > +       err = validate_remap_areas(src_vma, dst_vma);
> > +       if (err)
> > +               goto out;
> > +
> > +       for (src_addr = src_start, dst_addr = dst_start;
> > +            src_addr < src_start + len;) {
> > +               spinlock_t *ptl;
> > +               pmd_t dst_pmdval;
> > +               unsigned long step_size;
> > +
> > +               BUG_ON(dst_addr >= dst_start + len);
> > +               /*
> > +                * Below works because an anonymous area cannot have a
> > +                * transparent huge PUD. If file-backed support is added,
> > +                * that case would need to be handled here.
> > +                */
> > +               src_pmd = mm_find_pmd(src_mm, src_addr);
> > +               if (unlikely(!src_pmd)) {
> > +                       if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {
> > +                               err = -ENOENT;
> > +                               break;
> > +                       }
> > +                       src_pmd = mm_alloc_pmd(src_mm, src_addr);
> > +                       if (unlikely(!src_pmd)) {
> > +                               err = -ENOMEM;
> > +                               break;
> > +                       }
> > +               }
> > +               dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
> > +               if (unlikely(!dst_pmd)) {
> > +                       err = -ENOMEM;
> > +                       break;
> > +               }
> > +
> > +               dst_pmdval = pmdp_get_lockless(dst_pmd);
> > +               /*
> > +                * If the dst_pmd is mapped as a THP, don't override it and just
> > +                * be strict. If dst_pmd changes into a THP after this check,
> > +                * remap_pages_huge_pmd() will detect the change and retry
> > +                * while remap_pages_pte() will detect the change and fail.
> > +                */
> > +               if (unlikely(pmd_trans_huge(dst_pmdval))) {
> > +                       err = -EEXIST;
> > +                       break;
> > +               }
> > +
> > +               ptl = pmd_trans_huge_lock(src_pmd, src_vma);
> > +               if (ptl && !pmd_trans_huge(*src_pmd)) {
> > +                       spin_unlock(ptl);
> > +                       ptl = NULL;
> > +               }
>
> This still looks wrong - we do still have to split_huge_pmd()
> somewhere so that remap_pages_pte() works.

Hmm, I guess this extra check is not even needed...
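Or, if it stays, it should not silently fall through to the PTE path with
something that isn't a page table; e.g. splitting and retrying the same
address, like the misaligned case below already does (rough sketch):

        ptl = pmd_trans_huge_lock(src_pmd, src_vma);
        if (ptl && !pmd_trans_huge(*src_pmd)) {
                /* e.g. a pmd migration entry: split it and retry so that
                 * remap_pages_pte() sees a normal page table here */
                spin_unlock(ptl);
                split_huge_pmd(src_vma, src_pmd, src_addr);
                continue;
        }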

>
> > +               if (ptl) {
> > +                       /*
> > +                        * Check if we can move the pmd without
> > +                        * splitting it. First check the address
> > +                        * alignment to be the same in src/dst.  These
> > +                        * checks don't actually need the PT lock but
> > +                        * it's good to do it here to optimize this
> > +                        * block away at build time if
> > +                        * CONFIG_TRANSPARENT_HUGEPAGE is not set.
> > +                        */
> > +                       if ((src_addr & ~HPAGE_PMD_MASK) || (dst_addr & ~HPAGE_PMD_MASK) ||
> > +                           src_start + len - src_addr < HPAGE_PMD_SIZE || !pmd_none(dst_pmdval)) {
> > +                               spin_unlock(ptl);
> > +                               split_huge_pmd(src_vma, src_pmd, src_addr);
> > +                               continue;
> > +                       }
> > +
> > +                       err = remap_pages_huge_pmd(dst_mm, src_mm,
> > +                                                  dst_pmd, src_pmd,
> > +                                                  dst_pmdval,
> > +                                                  dst_vma, src_vma,
> > +                                                  dst_addr, src_addr);
> > +                       step_size = HPAGE_PMD_SIZE;
> > +               } else {
> > +                       if (pmd_none(*src_pmd)) {
> > +                               if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES)) {
> > +                                       err = -ENOENT;
> > +                                       break;
> > +                               }
> > +                               if (unlikely(__pte_alloc(src_mm, src_pmd))) {
> > +                                       err = -ENOMEM;
> > +                                       break;
> > +                               }
> > +                       }
> > +
> > +                       if (unlikely(pte_alloc(dst_mm, dst_pmd))) {
> > +                               err = -ENOMEM;
> > +                               break;
> > +                       }
> > +
> > +                       err = remap_pages_pte(dst_mm, src_mm,
> > +                                             dst_pmd, src_pmd,
> > +                                             dst_vma, src_vma,
> > +                                             dst_addr, src_addr,
> > +                                             mode);
> > +                       step_size = PAGE_SIZE;
> > +               }
> > +
> > +               cond_resched();
> > +
> > +               if (!err) {
> > +                       dst_addr += step_size;
> > +                       src_addr += step_size;
> > +                       moved += step_size;
> > +               }
> > +
> > +               if ((!err || err == -EAGAIN) &&
> > +                   fatal_signal_pending(current))
> > +                       err = -EINTR;
> > +
> > +               if (err && err != -EAGAIN)
> > +                       break;
> > +       }
> > +
> > +out:
> > +       mmap_read_unlock(dst_mm);
> > +       if (dst_mm != src_mm)
> > +               mmap_read_unlock(src_mm);
> > +       BUG_ON(moved < 0);
> > +       BUG_ON(err > 0);
> > +       BUG_ON(!moved && !err);
> > +       return moved ? moved : err;
> > +}
> > --
> > 2.42.0.515.g380fc7ccd1-goog
> >
