Message-ID: <ZQNMze6SXdIm13CW@casper.infradead.org>
Date: Thu, 14 Sep 2023 19:11:25 +0100
From: Matthew Wilcox <willy@...radead.org>
To: Suren Baghdasaryan <surenb@...gle.com>
Cc: akpm@...ux-foundation.org, viro@...iv.linux.org.uk,
brauner@...nel.org, shuah@...nel.org, aarcange@...hat.com,
lokeshgidra@...gle.com, peterx@...hat.com, david@...hat.com,
hughd@...gle.com, mhocko@...e.com, axelrasmussen@...gle.com,
rppt@...nel.org, Liam.Howlett@...cle.com, jannh@...gle.com,
zhangpeng362@...wei.com, bgeffon@...gle.com,
kaleshsingh@...gle.com, ngeoffray@...gle.com, jdduke@...gle.com,
linux-mm@...ck.org, linux-fsdevel@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-kselftest@...r.kernel.org,
kernel-team@...roid.com
Subject: Re: [PATCH 2/3] userfaultfd: UFFDIO_REMAP uABI
On Thu, Sep 14, 2023 at 08:26:12AM -0700, Suren Baghdasaryan wrote:
> +++ b/include/linux/userfaultfd_k.h
> @@ -93,6 +93,23 @@ extern int mwriteprotect_range(struct mm_struct *dst_mm,
> extern long uffd_wp_range(struct vm_area_struct *vma,
> unsigned long start, unsigned long len, bool enable_wp);
>
> +/* remap_pages */
> +extern void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
> +extern void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
> +extern ssize_t remap_pages(struct mm_struct *dst_mm,
> + struct mm_struct *src_mm,
> + unsigned long dst_start,
> + unsigned long src_start,
> + unsigned long len, __u64 flags);
> +extern int remap_pages_huge_pmd(struct mm_struct *dst_mm,
> + struct mm_struct *src_mm,
> + pmd_t *dst_pmd, pmd_t *src_pmd,
> + pmd_t dst_pmdval,
> + struct vm_area_struct *dst_vma,
> + struct vm_area_struct *src_vma,
> + unsigned long dst_addr,
> + unsigned long src_addr);
Drop the 'extern' markers from function declarations.
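i.e.:

	void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
	void double_pt_unlock(spinlock_t *ptl1, spinlock_t *ptl2);
	ssize_t remap_pages(struct mm_struct *dst_mm, struct mm_struct *src_mm,
			    unsigned long dst_start, unsigned long src_start,
			    unsigned long len, __u64 flags);

(and likewise for remap_pages_huge_pmd).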
> +int remap_pages_huge_pmd(struct mm_struct *dst_mm,
> + struct mm_struct *src_mm,
> + pmd_t *dst_pmd, pmd_t *src_pmd,
> + pmd_t dst_pmdval,
> + struct vm_area_struct *dst_vma,
> + struct vm_area_struct *src_vma,
> + unsigned long dst_addr,
> + unsigned long src_addr)
> +{
> + pmd_t _dst_pmd, src_pmdval;
> + struct page *src_page;
> + struct anon_vma *src_anon_vma, *dst_anon_vma;
> + spinlock_t *src_ptl, *dst_ptl;
> + pgtable_t pgtable;
> + struct mmu_notifier_range range;
> +
> + src_pmdval = *src_pmd;
> + src_ptl = pmd_lockptr(src_mm, src_pmd);
> +
> + BUG_ON(!pmd_trans_huge(src_pmdval));
> + BUG_ON(!pmd_none(dst_pmdval));
> + BUG_ON(!spin_is_locked(src_ptl));
> + mmap_assert_locked(src_mm);
> + mmap_assert_locked(dst_mm);
> + BUG_ON(src_addr & ~HPAGE_PMD_MASK);
> + BUG_ON(dst_addr & ~HPAGE_PMD_MASK);
> +
> + src_page = pmd_page(src_pmdval);
> + BUG_ON(!PageHead(src_page));
> + BUG_ON(!PageAnon(src_page));
Better to add a src_folio = page_folio(src_page);
and then folio_test_anon() here.
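Something like (untested):

	src_folio = page_folio(src_page);
	BUG_ON(!folio_test_anon(src_folio));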
> + if (unlikely(page_mapcount(src_page) != 1)) {
Brr, this is going to miss PTE mappings of this folio. I think you
actually want folio_mapcount() instead, although it'd be more efficient
to look at folio->_entire_mapcount == 1 and _nr_pages_mapped == 0.
Not sure what a good name for that predicate would be.
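With src_folio in hand, the simple version would be:

	if (unlikely(folio_mapcount(src_folio) != 1)) {

and the more efficient _entire_mapcount/_nr_pages_mapped form would
want that new, well-named helper.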
> + get_page(src_page);
folio_get()
> + /* block all concurrent rmap walks */
> + lock_page(src_page);
folio_lock()
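i.e., continuing with src_folio:

	folio_get(src_folio);
	/* block all concurrent rmap walks */
	folio_lock(src_folio);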
> + /*
> + * split_huge_page walks the anon_vma chain without the page
> + * lock. Serialize against it with the anon_vma lock, the page
> + * lock is not enough.
> + */
> + src_anon_vma = folio_get_anon_vma(page_folio(src_page));
...
> + if (!src_anon_vma) {
> + unlock_page(src_page);
folio_unlock()
> + put_page(src_page);
folio_put()
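i.e., in this error path:

	folio_unlock(src_folio);
	folio_put(src_folio);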
> + if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
> + !pmd_same(*dst_pmd, dst_pmdval) ||
> + page_mapcount(src_page) != 1)) {
similar folio_total_mapcount()
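e.g. (untested):

	if (unlikely(!pmd_same(*src_pmd, src_pmdval) ||
		     !pmd_same(*dst_pmd, dst_pmdval) ||
		     folio_total_mapcount(src_folio) != 1)) {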
> + double_pt_unlock(src_ptl, dst_ptl);
> + anon_vma_unlock_write(src_anon_vma);
> + put_anon_vma(src_anon_vma);
> + unlock_page(src_page);
> + put_page(src_page);
...
> + BUG_ON(!PageHead(src_page));
> + BUG_ON(!PageAnon(src_page));
> + /* the PT lock is enough to keep the page pinned now */
> + put_page(src_page);
...
> + dst_anon_vma = (void *) dst_vma->anon_vma + PAGE_MAPPING_ANON;
> + WRITE_ONCE(src_page->mapping, (struct address_space *) dst_anon_vma);
> + WRITE_ONCE(src_page->index, linear_page_index(dst_vma, dst_addr));
Definitely want to use the folio here.
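i.e., something like:

	WRITE_ONCE(src_folio->mapping, (struct address_space *) dst_anon_vma);
	WRITE_ONCE(src_folio->index, linear_page_index(dst_vma, dst_addr));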
> + /* unblock rmap walks */
> + unlock_page(src_page);
...
> +static int remap_pages_pte(struct mm_struct *dst_mm,
> + struct mm_struct *src_mm,
> + pmd_t *dst_pmd,
> + pmd_t *src_pmd,
> + struct vm_area_struct *dst_vma,
> + struct vm_area_struct *src_vma,
> + unsigned long dst_addr,
> + unsigned long src_addr,
> + __u64 mode)
> +{
> + swp_entry_t entry;
> + pte_t orig_src_pte, orig_dst_pte;
> + spinlock_t *src_ptl, *dst_ptl;
> + pte_t *src_pte = NULL;
> + pte_t *dst_pte = NULL;
> +
> + struct folio *src_folio = NULL;
> + struct anon_vma *src_anon_vma = NULL;
> + struct anon_vma *dst_anon_vma;
> + struct mmu_notifier_range range;
> + int err = 0;
> +
> +retry:
> + dst_pte = pte_offset_map_nolock(dst_mm, dst_pmd, dst_addr, &dst_ptl);
> +
> +	/* If a huge pmd materialized from under us, fail */
> + if (unlikely(!dst_pte)) {
> + err = -EFAULT;
> + goto out;
> + }
> +
> + src_pte = pte_offset_map_nolock(src_mm, src_pmd, src_addr, &src_ptl);
> +
> + /*
> + * We held the mmap_lock for reading so MADV_DONTNEED
> + * can zap transparent huge pages under us, or the
> + * transparent huge page fault can establish new
> + * transparent huge pages under us.
> + */
> + if (unlikely(!src_pte)) {
> + err = -EFAULT;
> + goto out;
> + }
> +
> + BUG_ON(pmd_none(*dst_pmd));
> + BUG_ON(pmd_none(*src_pmd));
> + BUG_ON(pmd_trans_huge(*dst_pmd));
> + BUG_ON(pmd_trans_huge(*src_pmd));
> +
> + spin_lock(dst_ptl);
> + orig_dst_pte = *dst_pte;
> + spin_unlock(dst_ptl);
> + if (!pte_none(orig_dst_pte)) {
> + err = -EEXIST;
> + goto out;
> + }
> +
> + spin_lock(src_ptl);
> + orig_src_pte = *src_pte;
> + spin_unlock(src_ptl);
> + if (pte_none(orig_src_pte)) {
> + if (!(mode & UFFDIO_REMAP_MODE_ALLOW_SRC_HOLES))
> + err = -ENOENT;
> + else /* nothing to do to remap a hole */
> + err = 0;
> + goto out;
> + }
> +
> + if (pte_present(orig_src_pte)) {
> + if (!src_folio) {
> + struct folio *folio;
> +
> + /*
> + * Pin the page while holding the lock to be sure the
> + * page isn't freed under us
> + */
> + spin_lock(src_ptl);
> + if (!pte_same(orig_src_pte, *src_pte)) {
> + spin_unlock(src_ptl);
> + err = -EAGAIN;
> + goto out;
> + }
> +
> + folio = vm_normal_folio(src_vma, src_addr, orig_src_pte);
> + if (!folio || !folio_test_anon(folio) ||
> + folio_estimated_sharers(folio) != 1) {
I wonder if we also want to fail if folio_test_large()? While we don't
have large anon folios today, it seems to me that trying to migrate one
of them like this is just not going to work.
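e.g. (untested):

	if (!folio || !folio_test_anon(folio) || folio_test_large(folio) ||
	    folio_estimated_sharers(folio) != 1) {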