[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <66ef75e59c7ea_109b5294d1@dwillia2-mobl3.amr.corp.intel.com.notmuch>
Date: Sun, 22 Sep 2024 03:41:57 +0200
From: Dan Williams <dan.j.williams@...el.com>
To: Alistair Popple <apopple@...dia.com>, <dan.j.williams@...el.com>,
<linux-mm@...ck.org>
CC: Alistair Popple <apopple@...dia.com>, <vishal.l.verma@...el.com>,
<dave.jiang@...el.com>, <logang@...tatee.com>, <bhelgaas@...gle.com>,
<jack@...e.cz>, <jgg@...pe.ca>, <catalin.marinas@....com>, <will@...nel.org>,
<mpe@...erman.id.au>, <npiggin@...il.com>, <dave.hansen@...ux.intel.com>,
<ira.weiny@...el.com>, <willy@...radead.org>, <djwong@...nel.org>,
<tytso@....edu>, <linmiaohe@...wei.com>, <david@...hat.com>,
<peterx@...hat.com>, <linux-doc@...r.kernel.org>,
<linux-kernel@...r.kernel.org>, <linux-arm-kernel@...ts.infradead.org>,
<linuxppc-dev@...ts.ozlabs.org>, <nvdimm@...ts.linux.dev>,
<linux-cxl@...r.kernel.org>, <linux-fsdevel@...r.kernel.org>,
<linux-ext4@...r.kernel.org>, <linux-xfs@...r.kernel.org>,
<jhubbard@...dia.com>, <hch@....de>, <david@...morbit.com>,
<hca@...ux.ibm.com>, <gor@...ux.ibm.com>, <agordeev@...ux.ibm.com>,
<borntraeger@...ux.ibm.com>, <svens@...ux.ibm.com>,
<linux-s390@...r.kernel.org>, <gerald.schaefer@...ux.ibm.com>
Subject: Re: [PATCH 05/12] mm/memory: Add dax_insert_pfn
[ add s390 folks to comment on CONFIG_FS_DAX_LIMITED ]
Alistair Popple wrote:
> Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
> creates a special devmap PTE entry for the pfn but does not take a
> reference on the underlying struct page for the mapping. This is
> because DAX page refcounts are treated specially, as indicated by the
> presence of a devmap entry.
>
> To allow DAX page refcounts to be managed the same as normal page
> refcounts introduce dax_insert_pfn. This will take a reference on the
> underlying page much the same as vmf_insert_page, except it also
> permits upgrading an existing mapping to be writable if
> requested/possible.
>
> Signed-off-by: Alistair Popple <apopple@...dia.com>
>
> ---
>
> Updates from v1:
>
> - Re-arrange code in insert_page_into_pte_locked() based on comments
> from Jan Kara.
>
> - Call mkdirty/mkyoung for the mkwrite case, also suggested by Jan.
> ---
> include/linux/mm.h | 1 +-
> mm/memory.c | 83 ++++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 76 insertions(+), 8 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index b0ff06d..ae6d713 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3463,6 +3463,7 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
> unsigned long num);
> int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
> unsigned long num);
> +vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write);
> vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> unsigned long pfn);
> vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> diff --git a/mm/memory.c b/mm/memory.c
> index d2785fb..368e15d 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2039,19 +2039,47 @@ static int validate_page_before_insert(struct vm_area_struct *vma,
> }
>
> static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
> - unsigned long addr, struct page *page, pgprot_t prot)
> + unsigned long addr, struct page *page,
> + pgprot_t prot, bool mkwrite)
This upgrade of insert_page_into_pte_locked() to handle write faults
deserves to be its own patch with rationale along the lines of:
"In preparation for using insert_page() for DAX, enhance
insert_page_into_pte_locked() to handle establishing writable mappings.
Recall that DAX returns VM_FAULT_NOPAGE after installing a PTE which
bypasses the typical set_pte_range() in finish_fault."
> {
> struct folio *folio = page_folio(page);
> + pte_t entry = ptep_get(pte);
> pte_t pteval;
>
> - if (!pte_none(ptep_get(pte)))
> - return -EBUSY;
> + if (!pte_none(entry)) {
> + if (!mkwrite)
> + return -EBUSY;
> +
> + /*
> + * For read faults on private mappings the PFN passed in may not
> + * match the PFN we have mapped if the mapped PFN is a writeable
> + * COW page. In the mkwrite case we are creating a writable PTE
> + * for a shared mapping and we expect the PFNs to match. If they
> + * don't match, we are likely racing with block allocation and
> + * mapping invalidation so just skip the update.
> + */
> + if (pte_pfn(entry) != page_to_pfn(page)) {
> + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
> + return -EFAULT;
> + }
> + entry = maybe_mkwrite(entry, vma);
> + entry = pte_mkyoung(entry);
> + if (ptep_set_access_flags(vma, addr, pte, entry, 1))
> + update_mmu_cache(vma, addr, pte);
I was going to say that this should be creating a shared helper with
insert_pfn(), but on closer look the mkwrite case in insert_pfn() is now
dead code (almost, *grumbles about dcssblk*). So I would just mention
that in the changelog for this standalone patch and then we can follow
on with a cleanup like the patch at the bottom of this mail (untested).
> + return 0;
> + }
> +
> /* Ok, finally just insert the thing.. */
> pteval = mk_pte(page, prot);
> if (unlikely(is_zero_folio(folio))) {
> pteval = pte_mkspecial(pteval);
> } else {
> folio_get(folio);
> + entry = mk_pte(page, prot);
> + if (mkwrite) {
> + entry = pte_mkyoung(entry);
> + entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> + }
> inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
> folio_add_file_rmap_pte(folio, page, vma);
> }
> @@ -2060,7 +2088,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
> }
>
> static int insert_page(struct vm_area_struct *vma, unsigned long addr,
> - struct page *page, pgprot_t prot)
> + struct page *page, pgprot_t prot, bool mkwrite)
> {
> int retval;
> pte_t *pte;
> @@ -2073,7 +2101,8 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
> pte = get_locked_pte(vma->vm_mm, addr, &ptl);
> if (!pte)
> goto out;
> - retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
> + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot,
> + mkwrite);
> pte_unmap_unlock(pte, ptl);
> out:
> return retval;
> @@ -2087,7 +2116,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
> err = validate_page_before_insert(vma, page);
> if (err)
> return err;
> - return insert_page_into_pte_locked(vma, pte, addr, page, prot);
> + return insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
> }
>
> /* insert_pages() amortizes the cost of spinlock operations
> @@ -2223,7 +2252,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
> BUG_ON(vma->vm_flags & VM_PFNMAP);
> vm_flags_set(vma, VM_MIXEDMAP);
> }
> - return insert_page(vma, addr, page, vma->vm_page_prot);
> + return insert_page(vma, addr, page, vma->vm_page_prot, false);
> }
> EXPORT_SYMBOL(vm_insert_page);
>
> @@ -2503,7 +2532,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
> * result in pfn_t_has_page() == false.
> */
> page = pfn_to_page(pfn_t_to_pfn(pfn));
> - err = insert_page(vma, addr, page, pgprot);
> + err = insert_page(vma, addr, page, pgprot, mkwrite);
> } else {
> return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
> }
> @@ -2516,6 +2545,44 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
> return VM_FAULT_NOPAGE;
> }
>
> +vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write)
> +{
> + struct vm_area_struct *vma = vmf->vma;
> + pgprot_t pgprot = vma->vm_page_prot;
> + unsigned long pfn = pfn_t_to_pfn(pfn_t);
> + struct page *page = pfn_to_page(pfn);
The problem here is that we stubbornly have __dcssblk_direct_access() to
worry about. That is the only dax driver that does not return
pfn_valid() pfns.
In fact, it looks like __dcssblk_direct_access() is the only thing
standing in the way of the removal of pfn_t.
It turns out it has been 3 years since the last time the question of
bringing s390 fully into the ZONE_DEVICE regime was raised:
https://lore.kernel.org/all/20210820210318.187742e8@thinkpad/
Given that this series removes PTE_DEVMAP which was a stumbling block,
would it be feasible to remove CONFIG_FS_DAX_LIMITED for a few kernel
cycles until someone from the s390 side can circle back to add full
ZONE_DEVICE support?
> + unsigned long addr = vmf->address;
> + int err;
> +
> + if (addr < vma->vm_start || addr >= vma->vm_end)
> + return VM_FAULT_SIGBUS;
> +
> + track_pfn_insert(vma, &pgprot, pfn_t);
> +
> + if (!pfn_modify_allowed(pfn, pgprot))
> + return VM_FAULT_SIGBUS;
> +
> + /*
> + * We refcount the page normally so make sure pfn_valid is true.
> + */
> + if (!pfn_t_valid(pfn_t))
> + return VM_FAULT_SIGBUS;
> +
> + WARN_ON_ONCE(pfn_t_devmap(pfn_t));
> +
> + if (WARN_ON(is_zero_pfn(pfn) && write))
> + return VM_FAULT_SIGBUS;
> +
> + err = insert_page(vma, addr, page, pgprot, write);
> + if (err == -ENOMEM)
> + return VM_FAULT_OOM;
> + if (err < 0 && err != -EBUSY)
> + return VM_FAULT_SIGBUS;
> +
> + return VM_FAULT_NOPAGE;
> +}
> +EXPORT_SYMBOL_GPL(dax_insert_pfn);
With insert_page_into_pte_locked() split out into its own patch and the
dcssblk issue resolved to kill that special case, this patch looks good
to me.
> +
> vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
> pfn_t pfn)
> {
> --
> git-series 0.9.1
-- >8 --
Subject: mm: Remove vmf_insert_mixed_mkwrite()
From: Dan Williams <dan.j.williams@...el.com>
Now that fsdax has switched to dax_insert_pfn() which uses
insert_page_into_pte_locked() internally, there are no more callers of
vmf_insert_mixed_mkwrite(). This also reveals that all remaining callers
of insert_pfn() never set @mkwrite to true, so also clean up insert_pfn()'s
@mkwrite argument.
Signed-off-by: Dan Williams <dan.j.williams@...el.com>
---
include/linux/mm.h | 2 --
mm/memory.c | 60 +++++++---------------------------------------------
2 files changed, 8 insertions(+), 54 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5976276d4494..d9517e109ac3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3444,8 +3444,6 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot);
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn);
-vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn);
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len);
static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma,
diff --git a/mm/memory.c b/mm/memory.c
index 000873596672..80b07dbd8304 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2327,7 +2327,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
EXPORT_SYMBOL(vm_map_pages_zero);
static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, pgprot_t prot, bool mkwrite)
+ pfn_t pfn, pgprot_t prot)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, entry;
@@ -2337,38 +2337,12 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
if (!pte)
return VM_FAULT_OOM;
entry = ptep_get(pte);
- if (!pte_none(entry)) {
- if (mkwrite) {
- /*
- * For read faults on private mappings the PFN passed
- * in may not match the PFN we have mapped if the
- * mapped PFN is a writeable COW page. In the mkwrite
- * case we are creating a writable PTE for a shared
- * mapping and we expect the PFNs to match. If they
- * don't match, we are likely racing with block
- * allocation and mapping invalidation so just skip the
- * update.
- */
- if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
- WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
- goto out_unlock;
- }
- entry = pte_mkyoung(entry);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- if (ptep_set_access_flags(vma, addr, pte, entry, 1))
- update_mmu_cache(vma, addr, pte);
- }
+ if (!pte_none(entry))
goto out_unlock;
- }
/* Ok, finally just insert the thing.. */
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
- if (mkwrite) {
- entry = pte_mkyoung(entry);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- }
-
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
@@ -2433,8 +2407,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
- false);
+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
}
EXPORT_SYMBOL(vmf_insert_pfn_prot);
@@ -2480,8 +2453,8 @@ static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn, bool mkwrite)
return false;
}
-static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn, bool mkwrite)
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
{
pgprot_t pgprot = vma->vm_page_prot;
int err;
@@ -2513,9 +2486,9 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- err = insert_page(vma, addr, page, pgprot, mkwrite);
+ err = insert_page(vma, addr, page, pgprot, false);
} else {
- return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+ return insert_pfn(vma, addr, pfn, pgprot);
}
if (err == -ENOMEM)
@@ -2525,6 +2498,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
+EXPORT_SYMBOL(vmf_insert_mixed);
vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write)
{
@@ -2562,24 +2536,6 @@ vm_fault_t dax_insert_pfn(struct vm_fault *vmf, pfn_t pfn_t, bool write)
}
EXPORT_SYMBOL_GPL(dax_insert_pfn);
-vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
-{
- return __vm_insert_mixed(vma, addr, pfn, false);
-}
-EXPORT_SYMBOL(vmf_insert_mixed);
-
-/*
- * If the insertion of PTE failed because someone else already added a
- * different entry in the mean time, we treat that as success as we assume
- * the same entry was actually inserted.
- */
-vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
- unsigned long addr, pfn_t pfn)
-{
- return __vm_insert_mixed(vma, addr, pfn, true);
-}
-
/*
* maps a range of physical memory into the requested pages. the old
* mappings are removed. any references to nonexistent pages results
Powered by blists - more mailing lists