lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <a8c41028-c7f5-9b93-4721-b8ddcf2427da@oracle.com>
Date:   Thu, 1 Apr 2021 20:54:10 +0100
From:   Joao Martins <joao.m.martins@...cle.com>
To:     Dan Williams <dan.j.williams@...el.com>
Cc:     Jason Gunthorpe <jgg@...pe.ca>, Christoph Hellwig <hch@....de>,
        Shiyang Ruan <ruansy.fnst@...itsu.com>,
        Vishal Verma <vishal.l.verma@...el.com>,
        Dave Jiang <dave.jiang@...el.com>,
        Ira Weiny <ira.weiny@...el.com>,
        Matthew Wilcox <willy@...radead.org>, Jan Kara <jack@...e.cz>,
        Andrew Morton <akpm@...ux-foundation.org>,
        david <david@...morbit.com>,
        linux-fsdevel <linux-fsdevel@...r.kernel.org>,
        Linux Kernel Mailing List <linux-kernel@...r.kernel.org>,
        Linux MM <linux-mm@...ck.org>,
        linux-nvdimm <linux-nvdimm@...ts.01.org>
Subject: Re: [PATCH 3/3] mm/devmap: Remove pgmap accounting in the
 get_user_pages_fast() path

On 3/24/21 7:00 PM, Joao Martins wrote:
> On 3/24/21 5:45 PM, Dan Williams wrote:
>> On Thu, Mar 18, 2021 at 3:02 AM Joao Martins <joao.m.martins@...cle.com> wrote:
>>> On 3/18/21 4:08 AM, Dan Williams wrote:
>>>> Now that device-dax and filesystem-dax are guaranteed to unmap all user
>>>> mappings of devmap / DAX pages before tearing down the 'struct page'
>>>> array, get_user_pages_fast() can rely on its traditional synchronization
>>>> method "validate_pte(); get_page(); revalidate_pte()" to catch races with
>>>> device shutdown. Specifically the unmap guarantee ensures that gup-fast
>>>> either succeeds in taking a page reference (lock-less), or it detects a
>>>> need to fall back to the slow path where the device presence can be
>>>> revalidated with locks held.
>>>
>>> [...]
>>>
>>> So for allowing FOLL_LONGTERM[0] would it be OK if we used page->pgmap after
>>> try_grab_page() for checking pgmap type to see if we are in a device-dax
>>> longterm pin?
>>
>> So, there is an effort to add a new pte bit p{m,u}d_special to disable
>> gup-fast for huge pages [1]. I'd like to investigate whether we could
>> use devmap + special as an encoding for "no longterm" and never
>> consult the pgmap in the gup-fast path.
>>
>> [1]: https://lore.kernel.org/linux-mm/a1fa7fa2-914b-366d-9902-e5b784e8428c@shipmail.org/
>>
> 
> Oh, nice! That would be ideal indeed, as we would skip the pgmap lookup enterily.
> 
> I suppose device-dax would use pfn_t PFN_MAP while fs-dax memory device would set PFN_MAP
> | PFN_DEV (provided vmf_insert_pfn_{pmd,pud} calls mkspecial on PFN_DEV).
> 
> I haven't been following that thread, but for PMD/PUD special in vmf_* these might be useful:
> 
> https://lore.kernel.org/linux-mm/20200110190313.17144-2-joao.m.martins@oracle.com/
> https://lore.kernel.org/linux-mm/20200110190313.17144-4-joao.m.martins@oracle.com/
> 

On a second thought, maybe it doesn't need to be that complicated for {fs,dev}dax if the
added special bit is just a subcase of devmap pte/pmd/puds. See below scsissors mark as a
rough estimation on the changes (nothing formal/proper as it isn't properly splitted).
Running gup_test with devdax/fsdax FOLL_LONGTERM and without does the intended. (gup_test
-m 16384 -r 10 -a -S -n 512 -w -f <file> [-F 0x10000]).

Unless this is about using special PMD/PUD bits without page structs (thus without devmap
bits) as in the previous two links.

-- >8 --

Subject: mm/gup, nvdimm: allow FOLL_LONGTERM for device-dax gup-fast

Signed-off-by: Joao Martins <joao.m.martins@...cle.com>

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 47027796c2f9..8b5d68d89cde 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -439,11 +439,16 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd)

 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define pmd_devmap(pmd)		pte_devmap(pmd_pte(pmd))
+#define pmd_special(pmd)	pte_special(pmd_pte(pmd))
 #endif
 static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 {
 	return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_DEVMAP)));
 }
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pte_pmd(set_pte_bit(pmd_pte(pmd), __pgprot(PTE_SPECIAL)));
+}

 #define __pmd_to_phys(pmd)	__pte_to_phys(pmd_pte(pmd))
 #define __phys_to_pmd_val(phys)	__phys_to_pte_val(phys)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index b1099f2d9800..45449ee86d4f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -259,13 +259,13 @@ static inline int pmd_large(pmd_t pte)
 /* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */
 static inline int pmd_trans_huge(pmd_t pmd)
 {
-	return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+	return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP|_PAGE_SPECIAL)) == _PAGE_PSE;
 }

 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static inline int pud_trans_huge(pud_t pud)
 {
-	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
+	return (pud_val(pud) & (_PAGE_PSE|_PAGE_DEVMAP|_PAGE_SPECIAL)) == _PAGE_PSE;
 }
 #endif

@@ -297,6 +297,19 @@ static inline int pgd_devmap(pgd_t pgd)
 {
 	return 0;
 }
+
+static inline bool pmd_special(pmd_t pmd)
+{
+	return !!(pmd_flags(pmd) & _PAGE_SPECIAL);
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static inline bool pud_special(pud_t pud)
+{
+	return !!(pud_flags(pud) & _PAGE_SPECIAL);
+}
+#endif
+
 #endif
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

@@ -452,6 +465,11 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
 	return pmd_set_flags(pmd, _PAGE_DEVMAP);
 }

+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pmd_set_flags(pmd, _PAGE_SPECIAL);
+}
+
 static inline pmd_t pmd_mkhuge(pmd_t pmd)
 {
 	return pmd_set_flags(pmd, _PAGE_PSE);
@@ -511,6 +529,11 @@ static inline pud_t pud_mkhuge(pud_t pud)
 	return pud_set_flags(pud, _PAGE_PSE);
 }

+static inline pud_t pud_mkspecial(pud_t pud)
+{
+	return pud_set_flags(pud, _PAGE_SPECIAL);
+}
+
 static inline pud_t pud_mkyoung(pud_t pud)
 {
 	return pud_set_flags(pud, _PAGE_ACCESSED);
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 16760b237229..156ceae33164 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -435,7 +435,7 @@ static int pmem_attach_disk(struct device *dev,
 		pmem->data_offset = le64_to_cpu(pfn_sb->dataoff);
 		pmem->pfn_pad = resource_size(res) -
 			range_len(&pmem->pgmap.range);
-		pmem->pfn_flags |= PFN_MAP;
+		pmem->pfn_flags |= PFN_MAP | PFN_SPECIAL;
 		bb_range = pmem->pgmap.range;
 		bb_range.start += pmem->data_offset;
 	} else if (pmem_should_map_pages(dev)) {
@@ -445,7 +445,7 @@ static int pmem_attach_disk(struct device *dev,
 		pmem->pgmap.type = MEMORY_DEVICE_FS_DAX;
 		pmem->pgmap.ops = &fsdax_pagemap_ops;
 		addr = devm_memremap_pages(dev, &pmem->pgmap);
-		pmem->pfn_flags |= PFN_MAP;
+		pmem->pfn_flags |= PFN_MAP | PFN_SPECIAL;
 		bb_range = pmem->pgmap.range;
 	} else {
 		if (devm_add_action_or_reset(dev, pmem_release_queue,
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 4ee6f734ba83..873c8e53c85d 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -748,7 +748,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev,
pgoff_t pgoff,
 		*kaddr = fs->window_kaddr + offset;
 	if (pfn)
 		*pfn = phys_to_pfn_t(fs->window_phys_addr + offset,
-					PFN_DEV | PFN_MAP);
+					PFN_DEV | PFN_MAP | PFN_SPECIAL);
 	return nr_pages > max_nr_pages ? max_nr_pages : nr_pages;
 }

diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 7364e5a70228..ad7078e38ef2 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1189,6 +1189,18 @@ static inline int pgd_devmap(pgd_t pgd)
 }
 #endif

+#if !defined(CONFIG_ARCH_HAS_PTE_SPECIAL) || !defined(CONFIG_TRANSPARENT_HUGEPAGE)
+static inline bool pmd_special(pmd_t pmd)
+{
+	return false;
+}
+
+static inline pmd_t pmd_mkspecial(pmd_t pmd)
+{
+	return pmd;
+}
+#endif
+
 #if !defined(CONFIG_TRANSPARENT_HUGEPAGE) || \
 	(defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
 	 !defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD))
@@ -1196,6 +1208,14 @@ static inline int pud_trans_huge(pud_t pud)
 {
 	return 0;
 }
+static inline bool pud_special(pud_t pud)
+{
+	return false;
+}
+static inline pud_t pud_mkspecial(pud_t pud)
+{
+	return pud;
+}
 #endif

 /* See pmd_none_or_trans_huge_or_clear_bad for discussion. */
diff --git a/mm/gup.c b/mm/gup.c
index b3e647c8b7ee..87aa229a9347 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2086,7 +2086,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned
long end,
 			goto pte_unmap;

 		if (pte_devmap(pte)) {
-			if (unlikely(flags & FOLL_LONGTERM))
+			if (unlikely(flags & FOLL_LONGTERM) && pte_special(pte))
 				goto pte_unmap;

 			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
@@ -2338,7 +2338,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		return 0;

 	if (pmd_devmap(orig)) {
-		if (unlikely(flags & FOLL_LONGTERM))
+		if (unlikely(flags & FOLL_LONGTERM) && pmd_special(orig))
 			return 0;
 		return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
 					     pages, nr);
@@ -2372,7 +2372,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		return 0;

 	if (pud_devmap(orig)) {
-		if (unlikely(flags & FOLL_LONGTERM))
+		if (unlikely(flags & FOLL_LONGTERM) && pud_special(orig))
 			return 0;
 		return __gup_device_huge_pud(orig, pudp, addr, end, flags,
 					     pages, nr);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f6f70632fc29..9d5117711919 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -796,8 +796,11 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long
addr,
 	}

 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
-	if (pfn_t_devmap(pfn))
+	if (pfn_t_devmap(pfn)) {
 		entry = pmd_mkdevmap(entry);
+		if (pfn_t_special(pfn))
+			entry = pmd_mkspecial(entry);
+	}
 	if (write) {
 		entry = pmd_mkyoung(pmd_mkdirty(entry));
 		entry = maybe_pmd_mkwrite(entry, vma);
@@ -896,8 +899,11 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long
addr,
 	}

 	entry = pud_mkhuge(pfn_t_pud(pfn, prot));
-	if (pfn_t_devmap(pfn))
+	if (pfn_t_devmap(pfn)) {
 		entry = pud_mkdevmap(entry);
+		if (pfn_t_special(pfn))
+			entry = pud_mkspecial(entry);
+	}
 	if (write) {
 		entry = pud_mkyoung(pud_mkdirty(entry));
 		entry = maybe_pud_mkwrite(entry, vma);

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ