[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20200903170844.GI1152540@nvidia.com>
Date: Thu, 3 Sep 2020 14:08:44 -0300
From: Jason Gunthorpe <jgg@...dia.com>
To: Matthew Wilcox <willy@...radead.org>
CC: Zi Yan <ziy@...dia.com>, <linux-mm@...ck.org>,
Roman Gushchin <guro@...com>, Rik van Riel <riel@...riel.com>,
"Kirill A . Shutemov" <kirill.shutemov@...ux.intel.com>,
Shakeel Butt <shakeelb@...gle.com>,
"Yang Shi" <yang.shi@...ux.alibaba.com>,
David Nellans <dnellans@...dia.com>,
<linux-kernel@...r.kernel.org>
Subject: Re: [RFC PATCH 00/16] 1GB THP support on x86_64
On Thu, Sep 03, 2020 at 05:55:59PM +0100, Matthew Wilcox wrote:
> On Thu, Sep 03, 2020 at 01:40:32PM -0300, Jason Gunthorpe wrote:
> > However if the sizeof(*pXX) is 8 on a 32 bit platform then load
> > tearing is a problem. At least the various pXX_*() test functions
> > operate on a single 32 bit word so don't tear, but to convert the
> > *pXX to a lower level page table pointer a coherent, untorn, read is
> > required.
> >
> > So, looking again, I remember now, I could never quite figure out why
> > gup_pmd_range() was safe to do:
> >
> > pmd_t pmd = READ_ONCE(*pmdp);
> > [..]
> > } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
> > [..]
> > ptem = ptep = pte_offset_map(&pmd, addr);
> >
> > As I don't see what prevents load tearing a 64 bit pmd.. Eg no
> > pmd_trans_unstable() or equivalent here.
>
> I don't think there are any 32-bit page tables which support a PUD-sized
> page. Pretty sure x86 doesn't until you get to 4- or 5- level page tables
> (which need you to be running in 64-bit mode). There's not much utility
> in having 1GB of your 3GB process address space taken up by a single page.
Makes sense for PUD, but why is the above GUP code OK for PMD?
pmd_trans_unstable() exists specifically to close read tearing races,
so it looks like a real problem?
> I'm OK if there are some oddball architectures which support it, but
> Linux doesn't.
So, based on that observation, I think something approximately like
this is needed for the page walker for PUD: (this has been on my
backlog to return to these patches..)
>From 00a361ecb2d9e1226600d9e78e6e1803a886f2d6 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@...lanox.com>
Date: Fri, 13 Mar 2020 13:15:36 -0300
Subject: [RFC] mm/pagewalk: use READ_ONCE when reading the PUD entry
unlocked
The pagewalker runs while only holding the mmap_sem for read. The pud can
be set asynchronously, while also holding the mmap_sem for read,
e.g. from:
handle_mm_fault()
__handle_mm_fault()
create_huge_pmd()
dev_dax_huge_fault()
__dev_dax_pud_fault()
vmf_insert_pfn_pud()
insert_pfn_pud()
pud_lock()
set_pud_at()
At least x86 sets the PUD using WRITE_ONCE(), so an unlocked read of
unstable data should be paired to use READ_ONCE().
For the pagewalker to work locklessly the PUD must work similarly to the
PMD: once the PUD entry becomes a pointer to a PMD, it must be stable, and
safe to pass to pmd_offset().
Passing the value from READ_ONCE into the callbacks prevents the callers
from seeing inconsistencies after they re-read, such as seeing pud_none().
If a callback does obtain the pud_lock then it should trigger ACTION_AGAIN
if a data race caused the original value to change.
Use the same pattern as gup_pmd_range() and pass in the address of the
local READ_ONCE stack variable to pmd_offset() to avoid reading it again.
Signed-off-by: Jason Gunthorpe <jgg@...lanox.com>
---
include/linux/pagewalk.h | 2 +-
mm/hmm.c | 16 +++++++---------
mm/mapping_dirty_helpers.c | 6 ++----
mm/pagewalk.c | 28 ++++++++++++++++------------
mm/ptdump.c | 3 +--
5 files changed, 27 insertions(+), 28 deletions(-)
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index b1cb6b753abb53..6caf28aadafbff 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -39,7 +39,7 @@ struct mm_walk_ops {
unsigned long next, struct mm_walk *walk);
int (*p4d_entry)(p4d_t *p4d, unsigned long addr,
unsigned long next, struct mm_walk *walk);
- int (*pud_entry)(pud_t *pud, unsigned long addr,
+ int (*pud_entry)(pud_t pud, pud_t *pudp, unsigned long addr,
unsigned long next, struct mm_walk *walk);
int (*pmd_entry)(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk);
diff --git a/mm/hmm.c b/mm/hmm.c
index 6d9da4b0f0a9f8..98ced96421b913 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -459,28 +459,26 @@ static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
range->flags[HMM_PFN_VALID];
}
-static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
- struct mm_walk *walk)
+static int hmm_vma_walk_pud(pud_t pud, pud_t *pudp, unsigned long start,
+ unsigned long end, struct mm_walk *walk)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long addr = start;
- pud_t pud;
int ret = 0;
spinlock_t *ptl = pud_trans_huge_lock(pudp, walk->vma);
if (!ptl)
return 0;
+ if (memcmp(pudp, &pud, sizeof(pud)) != 0) {
+ walk->action = ACTION_AGAIN;
+ spin_unlock(ptl);
+ return 0;
+ }
/* Normally we don't want to split the huge page */
walk->action = ACTION_CONTINUE;
- pud = READ_ONCE(*pudp);
- if (pud_none(pud)) {
- spin_unlock(ptl);
- return hmm_vma_walk_hole(start, end, -1, walk);
- }
-
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
uint64_t *pfns, cpu_flags;
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 71070dda9643d4..8943c2509ec0f7 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -125,12 +125,10 @@ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
}
/* wp_clean_pud_entry - The pagewalk pud callback. */
-static int wp_clean_pud_entry(pud_t *pud, unsigned long addr, unsigned long end,
- struct mm_walk *walk)
+static int wp_clean_pud_entry(pud_t pudval, pud_t *pudp, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
/* Dirty-tracking should be handled on the pte level */
- pud_t pudval = READ_ONCE(*pud);
-
if (pud_trans_huge(pudval) || pud_devmap(pudval))
WARN_ON(pud_write(pudval) || pud_dirty(pudval));
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 928df1638c30d1..cf99536cec23be 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -58,7 +58,7 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
return err;
}
-static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+static int walk_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
pmd_t *pmd;
@@ -67,7 +67,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
int err = 0;
int depth = real_depth(3);
- pmd = pmd_offset(pud, addr);
+ pmd = pmd_offset(&pud, addr);
do {
again:
next = pmd_addr_end(addr, end);
@@ -119,17 +119,19 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- pud_t *pud;
+ pud_t *pudp;
+ pud_t pud;
unsigned long next;
const struct mm_walk_ops *ops = walk->ops;
int err = 0;
int depth = real_depth(2);
- pud = pud_offset(p4d, addr);
+ pudp = pud_offset(p4d, addr);
do {
again:
+ pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
- if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
+ if (pud_none(pud) || (!walk->vma && !walk->no_vma)) {
if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
@@ -140,27 +142,29 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
walk->action = ACTION_SUBTREE;
if (ops->pud_entry)
- err = ops->pud_entry(pud, addr, next, walk);
+ err = ops->pud_entry(pud, pudp, addr, next, walk);
if (err)
break;
if (walk->action == ACTION_AGAIN)
goto again;
- if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) ||
+ if ((!walk->vma && (pud_leaf(pud) || !pud_present(pud))) ||
walk->action == ACTION_CONTINUE ||
!(ops->pmd_entry || ops->pte_entry))
continue;
- if (walk->vma)
- split_huge_pud(walk->vma, pud, addr);
- if (pud_none(*pud))
- goto again;
+ if (walk->vma) {
+ split_huge_pud(walk->vma, pudp, addr);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ goto again;
+ }
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
- } while (pud++, addr = next, addr != end);
+ } while (pudp++, addr = next, addr != end);
return err;
}
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 26208d0d03b7a9..c5e1717671e36a 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -59,11 +59,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return 0;
}
-static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
+static int ptdump_pud_entry(pud_t val, pud_t *pudp, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
- pud_t val = READ_ONCE(*pud);
#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
--
2.28.0
Powered by blists - more mailing lists