[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <cf2d7d64-8872-2b22-0b2e-8db96135d3ef@nvidia.com>
Date: Mon, 22 May 2023 17:55:52 -0700
From: John Hubbard <jhubbard@...dia.com>
To: Alistair Popple <apopple@...dia.com>,
Andrew Morton <akpm@...ux-foundation.org>
CC: <linux-mm@...ck.org>, <linux-kernel@...r.kernel.org>,
<robin.murphy@....com>, <will@...nel.org>, <nicolinc@...dia.com>,
<linux-arm-kernel@...ts.infradead.org>, <kvm@...r.kernel.org>,
<kvmarm@...ts.cs.columbia.edu>, <jgg@...dia.com>
Subject: Re: [PATCH] mmu_notifiers: Notify on pte permission upgrades
On 5/21/23 23:37, Alistair Popple wrote:
> Some architectures, specifically ARM and perhaps Sparc and IA64,
> require TLB invalidates when upgrading pte permission from read-only
> to read-write.
>
> The current mmu_notifier implementation assumes that upgrades do not
> need notifications. Typically though mmu_notifiers are used to
> implement TLB invalidations for secondary MMUs that comply with the
> main CPU architecture.
>
> Therefore if the main CPU architecture requires an invalidation for
> permission upgrade the secondary MMU will as well and an mmu_notifier
> should be sent for the upgrade.
>
> Currently CPU invalidations for permission upgrade occur in
> ptep_set_access_flags(). Unfortunately MMU notifiers cannot be called
> directly from this architecture specific code as the notifier
> callbacks can sleep, and ptep_set_access_flags() is usually called
> whilst holding the PTL spinlock. Therefore add the notifier calls
> after the PTL is dropped and only if the PTE actually changed. This
> will allow secondary MMUs to obtain an updated PTE with appropriate
> permissions.
>
> This problem was discovered during testing of an ARM SMMU
> implementation that does not support broadcast TLB maintenance
> (BTM). In this case the SMMU driver uses notifiers to issue TLB
> invalidates. For read-only to read-write pte upgrades the SMMU
> continually returned a read-only PTE to the device, even though the
> CPU had a read-write PTE installed.
>
> Sending a mmu notifier event to the SMMU driver fixes the problem by
> flushing secondary TLB entries. A new notifier event type is added so
> drivers may filter out these invalidations if not required. Note a
> driver should never upgrade or install a PTE in response to this mmu
> notifier event as it is not synchronised against other PTE operations.
>
> Signed-off-by: Alistair Popple <apopple@...dia.com>
> ---
> include/linux/mmu_notifier.h | 6 +++++
> mm/hugetlb.c | 24 ++++++++++++++++++-
> mm/memory.c | 45 ++++++++++++++++++++++++++++++++++--
> 3 files changed, 72 insertions(+), 3 deletions(-)
Point of order:
What is this based on? It would really help if you would either use --base
with git-format-patch, or else just mention the base somewhere. Otherwise,
actually applying the patch takes some hunting around...in particular for
older stuff. This is from Feb, 2023, before hugetlb.c got converted to
folios, right?
thanks,
--
John Hubbard
NVIDIA
>
> diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
> index d6c06e140277..f14d68f119d8 100644
> --- a/include/linux/mmu_notifier.h
> +++ b/include/linux/mmu_notifier.h
> @@ -31,6 +31,11 @@ struct mmu_interval_notifier;
> * pages in the range so to mirror those changes the user must inspect the CPU
> * page table (from the end callback).
> *
> + * @MMU_NOTIFY_PROTECTION_UPGRAGE: update is due to a change from read-only to
> + * read-write for pages in the range. This must not be used to upgrade
> + * permissions on secondary PTEs, rather it should only be used to invalidate
> + * caches such as secondary TLBs that may cache old read-only entries.
> + *
> * @MMU_NOTIFY_SOFT_DIRTY: soft dirty accounting (still same page and same
> * access flags). User should soft dirty the page in the end callback to make
> * sure that anyone relying on soft dirtiness catch pages that might be written
> @@ -53,6 +58,7 @@ enum mmu_notifier_event {
> MMU_NOTIFY_CLEAR,
> MMU_NOTIFY_PROTECTION_VMA,
> MMU_NOTIFY_PROTECTION_PAGE,
> + MMU_NOTIFY_PROTECTION_UPGRADE,
> MMU_NOTIFY_SOFT_DIRTY,
> MMU_NOTIFY_RELEASE,
> MMU_NOTIFY_MIGRATE,
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index bdbfeb6fb393..e5d467c7bff7 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5987,6 +5987,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> vm_fault_t ret;
> u32 hash;
> pgoff_t idx;
> + bool changed = false;
> struct page *page = NULL;
> struct page *pagecache_page = NULL;
> struct hstate *h = hstate_vma(vma);
> @@ -6122,6 +6123,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> if (!huge_pte_write(entry)) {
> ret = hugetlb_wp(mm, vma, address, ptep, flags,
> pagecache_page, ptl);
> + if (!ret)
> + changed = true;
> +
> goto out_put_page;
> } else if (likely(flags & FAULT_FLAG_WRITE)) {
> entry = huge_pte_mkdirty(entry);
> @@ -6129,8 +6133,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> }
> entry = pte_mkyoung(entry);
> if (huge_ptep_set_access_flags(vma, haddr, ptep, entry,
> - flags & FAULT_FLAG_WRITE))
> + flags & FAULT_FLAG_WRITE)) {
> update_mmu_cache(vma, haddr, ptep);
> + changed = true;
> + }
> +
> out_put_page:
> if (page != pagecache_page)
> unlock_page(page);
> @@ -6138,6 +6145,21 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
> out_ptl:
> spin_unlock(ptl);
>
> + if (changed) {
> + struct mmu_notifier_range range;
> + unsigned long hpage_mask = huge_page_mask(h);
> + unsigned long hpage_size = huge_page_size(h);
> +
> + update_mmu_cache(vma, haddr, ptep);
> +
> + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_UPGRADE,
> + 0, vma, mm, haddr & hpage_mask,
> + (haddr & hpage_mask) + hpage_size);
> + mmu_notifier_invalidate_range_start(&range);
> + mmu_notifier_invalidate_range_end(&range);
> + }
> +
> +
> if (pagecache_page) {
> unlock_page(pagecache_page);
> put_page(pagecache_page);
> diff --git a/mm/memory.c b/mm/memory.c
> index f526b9152bef..0ac78c6a232c 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2098,6 +2098,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> struct mm_struct *mm = vma->vm_mm;
> pte_t *pte, entry;
> spinlock_t *ptl;
> + bool changed = false;
>
> pte = get_locked_pte(mm, addr, &ptl);
> if (!pte)
> @@ -2120,8 +2121,10 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> }
> entry = pte_mkyoung(*pte);
> entry = maybe_mkwrite(pte_mkdirty(entry), vma);
> - if (ptep_set_access_flags(vma, addr, pte, entry, 1))
> + if (ptep_set_access_flags(vma, addr, pte, entry, 1)) {
> update_mmu_cache(vma, addr, pte);
> + changed = true;
> + }
> }
> goto out_unlock;
> }
> @@ -2142,6 +2145,17 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
>
> out_unlock:
> pte_unmap_unlock(pte, ptl);
> +
> + if (changed) {
> + struct mmu_notifier_range range;
> +
> + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_UPGRADE,
> + 0, vma, mm, addr & PAGE_MASK,
> + (addr & PAGE_MASK) + PAGE_SIZE);
> + mmu_notifier_invalidate_range_start(&range);
> + mmu_notifier_invalidate_range_end(&range);
> + }
> +
> return VM_FAULT_NOPAGE;
> }
>
> @@ -2820,6 +2834,7 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
> struct vm_area_struct *vma = vmf->vma;
> struct mm_struct *mm = vma->vm_mm;
> unsigned long addr = vmf->address;
> + bool changed = false;
>
> if (likely(src)) {
> if (copy_mc_user_highpage(dst, src, addr, vma)) {
> @@ -2858,8 +2873,10 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
> }
>
> entry = pte_mkyoung(vmf->orig_pte);
> - if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
> + if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) {
> update_mmu_cache(vma, addr, vmf->pte);
> + changed = true;
> + }
> }
>
> /*
> @@ -2897,6 +2914,16 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
> }
> }
>
> + if (changed) {
> + struct mmu_notifier_range range;
> +
> + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_UPGRADE,
> + 0, vma, vma->vm_mm, addr & PAGE_MASK,
> + (addr & PAGE_MASK) + PAGE_SIZE);
> + mmu_notifier_invalidate_range_start(&range);
> + mmu_notifier_invalidate_range_end(&range);
> + }
> +
> ret = 0;
>
> pte_unlock:
> @@ -4877,6 +4904,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
> static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
> {
> pte_t entry;
> + bool changed = false;
>
> if (unlikely(pmd_none(*vmf->pmd))) {
> /*
> @@ -4957,6 +4985,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
> if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
> vmf->flags & FAULT_FLAG_WRITE)) {
> update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
> + changed = true;
> } else {
> /* Skip spurious TLB flush for retried page fault */
> if (vmf->flags & FAULT_FLAG_TRIED)
> @@ -4972,6 +5001,18 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
> }
> unlock:
> pte_unmap_unlock(vmf->pte, vmf->ptl);
> +
> + if (changed) {
> + struct mmu_notifier_range range;
> +
> + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_UPGRADE,
> + 0, vmf->vma, vmf->vma->vm_mm,
> + vmf->address & PAGE_MASK,
> + (vmf->address & PAGE_MASK) + PAGE_SIZE);
> + mmu_notifier_invalidate_range_start(&range);
> + mmu_notifier_invalidate_range_end(&range);
> + }
> +
> return 0;
> }
>
Powered by blists - more mailing lists