Message-ID: <5ed7d46b-ae26-43f2-81e0-91e3cfc0218a@amd.com>
Date: Mon, 22 Jan 2024 10:18:01 +0100
From: Christian König <christian.koenig@....com>
To: Xianrong Zhou <Xianrong.Zhou@....com>, linux-mm@...ck.org,
linux-kernel@...r.kernel.org
Cc: luto@...nel.org, tglx@...utronix.de, mingo@...hat.com, bp@...en8.de,
dave.hansen@...ux.intel.com, x86@...nel.org, hpa@...or.com,
alexander.deucher@....com, Xinhui.Pan@....com, airlied@...il.com,
daniel@...ll.ch, jani.nikula@...ux.intel.com,
joonas.lahtinen@...ux.intel.com, rodrigo.vivi@...el.com,
tvrtko.ursulin@...ux.intel.com, kherbst@...hat.com, lyude@...hat.com,
dakr@...hat.com, ray.huang@....com, maarten.lankhorst@...ux.intel.com,
mripard@...nel.org, tzimmermann@...e.de, zack.rusin@...adcom.com,
bcm-kernel-feedback-list@...adcom.com, akpm@...ux-foundation.org,
Felix.Kuehling@....com, Philip.Yang@....com, srinivasan.shanmugam@....com,
surenb@...gle.com, James.Zhu@....com, matthew.auld@...el.com,
nirmoy.das@...el.com, lee@...nel.org, amd-gfx@...ts.freedesktop.org,
dri-devel@...ts.freedesktop.org, intel-gfx@...ts.freedesktop.org,
nouveau@...ts.freedesktop.org, guoqing.zhang@....com, huazeng.li@....com
Subject: Re: [PATCH] mm: Remove double faults once write a device pfn
On 22.01.24 at 04:32, Xianrong Zhou wrote:
> vmf_insert_pfn_prot() can cause an unnecessary double fault on a
> device pfn, because it currently does not make the pfn writable;
> the resulting pte entry is normally read-only, i.e. write-protected
> for dirty tracking.
What? How did you get to this conclusion?
> The first fault only sets up a write-protected pte entry. When the
> CPU re-executes the store instruction, it immediately takes a second
> fault on the same pfn, this time only to make the pte writable.
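> As a rough sketch (hypothetical fault handler; foo_get_pfn() is a
> made-up helper), the sequence described above is:
>
>   static vm_fault_t foo_fault(struct vm_fault *vmf)
>   {
>           unsigned long pfn = foo_get_pfn(vmf);
>
>           /* Fault 1: inserts the pte write-protected. */
>           return vmf_insert_pfn_prot(vmf->vma, vmf->address, pfn,
>                                      vmf->vma->vm_page_prot);
>           /*
>            * Fault 2: the CPU re-executes the store and faults again,
>            * this time only to make the same pte writable.
>            */
>   }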
It could be that this is done to work around some hw behavior, but
not because of dirty tracking.
> If a driver calls vmf_insert_pfn_prot() and also supplies a
> 'pfn_mkwrite' callback in its vm_operations_struct, i.e. it requires
> write-protected ptes for dirty tracking, then vmf_insert_pfn_prot()
> and the double fault are intentional. That is not a problem.
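> Such a driver looks roughly like this (hypothetical ops table;
> foo_fault() and foo_pfn_mkwrite() are made-up names):
>
>   static const struct vm_operations_struct foo_vm_ops = {
>           .fault       = foo_fault,
>           .pfn_mkwrite = foo_pfn_mkwrite, /* second fault lands here */
>   };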
Well, as far as I can see that behavior absolutely doesn't make sense.
When pfn_mkwrite is requested, the driver should use PAGE_COPY, which
is exactly what VMWGFX (the only driver using dirty tracking) is
doing.
Everybody else uses PAGE_SHARED, which should make the pte writable
immediately.
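In other words, the choice boils down to this sketch (dirty_tracking
is a stand-in condition; the patch itself does the same thing in
vmwgfx_page_dirty.c below):

  pgprot_t prot;

  if (dirty_tracking) /* the VMW_BO_DIRTY_MKWRITE case */
          prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED); /* PAGE_COPY */
  else
          prot = vm_get_page_prot(vma->vm_flags); /* PAGE_SHARED */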
Regards,
Christian.
>
> However, most drivers calling vmf_insert_pfn_prot do not supply a
> 'pfn_mkwrite' callback, so for them the second fault is unnecessary.
>
> So, just like the vmf_insert_mixed and vmf_insert_mixed_mkwrite
> pair, we should also supply vmf_insert_pfn_mkwrite for drivers.
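> With that helper, a driver that never needs dirty tracking can map
> the pfn writable in a single fault, e.g. (sketch):
>
>   ret = vmf_insert_pfn_mkwrite(vma, vmf->address, pfn);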
>
> Signed-off-by: Xianrong Zhou <Xianrong.Zhou@....com>
> ---
> arch/x86/entry/vdso/vma.c | 3 ++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +-
> drivers/gpu/drm/i915/gem/i915_gem_ttm.c | 2 +-
> drivers/gpu/drm/nouveau/nouveau_gem.c | 2 +-
> drivers/gpu/drm/radeon/radeon_gem.c | 2 +-
> drivers/gpu/drm/ttm/ttm_bo_vm.c | 8 +++++---
> drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c | 8 +++++---
> include/drm/ttm/ttm_bo.h | 3 ++-
> include/linux/mm.h | 2 +-
> mm/memory.c | 14 +++++++++++---
> 10 files changed, 30 insertions(+), 16 deletions(-)
>
> diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
> index 7645730dc228..dd2431c2975f 100644
> --- a/arch/x86/entry/vdso/vma.c
> +++ b/arch/x86/entry/vdso/vma.c
> @@ -185,7 +185,8 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
> if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
> return vmf_insert_pfn_prot(vma, vmf->address,
> __pa(pvti) >> PAGE_SHIFT,
> - pgprot_decrypted(vma->vm_page_prot));
> + pgprot_decrypted(vma->vm_page_prot),
> + true);
> }
> } else if (sym_offset == image->sym_hvclock_page) {
> pfn = hv_get_tsc_pfn();
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> index 49a5f1c73b3e..adcb20d9e624 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
> @@ -64,7 +64,7 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
> }
>
> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> - TTM_BO_VM_NUM_PREFAULT);
> + TTM_BO_VM_NUM_PREFAULT, true);
>
> drm_dev_exit(idx);
> } else {
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> index 9227f8146a58..c6f13ae6c308 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_ttm.c
> @@ -1114,7 +1114,7 @@ static vm_fault_t vm_fault_ttm(struct vm_fault *vmf)
>
> if (drm_dev_enter(dev, &idx)) {
> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> - TTM_BO_VM_NUM_PREFAULT);
> + TTM_BO_VM_NUM_PREFAULT, true);
> drm_dev_exit(idx);
> } else {
> ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index 49c2bcbef129..7e1453762ec9 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -56,7 +56,7 @@ static vm_fault_t nouveau_ttm_fault(struct vm_fault *vmf)
>
> nouveau_bo_del_io_reserve_lru(bo);
> prot = vm_get_page_prot(vma->vm_flags);
> - ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT);
> + ret = ttm_bo_vm_fault_reserved(vmf, prot, TTM_BO_VM_NUM_PREFAULT, true);
> nouveau_bo_add_io_reserve_lru(bo);
> if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> return ret;
> diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c
> index 3fec3acdaf28..b21cf00ae162 100644
> --- a/drivers/gpu/drm/radeon/radeon_gem.c
> +++ b/drivers/gpu/drm/radeon/radeon_gem.c
> @@ -62,7 +62,7 @@ static vm_fault_t radeon_gem_fault(struct vm_fault *vmf)
> goto unlock_resv;
>
> ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
> - TTM_BO_VM_NUM_PREFAULT);
> + TTM_BO_VM_NUM_PREFAULT, true);
> if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> goto unlock_mclk;
>
> diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> index 4212b8c91dd4..7d14a7d267aa 100644
> --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c
> +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c
> @@ -167,6 +167,7 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
> * @num_prefault: Maximum number of prefault pages. The caller may want to
> * specify this based on madvice settings and the size of the GPU object
> * backed by the memory.
> + * @mkwrite: make the pfn or page writable
> *
> * This function inserts one or more page table entries pointing to the
> * memory backing the buffer object, and then returns a return code
> @@ -180,7 +181,8 @@ EXPORT_SYMBOL(ttm_bo_vm_reserve);
> */
> vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> pgprot_t prot,
> - pgoff_t num_prefault)
> + pgoff_t num_prefault,
> + bool mkwrite)
> {
> struct vm_area_struct *vma = vmf->vma;
> struct ttm_buffer_object *bo = vma->vm_private_data;
> @@ -263,7 +265,7 @@ vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> * at arbitrary times while the data is mmap'ed.
> * See vmf_insert_pfn_prot() for a discussion.
> */
> - ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> + ret = vmf_insert_pfn_prot(vma, address, pfn, prot, mkwrite);
>
> /* Never error on prefaulted PTEs */
> if (unlikely((ret & VM_FAULT_ERROR))) {
> @@ -312,7 +314,7 @@ vm_fault_t ttm_bo_vm_dummy_page(struct vm_fault *vmf, pgprot_t prot)
> /* Prefault the entire VMA range right away to avoid further faults */
> for (address = vma->vm_start; address < vma->vm_end;
> address += PAGE_SIZE)
> - ret = vmf_insert_pfn_prot(vma, address, pfn, prot);
> + ret = vmf_insert_pfn_prot(vma, address, pfn, prot, true);
>
> return ret;
> }
> diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> index 74ff2812d66a..bb8e4b641681 100644
> --- a/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_page_dirty.c
> @@ -452,12 +452,14 @@ vm_fault_t vmw_bo_vm_fault(struct vm_fault *vmf)
> * sure the page protection is write-enabled so we don't get
> * a lot of unnecessary write faults.
> */
> - if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE)
> + if (vbo->dirty && vbo->dirty->method == VMW_BO_DIRTY_MKWRITE) {
> prot = vm_get_page_prot(vma->vm_flags & ~VM_SHARED);
> - else
> + ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, false);
> + } else {
> prot = vm_get_page_prot(vma->vm_flags);
> + ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault, true);
> + }
>
> - ret = ttm_bo_vm_fault_reserved(vmf, prot, num_prefault);
> if (ret == VM_FAULT_RETRY && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT))
> return ret;
>
> diff --git a/include/drm/ttm/ttm_bo.h b/include/drm/ttm/ttm_bo.h
> index 0223a41a64b2..66e293db69ee 100644
> --- a/include/drm/ttm/ttm_bo.h
> +++ b/include/drm/ttm/ttm_bo.h
> @@ -386,7 +386,8 @@ vm_fault_t ttm_bo_vm_reserve(struct ttm_buffer_object *bo,
> struct vm_fault *vmf);
> vm_fault_t ttm_bo_vm_fault_reserved(struct vm_fault *vmf,
> pgprot_t prot,
> - pgoff_t num_prefault);
> + pgoff_t num_prefault,
> + bool mkwrite);
> vm_fault_t ttm_bo_vm_fault(struct vm_fault *vmf);
> void ttm_bo_vm_open(struct vm_area_struct *vma);
> void ttm_bo_vm_close(struct vm_area_struct *vma);
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f5a97dec5169..f8868e28ea04 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -3553,7 +3553,7 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
> vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> unsigned long pfn);
> vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> - unsigned long pfn, pgprot_t pgprot);
> + unsigned long pfn, pgprot_t pgprot, bool mkwrite);
> vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
> pfn_t pfn);
> vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
> diff --git a/mm/memory.c b/mm/memory.c
> index 7e1f4849463a..2c28f1a349ff 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -2195,6 +2195,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> * @addr: target user address of this page
> * @pfn: source kernel pfn
> * @pgprot: pgprot flags for the inserted page
> + * @mkwrite: make the pfn writable
> *
> * This is exactly like vmf_insert_pfn(), except that it allows drivers
> * to override pgprot on a per-page basis.
> @@ -2223,7 +2224,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> * Return: vm_fault_t value.
> */
> vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> - unsigned long pfn, pgprot_t pgprot)
> + unsigned long pfn, pgprot_t pgprot, bool mkwrite)
> {
> /*
> * Technically, architectures with pte_special can avoid all these
> @@ -2246,7 +2247,7 @@ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
> track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
>
> return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
> - false);
> + mkwrite);
> }
> EXPORT_SYMBOL(vmf_insert_pfn_prot);
>
> @@ -2273,10 +2274,17 @@ EXPORT_SYMBOL(vmf_insert_pfn_prot);
> vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
> unsigned long pfn)
> {
> - return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
> + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot, false);
> }
> EXPORT_SYMBOL(vmf_insert_pfn);
>
> +vm_fault_t vmf_insert_pfn_mkwrite(struct vm_area_struct *vma, unsigned long addr,
> + unsigned long pfn)
> +{
> + return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot, true);
> +}
> +EXPORT_SYMBOL(vmf_insert_pfn_mkwrite);
> +
> static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
> {
> /* these checks mirror the abort conditions in vm_normal_page */