linux-kernel - Re: [PATCH] drm/amdgpu: cache in more vm fault information

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <2f792620-fd8a-412e-9130-e276ba36d5a0@amd.com>
Date: Wed, 6 Mar 2024 11:07:01 +0100
From: Christian König <christian.koenig@....com>
To: Sunil Khatri <sunil.khatri@....com>,
 Alex Deucher <alexander.deucher@....com>,
 Shashank Sharma <shashank.sharma@....com>
Cc: amd-gfx@...ts.freedesktop.org, Pan@...-sunil-navi33.amd.com,
 Xinhui <Xinhui.Pan@....com>, dri-devel@...ts.freedesktop.org,
 linux-kernel@...r.kernel.org, Mukul Joshi <mukul.joshi@....com>,
 Arunpravin Paneer Selvam <Arunpravin.PaneerSelvam@....com>
Subject: Re: [PATCH] drm/amdgpu: cache in more vm fault information

Am 06.03.24 um 10:04 schrieb Sunil Khatri:
> When a page fault interrupt is raised there
> is a lot more information that is useful for
> developers to analyse the page fault.

Well, actually that information is not that interesting because it 
is hw generation specific.

You should probably rather use the decoded strings here, e.g. hub, 
client, xcc_id, node_id etc...

See gmc_v9_0_process_interrupt() for an example.

Regards,
Christian.

>
> Add all such information to the last cached
> page fault from the interrupt handler.
>
> Signed-off-by: Sunil Khatri <sunil.khatri@....com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 9 +++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h | 7 ++++++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 2 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 2 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 2 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 2 +-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 2 +-
>   7 files changed, 18 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> index 4299ce386322..b77e8e28769d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> @@ -2905,7 +2905,7 @@ void amdgpu_debugfs_vm_bo_info(struct amdgpu_vm *vm, struct seq_file *m)
>    * Cache the fault info for later use by userspace in debugging.
>    */
>   void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
> -				  unsigned int pasid,
> +				  struct amdgpu_iv_entry *entry,
>   				  uint64_t addr,
>   				  uint32_t status,
>   				  unsigned int vmhub)
> @@ -2915,7 +2915,7 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
>   
>   	xa_lock_irqsave(&adev->vm_manager.pasids, flags);
>   
> -	vm = xa_load(&adev->vm_manager.pasids, pasid);
> +	vm = xa_load(&adev->vm_manager.pasids, entry->pasid);
>   	/* Don't update the fault cache if status is 0.  In the multiple
>   	 * fault case, subsequent faults will return a 0 status which is
>   	 * useless for userspace and replaces the useful fault status, so
> @@ -2924,6 +2924,11 @@ void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
>   	if (vm && status) {
>   		vm->fault_info.addr = addr;
>   		vm->fault_info.status = status;
> +		vm->fault_info.client_id = entry->client_id;
> +		vm->fault_info.src_id = entry->src_id;
> +		vm->fault_info.vmid = entry->vmid;
> +		vm->fault_info.pasid = entry->pasid;
> +		vm->fault_info.ring_id = entry->ring_id;
>   		if (AMDGPU_IS_GFXHUB(vmhub)) {
>   			vm->fault_info.vmhub = AMDGPU_VMHUB_TYPE_GFX;
>   			vm->fault_info.vmhub |=
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> index 047ec1930d12..c7782a89bdb5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> @@ -286,6 +286,11 @@ struct amdgpu_vm_fault_info {
>   	uint32_t	status;
>   	/* which vmhub? gfxhub, mmhub, etc. */
>   	unsigned int	vmhub;
> +	unsigned int	client_id;
> +	unsigned int	src_id;
> +	unsigned int	ring_id;
> +	unsigned int	pasid;
> +	unsigned int	vmid;
>   };
>   
>   struct amdgpu_vm {
> @@ -605,7 +610,7 @@ static inline void amdgpu_vm_eviction_unlock(struct amdgpu_vm *vm)
>   }
>   
>   void amdgpu_vm_update_fault_cache(struct amdgpu_device *adev,
> -				  unsigned int pasid,
> +				  struct amdgpu_iv_entry *entry,
>   				  uint64_t addr,
>   				  uint32_t status,
>   				  unsigned int vmhub);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index d933e19e0cf5..6b177ce8db0e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -150,7 +150,7 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
>   		status = RREG32(hub->vm_l2_pro_fault_status);
>   		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>   
> -		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
> +		amdgpu_vm_update_fault_cache(adev, entry, addr, status,
>   					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 527dc917e049..bcf254856a3e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -121,7 +121,7 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device *adev,
>   		status = RREG32(hub->vm_l2_pro_fault_status);
>   		WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>   
> -		amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status,
> +		amdgpu_vm_update_fault_cache(adev, entry, addr, status,
>   					     entry->vmid_src ? AMDGPU_MMHUB0(0) : AMDGPU_GFXHUB(0));
>   	}
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> index 3da7b6a2b00d..e9517ebbe1fd 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> @@ -1270,7 +1270,7 @@ static int gmc_v7_0_process_interrupt(struct amdgpu_device *adev,
>   	if (!addr && !status)
>   		return 0;
>   
> -	amdgpu_vm_update_fault_cache(adev, entry->pasid,
> +	amdgpu_vm_update_fault_cache(adev, entry,
>   				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));
>   
>   	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index d20e5f20ee31..a271bf832312 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -1438,7 +1438,7 @@ static int gmc_v8_0_process_interrupt(struct amdgpu_device *adev,
>   	if (!addr && !status)
>   		return 0;
>   
> -	amdgpu_vm_update_fault_cache(adev, entry->pasid,
> +	amdgpu_vm_update_fault_cache(adev, entry,
>   				     ((u64)addr) << AMDGPU_GPU_PAGE_SHIFT, status, AMDGPU_GFXHUB(0));
>   
>   	if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_FIRST)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 47b63a4ce68b..dc9fb1fb9540 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -666,7 +666,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
>   	rw = REG_GET_FIELD(status, VM_L2_PROTECTION_FAULT_STATUS, RW);
>   	WREG32_P(hub->vm_l2_pro_fault_cntl, 1, ~1);
>   
> -	amdgpu_vm_update_fault_cache(adev, entry->pasid, addr, status, vmhub);
> +	amdgpu_vm_update_fault_cache(adev, entry, addr, status, vmhub);
>   
>   	dev_err(adev->dev,
>   		"VM_L2_PROTECTION_FAULT_STATUS:0x%08X\n",