lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <5b9f24d0-00b3-4581-826d-0bce8c9349ce@amd.com>
Date:   Thu, 31 Aug 2023 08:31:17 +0200
From:   Christian König <christian.koenig@....com>
To:     André Almeida <andrealmeid@...lia.com>,
        dri-devel@...ts.freedesktop.org, amd-gfx@...ts.freedesktop.org,
        linux-kernel@...r.kernel.org
Cc:     kernel-dev@...lia.com, alexander.deucher@....com,
        pierre-eric.pelloux-prayer@....com,
        'Marek Olšák' <maraeo@...il.com>
Subject: Re: [PATCH v2 2/2] drm/amdgpu: Create an option to disable soft
 recovery

Am 31.08.23 um 00:08 schrieb André Almeida:
> Create a module option to disable soft recoveries on amdgpu, making
> every recovery go through the device reset path. This option makes it
> easier to force device resets for testing and debugging purposes.
>
> Signed-off-by: André Almeida <andrealmeid@...lia.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h      | 1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  | 6 ++++++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 6 +++++-
>   drivers/gpu/drm/amd/include/amd_shared.h | 1 +
>   4 files changed, 13 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 82eaccfce347..5f49e2c0ae7a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1105,6 +1105,7 @@ struct amdgpu_device {
>   	/* Debug */
>   	bool                            debug_vm;
>   	bool                            debug_largebar;
> +	bool                            debug_disable_soft_recovery;
>   };
>   
>   static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 0cd48c025433..59e9fe594b51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -927,6 +927,7 @@ MODULE_PARM_DESC(enforce_isolation, "enforce process isolation between graphics
>    * - 0x2: Enable simulating large-bar capability on non-large bar system. This
>    *   limits the VRAM size reported to ROCm applications to the visible
>    *   size, usually 256MB.
> + * - 0x4: Disable GPU soft recovery

"Disable GPU soft recovery, always do a full reset."

Apart from that Reviewed-by: Christian König <christian.koenig@....com>.

Regards,
Christian.

>    */
>   MODULE_PARM_DESC(debug_mask, "debug options for amdgpu, disabled by default");
>   module_param_named(debug_mask, amdgpu_debug_mask, uint, 0444);
> @@ -2046,6 +2047,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
>   		pr_info("debug: enabled simulating large-bar capability on non-large bar system\n");
>   		adev->debug_largebar = true;
>   	}
> +
> +	if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY) {
> +		pr_info("debug: soft reset for GPU recovery disabled\n");
> +		adev->debug_disable_soft_recovery = true;
> +	}
>   }
>   
>   static int amdgpu_pci_probe(struct pci_dev *pdev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> index 80d6e132e409..6a80d3ec887e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> @@ -434,8 +434,12 @@ bool amdgpu_ring_soft_recovery(struct amdgpu_ring *ring, unsigned int vmid,
>   			       struct dma_fence *fence)
>   {
>   	unsigned long flags;
> +	ktime_t deadline;
>   
> -	ktime_t deadline = ktime_add_us(ktime_get(), 10000);
> +	if (unlikely(ring->adev->debug_disable_soft_recovery))
> +		return false;
> +
> +	deadline = ktime_add_us(ktime_get(), 10000);
>   
>   	if (amdgpu_sriov_vf(ring->adev) || !ring->funcs->soft_recovery || !fence)
>   		return false;
> diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
> index 2fd6af2183cc..32ee982be99e 100644
> --- a/drivers/gpu/drm/amd/include/amd_shared.h
> +++ b/drivers/gpu/drm/amd/include/amd_shared.h
> @@ -263,6 +263,7 @@ enum amd_dpm_forced_level;
>   enum AMDGPU_DEBUG_MASK {
>   	AMDGPU_DEBUG_VM = BIT(0),
>   	AMDGPU_DEBUG_LARGEBAR = BIT(1),
> +	AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
>   };
>   
>   /**

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ