[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <CADnq5_MjDxqG9GzPShL0oucpCPx9J5HodMWRuaOAgs0s0CD0=A@mail.gmail.com>
Date: Wed, 26 Feb 2025 14:22:13 -0500
From: Alex Deucher <alexdeucher@...il.com>
To: André Almeida <andrealmeid@...lia.com>
Cc: Alex Deucher <alexander.deucher@....com>,
Christian König <christian.koenig@....com>,
amd-gfx@...ts.freedesktop.org, dri-devel@...ts.freedesktop.org,
linux-kernel@...r.kernel.org, kernel-dev@...lia.com, siqueira@...lia.com
Subject: Re: [PATCH] drm/amdgpu: Create a debug option to disable ring reset
Applied. Thanks!
On Wed, Feb 26, 2025 at 8:11 AM André Almeida <andrealmeid@...lia.com> wrote:
>
> Prior to the addition of ring reset, the debug option
> `debug_disable_soft_recovery` could be used to force a full device
> reset. Now that we have ring reset, create a debug option to disable
> it in amdgpu, forcing the driver to go with the full device
> reset path again when both options are combined.
>
> This option is useful for testing and debugging purposes when one wants
> to test the full reset from userspace.
>
> Signed-off-by: André Almeida <andrealmeid@...lia.com>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
> drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 5 +++--
> 3 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 69895fccb474..75dc4b962d64 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1186,6 +1186,7 @@ struct amdgpu_device {
> bool debug_use_vram_fw_buf;
> bool debug_enable_ras_aca;
> bool debug_exp_resets;
> + bool debug_disable_gpu_ring_reset;
>
> bool enforce_isolation[MAX_XCP];
> /* Added this mutex for cleaner shader isolation between GFX and compute processes */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> index 95a05b03f799..edeb12c816e8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> @@ -136,6 +136,7 @@ enum AMDGPU_DEBUG_MASK {
> AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
> AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
> AMDGPU_DEBUG_ENABLE_EXP_RESETS = BIT(5),
> + AMDGPU_DEBUG_DISABLE_GPU_RING_RESET = BIT(6),
> };
>
> unsigned int amdgpu_vram_limit = UINT_MAX;
> @@ -2221,6 +2222,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
> pr_info("debug: enable experimental reset features\n");
> adev->debug_exp_resets = true;
> }
> +
> + if (amdgpu_debug_mask & AMDGPU_DEBUG_DISABLE_GPU_RING_RESET) {
> + pr_info("debug: ring reset disabled\n");
> + adev->debug_disable_gpu_ring_reset = true;
> + }
> }
>
> static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> index 7b79b0f39ba1..8ab23182127e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
> @@ -137,8 +137,9 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
> dma_fence_set_error(&s_job->s_fence->finished, -ETIME);
>
> /* attempt a per ring reset */
> - if (amdgpu_gpu_recovery &&
> - ring->funcs->reset) {
> + if (unlikely(adev->debug_disable_gpu_ring_reset)) {
> + dev_err(adev->dev, "Ring reset disabled by debug mask\n");
> + } else if (amdgpu_gpu_recovery && ring->funcs->reset) {
> dev_err(adev->dev, "Starting %s ring reset\n", s_job->sched->name);
> /* stop the scheduler, but don't mess with the
> * bad job yet because if ring reset fails
> --
> 2.48.1
>
Powered by blists - more mailing lists