lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240805192903.vjsjahbl4t5z5vzp@hu-akhilpo-hyd.qualcomm.com>
Date: Tue, 6 Aug 2024 00:59:03 +0530
From: Akhil P Oommen <quic_akhilpo@...cinc.com>
To: Vladimir Lypak <vladimir.lypak@...il.com>
CC: Rob Clark <robdclark@...il.com>, Sean Paul <sean@...rly.run>,
        "Konrad
 Dybcio" <konrad.dybcio@...aro.org>,
        Abhinav Kumar
	<quic_abhinavk@...cinc.com>,
        Dmitry Baryshkov <dmitry.baryshkov@...aro.org>,
        Marijn Suijten <marijn.suijten@...ainline.org>,
        David Airlie
	<airlied@...il.com>, "Daniel Vetter" <daniel@...ll.ch>,
        Jordan Crouse
	<jordan@...micpenguin.net>,
        <linux-arm-msm@...r.kernel.org>, <dri-devel@...ts.freedesktop.org>,
        <freedreno@...ts.freedesktop.org>, <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 3/4] drm/msm/a5xx: fix races in preemption evaluation
 stage

On Thu, Jul 11, 2024 at 10:00:20AM +0000, Vladimir Lypak wrote:
> On A5XX GPUs when preemption is used it's invietable to enter a soft
> lock-up state in which GPU is stuck at empty ring-buffer doing nothing.
> This appears as full UI lockup and not detected as GPU hang (because
> it's not). This happens due to not triggering preemption when it was
> needed. Sometimes this state can be recovered by some new submit but
> generally it won't happen because applications are waiting for old
> submits to retire.
> 
> One of the reasons why this happens is a race between a5xx_submit and
> a5xx_preempt_trigger called from IRQ during submit retire. Former thread
> updates ring->cur of previously empty and not current ring right after
> latter checks it for emptiness. Then both threads can just exit because
> for first one preempt_state wasn't NONE yet and for second one all rings
> appeared to be empty.
> 
> To prevent such situations from happening we need to establish guarantee
> for preempt_trigger to be called after each submit. To implement it this
> patch adds trigger call at the end of a5xx_preempt_irq to re-check if we
> should switch to non-empty or higher priority ring. Also we find next
> ring in new preemption state "EVALUATE". If the thread that updated some
> ring with new submit sees this state it should wait until it passes.
> 
> Fixes: b1fc2839d2f9 ("drm/msm: Implement preemption for A5XX targets")
> Signed-off-by: Vladimir Lypak <vladimir.lypak@...il.com>

I didn't go through the other thread with Connor completely, but can you
please check if the below chunk is enough instead of this patch?

diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
index f58dd564d122..d69b14ebbe44 100644
--- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
+++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
@@ -47,9 +47,8 @@ static inline void update_wptr(struct msm_gpu *gpu, struct msm_ringbuffer *ring)

        spin_lock_irqsave(&ring->preempt_lock, flags);
        wptr = get_wptr(ring);
-       spin_unlock_irqrestore(&ring->preempt_lock, flags);
-
        gpu_write(gpu, REG_A5XX_CP_RB_WPTR, wptr);
+       spin_unlock_irqrestore(&ring->preempt_lock, flags);
 }

 /* Return the highest priority ringbuffer with something in it */
@@ -188,6 +187,8 @@ void a5xx_preempt_irq(struct msm_gpu *gpu)
        update_wptr(gpu, a5xx_gpu->cur_ring);

        set_preempt_state(a5xx_gpu, PREEMPT_NONE);
+
+       a5xx_preempt_trigger(gpu);
 }

-Akhil

> ---
>  drivers/gpu/drm/msm/adreno/a5xx_gpu.c     |  6 +++---
>  drivers/gpu/drm/msm/adreno/a5xx_gpu.h     | 11 +++++++----
>  drivers/gpu/drm/msm/adreno/a5xx_preempt.c | 24 +++++++++++++++++++----
>  3 files changed, 30 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> index 6c80d3003966..266744ee1d5f 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
> @@ -110,7 +110,7 @@ static void a5xx_submit_in_rb(struct msm_gpu *gpu, struct msm_gem_submit *submit
>  	}
>  
>  	a5xx_flush(gpu, ring, true);
> -	a5xx_preempt_trigger(gpu);
> +	a5xx_preempt_trigger(gpu, true);
>  
>  	/* we might not necessarily have a cmd from userspace to
>  	 * trigger an event to know that submit has completed, so
> @@ -240,7 +240,7 @@ static void a5xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>  	a5xx_flush(gpu, ring, false);
>  
>  	/* Check to see if we need to start preemption */
> -	a5xx_preempt_trigger(gpu);
> +	a5xx_preempt_trigger(gpu, true);
>  }
>  
>  static const struct adreno_five_hwcg_regs {
> @@ -1296,7 +1296,7 @@ static irqreturn_t a5xx_irq(struct msm_gpu *gpu)
>  		a5xx_gpmu_err_irq(gpu);
>  
>  	if (status & A5XX_RBBM_INT_0_MASK_CP_CACHE_FLUSH_TS) {
> -		a5xx_preempt_trigger(gpu);
> +		a5xx_preempt_trigger(gpu, false);
>  		msm_gpu_retire(gpu);
>  	}
>  
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
> index c7187bcc5e90..1120824853d4 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.h
> @@ -57,10 +57,12 @@ void a5xx_debugfs_init(struct msm_gpu *gpu, struct drm_minor *minor);
>   * through the process.
>   *
>   * PREEMPT_NONE - no preemption in progress.  Next state START.
> - * PREEMPT_START - The trigger is evaulating if preemption is possible. Next
> - * states: TRIGGERED, NONE
> + * PREEMPT_EVALUATE - The trigger is evaulating if preemption is possible. Next
> + * states: START, ABORT
>   * PREEMPT_ABORT - An intermediate state before moving back to NONE. Next
>   * state: NONE.
> + * PREEMPT_START - The trigger is preparing for preemption. Next state:
> + * TRIGGERED
>   * PREEMPT_TRIGGERED: A preemption has been executed on the hardware. Next
>   * states: FAULTED, PENDING
>   * PREEMPT_FAULTED: A preemption timed out (never completed). This will trigger
> @@ -71,8 +73,9 @@ void a5xx_debugfs_init(struct msm_gpu *gpu, struct drm_minor *minor);
>  
>  enum preempt_state {
>  	PREEMPT_NONE = 0,
> -	PREEMPT_START,
> +	PREEMPT_EVALUATE,
>  	PREEMPT_ABORT,
> +	PREEMPT_START,
>  	PREEMPT_TRIGGERED,
>  	PREEMPT_FAULTED,
>  	PREEMPT_PENDING,
> @@ -156,7 +159,7 @@ void a5xx_set_hwcg(struct msm_gpu *gpu, bool state);
>  
>  void a5xx_preempt_init(struct msm_gpu *gpu);
>  void a5xx_preempt_hw_init(struct msm_gpu *gpu);
> -void a5xx_preempt_trigger(struct msm_gpu *gpu);
> +void a5xx_preempt_trigger(struct msm_gpu *gpu, bool new_submit);
>  void a5xx_preempt_irq(struct msm_gpu *gpu);
>  void a5xx_preempt_fini(struct msm_gpu *gpu);
>  
> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
> index 67a8ef4adf6b..f8d09a83c5ae 100644
> --- a/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
> +++ b/drivers/gpu/drm/msm/adreno/a5xx_preempt.c
> @@ -87,21 +87,33 @@ static void a5xx_preempt_timer(struct timer_list *t)
>  }
>  
>  /* Try to trigger a preemption switch */
> -void a5xx_preempt_trigger(struct msm_gpu *gpu)
> +void a5xx_preempt_trigger(struct msm_gpu *gpu, bool new_submit)
>  {
>  	struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>  	struct a5xx_gpu *a5xx_gpu = to_a5xx_gpu(adreno_gpu);
>  	unsigned long flags;
>  	struct msm_ringbuffer *ring;
> +	enum preempt_state state;
>  
>  	if (gpu->nr_rings == 1)
>  		return;
>  
>  	/*
> -	 * Try to start preemption by moving from NONE to START. If
> -	 * unsuccessful, a preemption is already in flight
> +	 * Try to start preemption by moving from NONE to EVALUATE. If current
> +	 * state is EVALUATE/ABORT we can't just quit because then we can't
> +	 * guarantee that preempt_trigger will be called after ring is updated
> +	 * by new submit.
>  	 */
> -	if (!try_preempt_state(a5xx_gpu, PREEMPT_NONE, PREEMPT_START))
> +	state = atomic_cmpxchg(&a5xx_gpu->preempt_state, PREEMPT_NONE,
> +			       PREEMPT_EVALUATE);
> +	while (new_submit && (state == PREEMPT_EVALUATE ||
> +			      state == PREEMPT_ABORT)) {
> +		cpu_relax();
> +		state = atomic_cmpxchg(&a5xx_gpu->preempt_state, PREEMPT_NONE,
> +				       PREEMPT_EVALUATE);
> +	}
> +
> +	if (state != PREEMPT_NONE)
>  		return;
>  
>  	/* Get the next ring to preempt to */
> @@ -130,6 +142,8 @@ void a5xx_preempt_trigger(struct msm_gpu *gpu)
>  		return;
>  	}
>  
> +	set_preempt_state(a5xx_gpu, PREEMPT_START);
> +
>  	/* Make sure the wptr doesn't update while we're in motion */
>  	spin_lock_irqsave(&ring->preempt_lock, flags);
>  	a5xx_gpu->preempt[ring->id]->wptr = get_wptr(ring);
> @@ -188,6 +202,8 @@ void a5xx_preempt_irq(struct msm_gpu *gpu)
>  	update_wptr(gpu, a5xx_gpu->cur_ring);
>  
>  	set_preempt_state(a5xx_gpu, PREEMPT_NONE);
> +
> +	a5xx_preempt_trigger(gpu, false);
>  }
>  
>  void a5xx_preempt_hw_init(struct msm_gpu *gpu)
> -- 
> 2.45.2
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ