Message-ID: <d3490029-f532-48c2-a44a-9a7049ecea4d@oss.qualcomm.com>
Date: Sat, 27 Sep 2025 02:42:25 +0530
From: Akhil P Oommen <akhilpo@....qualcomm.com>
To: Anna Maniscalco <anna.maniscalco2000@...il.com>
Cc: linux-arm-msm@...r.kernel.org, dri-devel@...ts.freedesktop.org,
freedreno@...ts.freedesktop.org, linux-kernel@...r.kernel.org,
Rob Clark <robin.clark@....qualcomm.com>, Sean Paul <sean@...rly.run>,
Konrad Dybcio <konradybcio@...nel.org>,
Dmitry Baryshkov <lumag@...nel.org>,
Abhinav Kumar <abhinav.kumar@...ux.dev>,
Jessica Zhang <jessica.zhang@....qualcomm.com>,
Marijn Suijten <marijn.suijten@...ainline.org>,
David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>
Subject: Re: [PATCH 2/2] drm/msm: preemption aware hangcheck
On 9/18/2025 8:15 PM, Anna Maniscalco wrote:
> On 9/17/25 5:23 PM, Akhil P Oommen wrote:
>> On 9/11/2025 10:31 PM, Anna Maniscalco wrote:
>>> Rework hangcheck code to work well together with preemption.
>>>
>>> Track the time a job has spent in a ring by storing timestamps of the
>>> `CP_ALWAYS_ON_CONTEXT` register at the beginning of a job and when
>>> switching rings as well as reading it back if the ring is currently
>>> active.
>>>
>>> Signed-off-by: Anna Maniscalco <anna.maniscalco2000@...il.com>
>>> ---
>>> drivers/gpu/drm/msm/adreno/a5xx_gpu.c | 3 +-
>>> drivers/gpu/drm/msm/adreno/a6xx_gmu.c | 3 +-
>>> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 28 +++++++++++++++--
>>> drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 1 +
>>> drivers/gpu/drm/msm/adreno/a6xx_preempt.c | 25 +++++++++++----
>>> drivers/gpu/drm/msm/adreno/adreno_gpu.c | 3 +-
>>> drivers/gpu/drm/msm/msm_gpu.c | 51 +++++++++++++++++++++++++------
>>> drivers/gpu/drm/msm/msm_gpu.h | 3 ++
>>> drivers/gpu/drm/msm/msm_ringbuffer.h | 6 ++++
>>> 9 files changed, 102 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
>>> index 4a04dc43a8e6764a113d0ade3dee94bd4c0083af..cb4775a35da0706e571eb27ce617044de84ca118 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
>>> +++ b/drivers/gpu/drm/msm/adreno/a5xx_gpu.c
>>> @@ -1255,7 +1255,8 @@ static void a5xx_fault_detect_irq(struct msm_gpu *gpu)
>>> gpu_read(gpu, REG_A5XX_CP_IB2_BUFSZ));
>>> /* Turn off the hangcheck timer to keep it from bothering us */
>>> - timer_delete(&gpu->hangcheck_timer);
>>> + for (int i = 0; i < gpu->nr_rings; i++)
>>> + timer_delete(&gpu->rb[i]->hangcheck_timer);
>>> kthread_queue_work(gpu->worker, &gpu->recover_work);
>>> }
>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>>> index fc62fef2fed87f065cb8fa4e997abefe4ff11cd5..103c19fa8669f06a6c1627ced1daf2bcd60415db 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gmu.c
>>> @@ -28,7 +28,8 @@ static void a6xx_gmu_fault(struct a6xx_gmu *gmu)
>>> gmu->hung = true;
>>> /* Turn off the hangcheck timer while we are resetting */
>>> - timer_delete(&gpu->hangcheck_timer);
>>> + for (int i = 0; i < gpu->nr_rings; i++)
>>> + timer_delete(&gpu->rb[i]->hangcheck_timer);
>>> /* Queue the GPU handler because we need to treat this as a recovery */
>>> kthread_queue_work(gpu->worker, &gpu->recover_work);
>> I think a helper routine makes sense. We have to disable hangcheck
>> whenever recover_worker is queued.
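>>
>> Something like this, perhaps (untested sketch; the helper name is just
>> a suggestion):
>>
>> /* Disable every per-ring hangcheck timer before queueing recovery */
>> static void msm_gpu_hangcheck_disable(struct msm_gpu *gpu)
>> {
>> 	int i;
>>
>> 	for (i = 0; i < gpu->nr_rings; i++)
>> 		timer_delete(&gpu->rb[i]->hangcheck_timer);
>> }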
>>
>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>> index b8f8ae940b55f5578abdbdec6bf1e90a53e721a5..7647e3dfd50db7446589e67949ed08d0a422f543 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>> @@ -465,6 +465,9 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>>> get_stats_counter(ring, REG_A6XX_CP_ALWAYS_ON_COUNTER,
>>> rbmemptr_stats(ring, index, alwayson_start));
>>> + get_stats_counter(ring, REG_A6XX_CP_ALWAYS_ON_CONTEXT,
>>> + rbmemptr(ring, last_job_start_ctx));
>>> +
>>> OUT_PKT7(ring, CP_THREAD_CONTROL, 1);
>>> OUT_RING(ring, CP_SET_THREAD_BOTH);
>>> @@ -1816,7 +1819,8 @@ static void a6xx_fault_detect_irq(struct msm_gpu *gpu)
>>> gpu_read(gpu, REG_A6XX_CP_IB2_REM_SIZE));
>>> /* Turn off the hangcheck timer to keep it from bothering us */
>>> - timer_delete(&gpu->hangcheck_timer);
>>> + for (int i = 0; i < gpu->nr_rings; i++)
>>> + timer_delete(&gpu->rb[i]->hangcheck_timer);
>>> /* Turn off interrupts to avoid triggering recovery again */
>>> gpu_write(gpu, REG_A6XX_RBBM_INT_0_MASK, 0);
>>> @@ -1839,7 +1843,8 @@ static void a7xx_sw_fuse_violation_irq(struct msm_gpu *gpu)
>>> */
>>> if (status & (A7XX_CX_MISC_SW_FUSE_VALUE_RAYTRACING |
>>> A7XX_CX_MISC_SW_FUSE_VALUE_LPAC)) {
>>> - timer_delete(&gpu->hangcheck_timer);
>>> + for (int i = 0; i < gpu->nr_rings; i++)
>>> + timer_delete(&gpu->rb[i]->hangcheck_timer);
>>> kthread_queue_work(gpu->worker, &gpu->recover_work);
>>> }
>>> @@ -2327,6 +2332,22 @@ static int a6xx_get_timestamp(struct msm_gpu *gpu, uint64_t *value)
>>> return 0;
>>> }
>>> +static int a6xx_get_ctx_timestamp(struct msm_ringbuffer *ring, uint64_t *value)
>>> +{
>>> + struct msm_gpu *gpu = ring->gpu;
>>> + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>>> + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>>> +
>>> + guard(spinlock_irqsave)(&a6xx_gpu->eval_lock);
>> Is eval_lock initialized anywhere? Also why do we need this?
>
> Yeah eval_lock was introduced in the preemption series. It is
> initialized in `a6xx_preempt_hw_init`.
>
>>
>>> +
>>> + if (a6xx_in_preempt(a6xx_gpu) || ring != a6xx_gpu->cur_ring)
>> This will race with preemption. I think we should wrap the preempt state
>> check and the regread under the preempt lock.
>
> Continuing. The idea is that if in_preempt returns true then it doesn't
> matter that reading cur_ring might race because we exit early.
>
> On the other hand, if it returns false, since we are holding `eval_lock`
> and the only place where we can go from the PREEMPT_NONE state to any
> other state is also guarded by that lock, then we are guaranteed that no
> preemption will occur so long as we are within this function.
>
>>
>>> + return -EBUSY;
>>> +
>>> + *value = gpu_read64(gpu, REG_A6XX_CP_ALWAYS_ON_CONTEXT);
>> With IFPC, we cannot access a GX domain register (CP, RBBM etc) unless
>> we are certain that the GX is powered up. Could you please test this
>> series along with the IFPC series? If we hit the right timing, there
>> should be a GMU fence error in the dmesg. Not sure how easy it is to hit
>> that timing, but I believe there is a problem here conceptually.
>
> Right. I'll fix this although we don't have a fenced_read helper and I
> wonder if reading behaves like writing when it comes to waking up the gpu.
>
> So would it work to try the read and poll REG_A6XX_GMU_AHB_FENCE_STATUS
> like we do when writing?
Unfortunately, only fenced writes are supported as per the hw spec, and
that too only for a specific region of registers.
>
>>> +
>>> + return 0;
>>> +}
>>> +
>>> static struct msm_ringbuffer *a6xx_active_ring(struct msm_gpu *gpu)
>>> {
>>> struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>>> @@ -2555,6 +2576,7 @@ static const struct adreno_gpu_funcs funcs = {
>>> .get_rptr = a6xx_get_rptr,
>>> .progress = a6xx_progress,
>>> .sysprof_setup = a6xx_gmu_sysprof_setup,
>>> + .get_ctx_timestamp = a6xx_get_ctx_timestamp,
>>> },
>>> .get_timestamp = a6xx_gmu_get_timestamp,
>>> };
>>> @@ -2584,6 +2606,7 @@ static const struct adreno_gpu_funcs funcs_gmuwrapper = {
>>> .create_private_vm = a6xx_create_private_vm,
>>> .get_rptr = a6xx_get_rptr,
>>> .progress = a6xx_progress,
>>> + .get_ctx_timestamp = a6xx_get_ctx_timestamp,
>>> },
>>> .get_timestamp = a6xx_get_timestamp,
>>> };
>>> @@ -2616,6 +2639,7 @@ static const struct adreno_gpu_funcs funcs_a7xx = {
>>> .get_rptr = a6xx_get_rptr,
>>> .progress = a6xx_progress,
>>> .sysprof_setup = a6xx_gmu_sysprof_setup,
>>> + .get_ctx_timestamp = a6xx_get_ctx_timestamp,
>>> },
>>> .get_timestamp = a6xx_gmu_get_timestamp,
>>> };
>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>> index 0b17d36c36a9567e6afa4269ae7783ed3578e40e..7248d3d38c6d8a06cb4a536043bf4877179447cc 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>>> @@ -80,6 +80,7 @@ struct a6xx_gpu {
>>> struct drm_gem_object *preempt_postamble_bo;
>>> void *preempt_postamble_ptr;
>>> uint64_t preempt_postamble_iova;
>>> + uint64_t preempt_postamble_cntreset_end;
>>> uint64_t preempt_postamble_len;
>>> bool postamble_enabled;
>>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>> index afc5f4aa3b17334027f3c20072cc3f059a9733b7..88a65549fa8038d4836eb5aeaea775d679415315 100644
>>> --- a/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_preempt.c
>>> @@ -99,11 +99,15 @@ static void a6xx_preempt_timer(struct timer_list *t)
>>> kthread_queue_work(gpu->worker, &gpu->recover_work);
>>> }
>>> -static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
>>> +static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu, struct msm_ringbuffer *ring)
>>> {
>>> u32 *postamble = a6xx_gpu->preempt_postamble_ptr;
>>> + uint64_t last_active_ctxcycles;
>>> u32 count = 0;
>>> + if (ring)
>>> + last_active_ctxcycles = rbmemptr(ring, last_active_ctxcycles);
>>> +
>>> postamble[count++] = PKT7(CP_REG_RMW, 3);
>>> postamble[count++] = REG_A6XX_RBBM_PERFCTR_SRAM_INIT_CMD;
>>> postamble[count++] = 0;
>>> @@ -118,6 +122,15 @@ static void preempt_prepare_postamble(struct a6xx_gpu *a6xx_gpu)
>>> postamble[count++] = CP_WAIT_REG_MEM_4_MASK(0x1);
>>> postamble[count++] = CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(0);
>>> + a6xx_gpu->preempt_postamble_cntreset_end = count;
>>> +
>>> + postamble[count++] = PKT7(ring ? CP_REG_TO_MEM : CP_NOP, 3);
>>> + postamble[count++] = CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_CONTEXT) |
>>> + CP_REG_TO_MEM_0_CNT(2) |
>>> + CP_REG_TO_MEM_0_64B;
>>> + postamble[count++] = lower_32_bits(last_active_ctxcycles);
>>> + postamble[count++] = upper_32_bits(last_active_ctxcycles);
>>> +
>>> a6xx_gpu->preempt_postamble_len = count;
>>> a6xx_gpu->postamble_enabled = true;
>>> @@ -129,9 +142,9 @@ static void preempt_disable_postamble(struct a6xx_gpu *a6xx_gpu)
>>> /*
>>> * Disable the postamble by replacing the first packet header with a NOP
>>> - * that covers the whole buffer.
>>> + * that skips the counters reset part.
>>> */
>>> - *postamble = PKT7(CP_NOP, (a6xx_gpu->preempt_postamble_len - 1));
>>> + *postamble = PKT7(CP_NOP, (a6xx_gpu->preempt_postamble_cntreset_end - 1));
>>> a6xx_gpu->postamble_enabled = false;
>>> }
>>> @@ -338,8 +351,8 @@ void a6xx_preempt_trigger(struct msm_gpu *gpu)
>>> /* Enable or disable postamble as needed */
>>> sysprof = refcount_read(&a6xx_gpu->base.base.sysprof_active) > 1;
>>> - if (!sysprof && !a6xx_gpu->postamble_enabled)
>>> - preempt_prepare_postamble(a6xx_gpu);
>>> + if (!sysprof)
>>> + preempt_prepare_postamble(a6xx_gpu, ring);
>>> if (sysprof && a6xx_gpu->postamble_enabled)
>>> preempt_disable_postamble(a6xx_gpu);
>>> @@ -454,7 +467,7 @@ void a6xx_preempt_init(struct msm_gpu *gpu)
>>> gpu->vm, &a6xx_gpu->preempt_postamble_bo,
>>> &a6xx_gpu->preempt_postamble_iova);
>>> - preempt_prepare_postamble(a6xx_gpu);
>>> + preempt_prepare_postamble(a6xx_gpu, NULL);
>>> if (IS_ERR(a6xx_gpu->preempt_postamble_ptr))
>>> goto fail;
>>> diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>>> index afaa3cfefd357dc0230994c8b5830a14c6d7a352..58f1e2a95bbfb00feb5a3bb91853e6bb533ec631 100644
>>> --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>>> +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>>> @@ -334,7 +334,8 @@ int adreno_fault_handler(struct msm_gpu *gpu, unsigned long iova, int flags,
>>> struct msm_gpu_fault_info fault_info = {};
>>> /* Turn off the hangcheck timer to keep it from bothering us */
>>> - timer_delete(&gpu->hangcheck_timer);
>>> + for (int i = 0; i < gpu->nr_rings; i++)
>>> + timer_delete(&gpu->rb[i]->hangcheck_timer);
>>> fault_info.ttbr0 = info->ttbr0;
>>> fault_info.iova = iova;
>>> diff --git a/drivers/gpu/drm/msm/msm_gpu.c b/drivers/gpu/drm/msm/msm_gpu.c
>>> index 17759abc46d7d7af4117b1d71f1d5fba6ba0b61c..a3c5073aca1f65e450e0673262e8ca4bc7a5be6f 100644
>>> --- a/drivers/gpu/drm/msm/msm_gpu.c
>>> +++ b/drivers/gpu/drm/msm/msm_gpu.c
>>> @@ -463,7 +463,9 @@ static void recover_worker(struct kthread_work *work)
>>> struct drm_device *dev = gpu->dev;
>>> struct msm_drm_private *priv = dev->dev_private;
>>> struct msm_gem_submit *submit;
>>> - struct msm_ringbuffer *cur_ring = gpu->funcs->active_ring(gpu);
>>> + struct msm_ringbuffer *cur_ring = gpu->hung_ring ? gpu->hung_ring : gpu->funcs->active_ring(gpu);
>>> + gpu->hung_ring = NULL;
>>> char *comm = NULL, *cmd = NULL;
>>> struct task_struct *task;
>>> int i;
>>> @@ -613,11 +615,17 @@ void msm_gpu_fault_crashstate_capture(struct msm_gpu *gpu, struct msm_gpu_fault_
>>> mutex_unlock(&gpu->lock);
>>> }
>>> -static void hangcheck_timer_reset(struct msm_gpu *gpu)
>>> +static void hangcheck_ring_timer_reset(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
>>> {
>>> struct msm_drm_private *priv = gpu->dev->dev_private;
>>> - mod_timer(&gpu->hangcheck_timer,
>>> - round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
>>> + mod_timer(&ring->hangcheck_timer,
>>> + round_jiffies_up(jiffies + msecs_to_jiffies(priv->hangcheck_period)));
>>> +}
>>> +
>>> +static void hangcheck_timer_reset(struct msm_gpu *gpu)
>>> +{
>>> + for (int i = 0; i < gpu->nr_rings; i++)
>>> + hangcheck_ring_timer_reset(gpu, gpu->rb[i]);
>> It triggers my OCD a bit that there are multiple timers flying around
>> waking up CPU clusters. But this is okay for now I guess. :)
>>
>>> }
>>> static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
>>> @@ -635,11 +643,33 @@ static bool made_progress(struct msm_gpu *gpu, struct msm_ringbuffer *ring)
>>> return true;
>>> }
>>> +static bool check_ring_timeout(struct msm_ringbuffer *ring, unsigned long timeout)
>>> +{
>>> + struct msm_gpu *gpu = ring->gpu;
>>> + struct msm_ringbuffer *curr_ring = gpu->funcs->active_ring(gpu);
>>> + u64 start, end;
>>> + int ret;
>>> +
>>> + if (!gpu->funcs->get_ctx_timestamp)
>>> + return !made_progress(gpu, ring);
>>> +
>>> + start = ring->memptrs->last_job_start_ctx;
>>> +
>>> + if (!gpu->funcs->get_ctx_timestamp(ring, &end))
>> I suppose you want it the other way around, i.e. if get_ctx_timestamp() returns
>> -EBUSY, which means gpu is either under preemption or in a different
>> ring, use the memptr data.
> You are right, that `!` does not belong there!
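> So the check becomes something like (untested):
>
> 	if (gpu->funcs->get_ctx_timestamp(ring, &end))
> 		end = ring->memptrs->last_active_ctxcycles;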
>>
>>> + end = ring->memptrs->last_active_ctxcycles;
>> Assuming my above comment is correct, if GPU is under preemption, there
>> is a chance that the postamble might not have got executed. So with a
>> stale 'end' value, the below calc may go wrong?
>>
>>> +
>>> + if (end >= start)
>>> + return (end - start) < timeout;
>>> + else
>>> + return false;
>> In case of an infinite shader blocking preemption, wouldn't we always
>> return false here?
> Right. Once we fix the above condition this shouldn't be a problem
> because the hangcheck timer will fire on the ring that is stuck, so `end`
> will be read from the register and it should detect the hang.
You mean the timer on the outgoing ring? But we won't read the register
as the preemption is active, right? We read the memptr which has a stale
'end' value. Since 'end' < 'start', this code will continuously report
"no timeout" to the caller.
>>
>> -Akhil
>>> +}
>>> +
>>> static void hangcheck_handler(struct timer_list *t)
>>> {
>>> - struct msm_gpu *gpu = timer_container_of(gpu, t, hangcheck_timer);
>>> + struct msm_ringbuffer *ring = timer_container_of(ring, t, hangcheck_timer);
>>> + struct msm_gpu *gpu = ring->gpu;
>>> + struct msm_drm_private *priv = gpu->dev->dev_private;
>>> struct drm_device *dev = gpu->dev;
>>> - struct msm_ringbuffer *ring = gpu->funcs->active_ring(gpu);
>>> uint32_t fence = ring->memptrs->fence;
>>> if (fence != ring->hangcheck_fence) {
>>> @@ -647,7 +677,7 @@ static void hangcheck_handler(struct timer_list *t)
>>> ring->hangcheck_fence = fence;
>>> ring->hangcheck_progress_retries = 0;
>>> } else if (fence_before(fence, ring->fctx->last_fence) &&
>>> - !made_progress(gpu, ring)) {
>>> + check_ring_timeout(ring, priv->hangcheck_period * 192000)) {
>>> /* no progress and not done.. hung! */
>>> ring->hangcheck_fence = fence;
>>> ring->hangcheck_progress_retries = 0;
>>> @@ -658,6 +688,7 @@ static void hangcheck_handler(struct timer_list *t)
>>> DRM_DEV_ERROR(dev->dev, "%s: submitted fence: %u\n",
>>> gpu->name, ring->fctx->last_fence);
>>> + gpu->hung_ring = ring;
>>> kthread_queue_work(gpu->worker, &gpu->recover_work);
>>> }
>>> @@ -911,7 +942,7 @@ void msm_gpu_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit)
>>> submit->ring->cur_ctx_seqno = submit->queue->ctx->seqno;
>>> pm_runtime_put(&gpu->pdev->dev);
>>> - hangcheck_timer_reset(gpu);
>>> + hangcheck_ring_timer_reset(gpu, submit->ring);
>> Should we reset hangcheck whenever there is a submission to a random
>> ring?
> The idea is that we only reset the timer for that ring. If that ring is
> stuck presumably whatever applications are running on it will stop
> submitting eventually since fences won't be signaled and so the timer
> should run.
>
> That isn't really guaranteed but previously that was assumed.
>
> Do you think this isn't a good assumption to make?
I missed that this was per-ring. If there are continuous submissions at
the right interval to the same ring, it can delay the hang check
detection. I agree that this is an edge case.
It makes sense to reset the timer during the first submit to a ring.
After that it can reset itself from the timer handler until all submits
are retired. Would that require a lot of plumbing?
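Roughly something like this (untested sketch on top of this patch):

	/*
	 * In msm_gpu_submit(): arm the timer only for the first submit to
	 * an otherwise idle ring. hangcheck_handler() already rearms itself
	 * while the ring still has unretired submits.
	 */
	if (!timer_pending(&submit->ring->hangcheck_timer))
		hangcheck_ring_timer_reset(gpu, submit->ring);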
Anyway, since we are reworking the hangcheck, probably we can revisit
every aspect of it in this series. Up to you.
-Akhil
>
>>
>> -Akhil
>>
>>> }
>>> /*
>>> @@ -1011,8 +1042,6 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>>> if (funcs->progress)
>>> priv->hangcheck_period /= 2;
>>> - timer_setup(&gpu->hangcheck_timer, hangcheck_handler, 0);
>>> -
>>> spin_lock_init(&gpu->perf_lock);
>>> @@ -1097,6 +1126,8 @@ int msm_gpu_init(struct drm_device *drm, struct platform_device *pdev,
>>> goto fail;
>>> }
>>> + timer_setup(&gpu->rb[i]->hangcheck_timer, hangcheck_handler, 0);
>>> +
>>> memptrs += sizeof(struct msm_rbmemptrs);
>>> memptrs_iova += sizeof(struct msm_rbmemptrs);
>>> }
>>> diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
>>> index a597f2bee30b6370ecc3639bfe1072c85993e789..7bf1b7f4bc4b61338bfa4c1463eb549f8c22d5c3 100644
>>> --- a/drivers/gpu/drm/msm/msm_gpu.h
>>> +++ b/drivers/gpu/drm/msm/msm_gpu.h
>>> @@ -93,6 +93,7 @@ struct msm_gpu_funcs {
>>> */
>>> bool (*progress)(struct msm_gpu *gpu, struct msm_ringbuffer *ring);
>>> void (*sysprof_setup)(struct msm_gpu *gpu);
>>> + int (*get_ctx_timestamp)(struct msm_ringbuffer *ring, uint64_t *value);
>>> };
>>> /* Additional state for iommu faults: */
>>> @@ -257,6 +258,8 @@ struct msm_gpu {
>>> /* work for handling GPU recovery: */
>>> struct kthread_work recover_work;
>>> + struct msm_ringbuffer *hung_ring;
>>> +
>>> /** retire_event: notified when submits are retired: */
>>> wait_queue_head_t retire_event;
>>> diff --git a/drivers/gpu/drm/msm/msm_ringbuffer.h b/drivers/gpu/drm/msm/msm_ringbuffer.h
>>> index d1e49f701c8176e50d2b9a5cca35acee67f75209..316247fb089f26bd657ccf8464a5039e1cd1ac45 100644
>>> --- a/drivers/gpu/drm/msm/msm_ringbuffer.h
>>> +++ b/drivers/gpu/drm/msm/msm_ringbuffer.h
>>> @@ -37,6 +37,8 @@ struct msm_rbmemptrs {
>>> volatile struct msm_gpu_submit_stats stats[MSM_GPU_SUBMIT_STATS_COUNT];
>>> volatile u64 ttbr0;
>>> volatile u32 context_idr;
>>> + volatile u64 last_job_start_ctx;
>>> + volatile u64 last_active_ctxcycles;
>>> };
>>> struct msm_cp_state {
>>> @@ -73,6 +75,10 @@ struct msm_ringbuffer {
>>> uint64_t memptrs_iova;
>>> struct msm_fence_context *fctx;
>>> + /* Hang and Inactivity Detection:
>>> + */
>>> + struct timer_list hangcheck_timer;
>>> +
>>> /**
>>> * hangcheck_progress_retries:
>>> *
>>>
>
> Best regards,