linux-kernel - Re: [PATCH v2 5/6] drm/amdgpu: Log IBs and ring name at coredump

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <50fa1365-ab6a-58a1-e82f-ebaf1b623010@igalia.com>
Date:   Fri, 14 Jul 2023 09:23:55 -0300
From:   André Almeida <andrealmeid@...lia.com>
To:     Christian König <ckoenig.leichtzumerken@...il.com>
Cc:     pierre-eric.pelloux-prayer@....com,
        dri-devel@...ts.freedesktop.org,
        'Marek Olšák' <maraeo@...il.com>,
        amd-gfx@...ts.freedesktop.org,
        Timur Kristóf <timur.kristof@...il.com>,
        linux-kernel@...r.kernel.org, michel.daenzer@...lbox.org,
        Samuel Pitoiset <samuel.pitoiset@...il.com>,
        kernel-dev@...lia.com, Bas Nieuwenhuizen <bas@...nieuwenhuizen.nl>,
        alexander.deucher@....com, christian.koenig@....com
Subject: Re: [PATCH v2 5/6] drm/amdgpu: Log IBs and ring name at coredump



Em 14/07/2023 04:57, Christian König escreveu:
> Am 13.07.23 um 23:32 schrieb André Almeida:
>> Log the IB addresses used by the hung job along with the stuck ring
>> name. Note that due to nested IBs, the one that caused the reset itself
>> may be in not listed address.
>>
>> Signed-off-by: André Almeida <andrealmeid@...lia.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 +++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 31 +++++++++++++++++++++-
>>   2 files changed, 33 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index e1cc83a89d46..cfeaf93934fd 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1086,6 +1086,9 @@ struct amdgpu_coredump_info {
>>       struct amdgpu_task_info         reset_task_info;
>>       struct timespec64               reset_time;
>>       bool                            reset_vram_lost;
>> +    u64                *ibs;
>> +    u32                num_ibs;
>> +    char                ring_name[16];
>>   };
>>   #endif
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 07546781b8b8..431ccc3d7857 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -5008,12 +5008,24 @@ static ssize_t amdgpu_devcoredump_read(char 
>> *buffer, loff_t offset,
>>                      coredump->adev->reset_dump_reg_value[i]);
>>       }
>> +    if (coredump->num_ibs) {
>> +        drm_printf(&p, "IBs:\n");
>> +        for (i = 0; i < coredump->num_ibs; i++)
>> +            drm_printf(&p, "\t[%d] 0x%llx\n", i, coredump->ibs[i]);
>> +    }
>> +
>> +    if (coredump->ring_name[0] != '\0')
>> +        drm_printf(&p, "ring name: %s\n", coredump->ring_name);
>> +
>>       return count - iter.remain;
>>   }
>>   static void amdgpu_devcoredump_free(void *data)
>>   {
>> -    kfree(data);
>> +    struct amdgpu_coredump_info *coredump = data;
>> +
>> +    kfree(coredump->ibs);
>> +    kfree(coredump);
>>   }
>>   static void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
>> @@ -5021,6 +5033,8 @@ static void amdgpu_coredump(struct amdgpu_device 
>> *adev, bool vram_lost,
>>   {
>>       struct amdgpu_coredump_info *coredump;
>>       struct drm_device *dev = adev_to_drm(adev);
>> +    struct amdgpu_job *job = reset_context->job;
>> +    int i;
>>       coredump = kmalloc(sizeof(*coredump), GFP_NOWAIT);
>> @@ -5038,6 +5052,21 @@ static void amdgpu_coredump(struct 
>> amdgpu_device *adev, bool vram_lost,
>>       coredump->adev = adev;
>> +    if (job && job->num_ibs) {
> 
> I really really really don't want any dependency of the core dump 
> feature towards the job.
> 

Because of the lifetime of job?

Do you think implementing amdgpu_job_get()/put() would help here?

> What we could do is to record the first executed IB VAs in the hw fence, 
> but I'm not sure how useful this is in the first place.
> 

I see, any hint here of the timedout job would be helpful AFAIK.

> We have some internal feature in progress to query the VA of the draw 
> command which cause the waves currently executing in the SQ to be 
> retrieved.
> 
>> +        struct amdgpu_ring *ring = to_amdgpu_ring(job->base.sched);
>> +        u32 num_ibs = job->num_ibs;
>> +
>> +        coredump->ibs = kmalloc_array(num_ibs, sizeof(coredump->ibs), 
>> GFP_NOWAIT);
> 
> This can fail pretty easily.

Because of its size?

> 
> Christian.
> 
>> +        if (coredump->ibs)
>> +            coredump->num_ibs = num_ibs;
>> +
>> +        for (i = 0; i < coredump->num_ibs; i++)
>> +            coredump->ibs[i] = job->ibs[i].gpu_addr;
>> +
>> +        if (ring)
>> +            strncpy(coredump->ring_name, ring->name, 16);
>> +    }
>> +
>>       ktime_get_ts64(&coredump->reset_time);
>>       dev_coredumpm(dev->dev, THIS_MODULE, coredump, 0, GFP_NOWAIT,
>