[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <254d31f8-0cbb-45d2-9686-2923e511811b@amd.com>
Date: Wed, 30 Apr 2025 14:50:44 +0200
From: "Sharma, Shashank" <shashank.sharma@....com>
To: Dan Carpenter <dan.carpenter@...aro.org>
Cc: "Deucher, Alexander" <Alexander.Deucher@....com>,
"Koenig, Christian" <Christian.Koenig@....com>,
David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>,
"Khatri, Sunil" <Sunil.Khatri@....com>, "Yadav, Arvind"
<Arvind.Yadav@....com>,
"Paneer Selvam, Arunpravin" <Arunpravin.PaneerSelvam@....com>,
"amd-gfx@...ts.freedesktop.org" <amd-gfx@...ts.freedesktop.org>,
"dri-devel@...ts.freedesktop.org" <dri-devel@...ts.freedesktop.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"kernel-janitors@...r.kernel.org" <kernel-janitors@...r.kernel.org>
Subject: Re: [PATCH] drm/amdgpu/userq: remove unnecessary NULL check
On 30/04/2025 11:49, Dan Carpenter wrote:
> On Wed, Apr 30, 2025 at 09:28:59AM +0000, Sharma, Shashank wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>> Hello Dan,
>>
>> ________________________________
>> From: Dan Carpenter
>> Sent: Wednesday, April 30, 2025 10:05 AM
>> To: Deucher, Alexander
>> Cc: Koenig, Christian; David Airlie; Simona Vetter; Sharma, Shashank; Khatri, Sunil; Yadav, Arvind; Paneer Selvam, Arunpravin; amd-gfx@...ts.freedesktop.org; dri-devel@...ts.freedesktop.org; linux-kernel@...r.kernel.org; kernel-janitors@...r.kernel.org
>> Subject: [PATCH] drm/amdgpu/userq: remove unnecessary NULL check
>>
>> The "ticket" pointer points to in the middle of the &exec struct so it
>> can't be NULL. Remove the check.
>>
>> Signed-off-by: Dan Carpenter <dan.carpenter@...aro.org>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 2 +-
>> 1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> index b0e8098a3988..7505d920fb3d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> @@ -631,7 +631,7 @@ amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
>> clear = false;
>> unlock = true;
>> /* The caller is already holding the reservation lock */
>> - } else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
>> + } else if (dma_resv_locking_ctx(resv) == ticket) {
>>
>> Its a Nack for me, There are a few situations (particularly during the
>> first launch of the desktop, and also when eviction fence and new queue
>> creation are working in parallel) where this ticket can be NULL, we
>> observed it during the stress validation and hence added this check,
>>
> It shouldn't be NULL. It sounds like you are experiencing stack
> corruption and this is just a bandaid.
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> 566 static int
> 567 amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
> 568 {
> 569 struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
> 570 struct amdgpu_vm *vm = &fpriv->vm;
> 571 struct amdgpu_device *adev = uq_mgr->adev;
> 572 struct amdgpu_bo_va *bo_va;
> 573 struct ww_acquire_ctx *ticket;
> 574 struct drm_exec exec;
> ^^^^^^^^^^^^^^^^^^^^^
> The "exec" struct is declared on the stack.
>
> 575 struct amdgpu_bo *bo;
> 576 struct dma_resv *resv;
> 577 bool clear, unlock;
> 578 int ret = 0;
> 579
> 580 drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
> 581 drm_exec_until_all_locked(&exec) {
> 582 ret = amdgpu_vm_lock_pd(vm, &exec, 2);
> 583 drm_exec_retry_on_contention(&exec);
> 584 if (unlikely(ret)) {
> 585 DRM_ERROR("Failed to lock PD\n");
> 586 goto unlock_all;
> 587 }
> 588
> 589 /* Lock the done list */
> 590 list_for_each_entry(bo_va, &vm->done, base.vm_status) {
> 591 bo = bo_va->base.bo;
> 592 if (!bo)
> 593 continue;
> 594
> 595 ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
> 596 drm_exec_retry_on_contention(&exec);
> 597 if (unlikely(ret))
> 598 goto unlock_all;
> 599 }
> 600 }
> 601
> 602 spin_lock(&vm->status_lock);
> 603 while (!list_empty(&vm->moved)) {
> 604 bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
> 605 base.vm_status);
> 606 spin_unlock(&vm->status_lock);
> 607
> 608 /* Per VM BOs never need to bo cleared in the page tables */
> 609 ret = amdgpu_vm_bo_update(adev, bo_va, false);
> 610 if (ret)
> 611 goto unlock_all;
> 612 spin_lock(&vm->status_lock);
> 613 }
> 614
> 615 ticket = &exec.ticket;
> ^^^^^^^^^^^^^^^^^^^^^
> ticket is only set here. We know that &exec is non-NULL because it's
> declared on the stack. ticket is 4 bytes into the middle of a non-NULL
> struct. It is impossible for ticket to be NULL here.
Yep, you are right. I just did a code review, and probably we added that
NULL check before we had the right locks in place, and there was a race
between eviction thread and the UQ create thread, causing corruption.
Please feel free to use Acked-by: Shashank Sharma <shashank.sharma@....com>
- Shashank
>
> 616 while (!list_empty(&vm->invalidated)) {
> 617 bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
> 618 base.vm_status);
> 619 resv = bo_va->base.bo->tbo.base.resv;
> 620 spin_unlock(&vm->status_lock);
> 621
> 622 bo = bo_va->base.bo;
> 623 ret = amdgpu_userq_validate_vm_bo(NULL, bo);
> 624 if (ret) {
> 625 DRM_ERROR("Failed to validate BO\n");
> 626 goto unlock_all;
> 627 }
> 628
> 629 /* Try to reserve the BO to avoid clearing its ptes */
> 630 if (!adev->debug_vm && dma_resv_trylock(resv)) {
> 631 clear = false;
> 632 unlock = true;
> 633 /* The caller is already holding the reservation lock */
> 634 } else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
>
> I've included the whole rest of the function so that we can see it is not
> set a second time.
>
> regards,
> dan carpenter
>
> 635 clear = false;
> 636 unlock = false;
> 637 /* Somebody else is using the BO right now */
> 638 } else {
> 639 clear = true;
> 640 unlock = false;
> 641 }
> 642
> 643 ret = amdgpu_vm_bo_update(adev, bo_va, clear);
> 644
> 645 if (unlock)
> 646 dma_resv_unlock(resv);
> 647 if (ret)
> 648 goto unlock_all;
> 649
> 650 spin_lock(&vm->status_lock);
> 651 }
> 652 spin_unlock(&vm->status_lock);
> 653
> 654 ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
> 655 if (ret)
> 656 DRM_ERROR("Failed to replace eviction fence\n");
> 657
> 658 unlock_all:
> 659 drm_exec_fini(&exec);
> 660 return ret;
> 661 }
>
>
Powered by blists - more mailing lists