lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <254d31f8-0cbb-45d2-9686-2923e511811b@amd.com>
Date: Wed, 30 Apr 2025 14:50:44 +0200
From: "Sharma, Shashank" <shashank.sharma@....com>
To: Dan Carpenter <dan.carpenter@...aro.org>
Cc: "Deucher, Alexander" <Alexander.Deucher@....com>,
 "Koenig, Christian" <Christian.Koenig@....com>,
 David Airlie <airlied@...il.com>, Simona Vetter <simona@...ll.ch>,
 "Khatri, Sunil" <Sunil.Khatri@....com>, "Yadav, Arvind"
 <Arvind.Yadav@....com>,
 "Paneer Selvam, Arunpravin" <Arunpravin.PaneerSelvam@....com>,
 "amd-gfx@...ts.freedesktop.org" <amd-gfx@...ts.freedesktop.org>,
 "dri-devel@...ts.freedesktop.org" <dri-devel@...ts.freedesktop.org>,
 "linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
 "kernel-janitors@...r.kernel.org" <kernel-janitors@...r.kernel.org>
Subject: Re: [PATCH] drm/amdgpu/userq: remove unnecessary NULL check


On 30/04/2025 11:49, Dan Carpenter wrote:
> On Wed, Apr 30, 2025 at 09:28:59AM +0000, Sharma, Shashank wrote:
>> [AMD Official Use Only - AMD Internal Distribution Only]
>>
>> Hello Dan,
>>
>> ________________________________
>> From: Dan Carpenter
>> Sent: Wednesday, April 30, 2025 10:05 AM
>> To: Deucher, Alexander
>> Cc: Koenig, Christian; David Airlie; Simona Vetter; Sharma, Shashank; Khatri, Sunil; Yadav, Arvind; Paneer Selvam, Arunpravin; amd-gfx@...ts.freedesktop.org; dri-devel@...ts.freedesktop.org; linux-kernel@...r.kernel.org; kernel-janitors@...r.kernel.org
>> Subject: [PATCH] drm/amdgpu/userq: remove unnecessary NULL check
>>
>> The "ticket" pointer points to in the middle of the &exec struct so it
>> can't be NULL.  Remove the check.
>>
>> Signed-off-by: Dan Carpenter <dan.carpenter@...aro.org>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 2 +-
>>   1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> index b0e8098a3988..7505d920fb3d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>> @@ -631,7 +631,7 @@ amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
>>                           clear = false;
>>                           unlock = true;
>>                   /* The caller is already holding the reservation lock */
>> -               } else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
>> +               } else if (dma_resv_locking_ctx(resv) == ticket) {
>>
>> Its a Nack for me, There are a few situations (particularly during the
>> first launch of the desktop, and also when eviction fence and new queue
>> creation are working in parallel) where this ticket can be NULL, we
>> observed it during the stress validation and hence added this check,
>>
> It shouldn't be NULL.  It sounds like you are experiencing stack
> corruption and this is just a bandaid.
>
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
>     566  static int
>     567  amdgpu_userq_validate_bos(struct amdgpu_userq_mgr *uq_mgr)
>     568  {
>     569          struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
>     570          struct amdgpu_vm *vm = &fpriv->vm;
>     571          struct amdgpu_device *adev = uq_mgr->adev;
>     572          struct amdgpu_bo_va *bo_va;
>     573          struct ww_acquire_ctx *ticket;
>     574          struct drm_exec exec;
>                  ^^^^^^^^^^^^^^^^^^^^^
> The "exec" struct is declared on the stack.
>
>     575          struct amdgpu_bo *bo;
>     576          struct dma_resv *resv;
>     577          bool clear, unlock;
>     578          int ret = 0;
>     579
>     580          drm_exec_init(&exec, DRM_EXEC_IGNORE_DUPLICATES, 0);
>     581          drm_exec_until_all_locked(&exec) {
>     582                  ret = amdgpu_vm_lock_pd(vm, &exec, 2);
>     583                  drm_exec_retry_on_contention(&exec);
>     584                  if (unlikely(ret)) {
>     585                          DRM_ERROR("Failed to lock PD\n");
>     586                          goto unlock_all;
>     587                  }
>     588
>     589                  /* Lock the done list */
>     590                  list_for_each_entry(bo_va, &vm->done, base.vm_status) {
>     591                          bo = bo_va->base.bo;
>     592                          if (!bo)
>     593                                  continue;
>     594
>     595                          ret = drm_exec_lock_obj(&exec, &bo->tbo.base);
>     596                          drm_exec_retry_on_contention(&exec);
>     597                          if (unlikely(ret))
>     598                                  goto unlock_all;
>     599                  }
>     600          }
>     601
>     602          spin_lock(&vm->status_lock);
>     603          while (!list_empty(&vm->moved)) {
>     604                  bo_va = list_first_entry(&vm->moved, struct amdgpu_bo_va,
>     605                                           base.vm_status);
>     606                  spin_unlock(&vm->status_lock);
>     607
>     608                  /* Per VM BOs never need to bo cleared in the page tables */
>     609                  ret = amdgpu_vm_bo_update(adev, bo_va, false);
>     610                  if (ret)
>     611                          goto unlock_all;
>     612                  spin_lock(&vm->status_lock);
>     613          }
>     614
>     615          ticket = &exec.ticket;
>                  ^^^^^^^^^^^^^^^^^^^^^
> ticket is only set here.  We know that &exec is non-NULL because it's
> declared on the stack.  ticket is 4 bytes into the middle of a non-NULL
> struct.  It is impossible for ticket to be NULL here.

Yep, you are right. I just did a code review, and probably we added that 
NULL check before we had the right locks in place, and there was a race 
between eviction thread and the UQ create thread, causing corruption. 
Please feel free to use Acked-by: Shashank Sharma <shashank.sharma@....com>

- Shashank

>
>     616          while (!list_empty(&vm->invalidated)) {
>     617                  bo_va = list_first_entry(&vm->invalidated, struct amdgpu_bo_va,
>     618                                           base.vm_status);
>     619                  resv = bo_va->base.bo->tbo.base.resv;
>     620                  spin_unlock(&vm->status_lock);
>     621
>     622                  bo = bo_va->base.bo;
>     623                  ret = amdgpu_userq_validate_vm_bo(NULL, bo);
>     624                  if (ret) {
>     625                          DRM_ERROR("Failed to validate BO\n");
>     626                          goto unlock_all;
>     627                  }
>     628
>     629                  /* Try to reserve the BO to avoid clearing its ptes */
>     630                  if (!adev->debug_vm && dma_resv_trylock(resv)) {
>     631                          clear = false;
>     632                          unlock = true;
>     633                  /* The caller is already holding the reservation lock */
>     634                  } else if (ticket && dma_resv_locking_ctx(resv) == ticket) {
>
> I've included the whole rest of the function so that we can see it is not
> set a second time.
>
> regards,
> dan carpenter
>
>     635                          clear = false;
>     636                          unlock = false;
>     637                  /* Somebody else is using the BO right now */
>     638                  } else {
>     639                          clear = true;
>     640                          unlock = false;
>     641                  }
>     642
>     643                  ret = amdgpu_vm_bo_update(adev, bo_va, clear);
>     644
>     645                  if (unlock)
>     646                          dma_resv_unlock(resv);
>     647                  if (ret)
>     648                          goto unlock_all;
>     649
>     650                  spin_lock(&vm->status_lock);
>     651          }
>     652          spin_unlock(&vm->status_lock);
>     653
>     654          ret = amdgpu_eviction_fence_replace_fence(&fpriv->evf_mgr, &exec);
>     655          if (ret)
>     656                  DRM_ERROR("Failed to replace eviction fence\n");
>     657
>     658  unlock_all:
>     659          drm_exec_fini(&exec);
>     660          return ret;
>     661  }
>
>

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ