lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-ID: <20220217090440.4468-1-qiang.yu@amd.com>
Date:   Thu, 17 Feb 2022 17:04:40 +0800
From:   Qiang Yu <qiang.yu@....com>
To:     Alex Deucher <alexander.deucher@....com>,
        Christian König <christian.koenig@....com>,
        "Pan, Xinhui" <Xinhui.Pan@....com>,
        David Airlie <airlied@...ux.ie>,
        Daniel Vetter <daniel@...ll.ch>,
        Sumit Semwal <sumit.semwal@...aro.org>
CC:     Qiang Yu <qiang.yu@....com>, <amd-gfx@...ts.freedesktop.org>,
        <dri-devel@...ts.freedesktop.org>, <linux-kernel@...r.kernel.org>,
        <linux-media@...r.kernel.org>, <linaro-mm-sig@...ts.linaro.org>
Subject: [PATCH] drm/amdgpu: check vm bo eviction valuable at last

Workstation application ANSA/META get this error dmesg:
[drm:amdgpu_gem_va_ioctl [amdgpu]] *ERROR* Couldn't update BO_VA (-16)

This is caused by:
1. create a 256MB buffer in invisible VRAM
2. CPU map the buffer and access it causes vm_fault and try to move
   it to visible VRAM
3. force visible VRAM space and traverse all VRAM bos to check if
   evicting this bo is valuable
4. when checking a VM bo (in invisible VRAM), amdgpu_vm_evictable()
   will set amdgpu_vm->evicting, but latter due to not in visible
   VRAM, won't really evict it so not add it to amdgpu_vm->evicted
5. before next CS to clear the amdgpu_vm->evicting, user VM ops
   ioctl will pass amdgpu_vm_ready() (check amdgpu_vm->evicted)
   but fail in amdgpu_vm_bo_update_mapping() (check
   amdgpu_vm->evicting) and get this error log

This error won't affect functionality as next CS will finish the
waiting VM ops. But we'd better make the amdgpu_vm->evicting
correctly reflact the vm status and clear the error log.

Signed-off-by: Qiang Yu <qiang.yu@....com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 85 ++++++++++++++-----------
 1 file changed, 47 insertions(+), 38 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 5a32ee66d8c8..88a27911054f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1306,45 +1306,11 @@ uint64_t amdgpu_ttm_tt_pte_flags(struct amdgpu_device *adev, struct ttm_tt *ttm,
 	return flags;
 }
 
-/*
- * amdgpu_ttm_bo_eviction_valuable - Check to see if we can evict a buffer
- * object.
- *
- * Return true if eviction is sensible. Called by ttm_mem_evict_first() on
- * behalf of ttm_bo_mem_force_space() which tries to evict buffer objects until
- * it can find space for a new object and by ttm_bo_force_list_clean() which is
- * used to clean out a memory space.
- */
-static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
-					    const struct ttm_place *place)
+static bool amdgpu_ttm_mem_eviction_valuable(struct ttm_buffer_object *bo,
+					     const struct ttm_place *place)
 {
 	unsigned long num_pages = bo->resource->num_pages;
 	struct amdgpu_res_cursor cursor;
-	struct dma_resv_list *flist;
-	struct dma_fence *f;
-	int i;
-
-	/* Swapout? */
-	if (bo->resource->mem_type == TTM_PL_SYSTEM)
-		return true;
-
-	if (bo->type == ttm_bo_type_kernel &&
-	    !amdgpu_vm_evictable(ttm_to_amdgpu_bo(bo)))
-		return false;
-
-	/* If bo is a KFD BO, check if the bo belongs to the current process.
-	 * If true, then return false as any KFD process needs all its BOs to
-	 * be resident to run successfully
-	 */
-	flist = dma_resv_shared_list(bo->base.resv);
-	if (flist) {
-		for (i = 0; i < flist->shared_count; ++i) {
-			f = rcu_dereference_protected(flist->shared[i],
-				dma_resv_held(bo->base.resv));
-			if (amdkfd_fence_check_mm(f, current->mm))
-				return false;
-		}
-	}
 
 	switch (bo->resource->mem_type) {
 	case AMDGPU_PL_PREEMPT:
@@ -1377,10 +1343,53 @@ static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
 		return false;
 
 	default:
-		break;
+		return ttm_bo_eviction_valuable(bo, place);
 	}
+}
 
-	return ttm_bo_eviction_valuable(bo, place);
+/*
+ * amdgpu_ttm_bo_eviction_valuable - Check to see if we can evict a buffer
+ * object.
+ *
+ * Return true if eviction is sensible. Called by ttm_mem_evict_first() on
+ * behalf of ttm_bo_mem_force_space() which tries to evict buffer objects until
+ * it can find space for a new object and by ttm_bo_force_list_clean() which is
+ * used to clean out a memory space.
+ */
+static bool amdgpu_ttm_bo_eviction_valuable(struct ttm_buffer_object *bo,
+					    const struct ttm_place *place)
+{
+	struct dma_resv_list *flist;
+	struct dma_fence *f;
+	int i;
+
+	/* Swapout? */
+	if (bo->resource->mem_type == TTM_PL_SYSTEM)
+		return true;
+
+	/* If bo is a KFD BO, check if the bo belongs to the current process.
+	 * If true, then return false as any KFD process needs all its BOs to
+	 * be resident to run successfully
+	 */
+	flist = dma_resv_shared_list(bo->base.resv);
+	if (flist) {
+		for (i = 0; i < flist->shared_count; ++i) {
+			f = rcu_dereference_protected(flist->shared[i],
+				dma_resv_held(bo->base.resv));
+			if (amdkfd_fence_check_mm(f, current->mm))
+				return false;
+		}
+	}
+
+	/* Check by different mem type. */
+	if (!amdgpu_ttm_mem_eviction_valuable(bo, place))
+		return false;
+
+	/* VM bo should be checked at last because it will mark VM evicting. */
+	if (bo->type == ttm_bo_type_kernel)
+		return amdgpu_vm_evictable(ttm_to_amdgpu_bo(bo));
+
+	return true;
 }
 
 static void amdgpu_ttm_vram_mm_access(struct amdgpu_device *adev, loff_t pos,
-- 
2.25.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ