lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <072e1ce590fe101a4cdbd5e91b1702efebb6d0fd.1664171943.git-series.apopple@nvidia.com>
Date:   Mon, 26 Sep 2022 16:03:10 +1000
From:   Alistair Popple <apopple@...dia.com>
To:     linux-mm@...ck.org, Andrew Morton <akpm@...ux-foundation.org>
Cc:     Michael Ellerman <mpe@...erman.id.au>,
        Nicholas Piggin <npiggin@...il.com>,
        Felix Kuehling <Felix.Kuehling@....com>,
        Alex Deucher <alexander.deucher@....com>,
        Christian König <christian.koenig@....com>,
        "Pan, Xinhui" <Xinhui.Pan@....com>,
        David Airlie <airlied@...ux.ie>,
        Daniel Vetter <daniel@...ll.ch>,
        Ben Skeggs <bskeggs@...hat.com>,
        Karol Herbst <kherbst@...hat.com>,
        Lyude Paul <lyude@...hat.com>,
        Ralph Campbell <rcampbell@...dia.com>,
        "Matthew Wilcox (Oracle)" <willy@...radead.org>,
        Alex Sierra <alex.sierra@....com>,
        John Hubbard <jhubbard@...dia.com>,
        linuxppc-dev@...ts.ozlabs.org, linux-kernel@...r.kernel.org,
        amd-gfx@...ts.freedesktop.org, nouveau@...ts.freedesktop.org,
        dri-devel@...ts.freedesktop.org, Jason Gunthorpe <jgg@...dia.com>,
        Dan Williams <dan.j.williams@...el.com>,
        Alistair Popple <apopple@...dia.com>
Subject: [PATCH 6/7] nouveau/dmem: Evict device private memory during release

When the module is unloaded or a GPU is unbound from the module it is
possible for device private pages to be left mapped in currently running
processes. This leads to a kernel crash when the pages are either freed
or accessed from the CPU because the GPU and associated data structures
and callbacks have all been freed.

Fix this by migrating any mappings back to normal CPU memory prior to
freeing the GPU memory chunks and associated device private pages.

Signed-off-by: Alistair Popple <apopple@...dia.com>

---

I assume the AMD driver might have a similar issue. However I can't see
where device private (or coherent) pages actually get unmapped/freed
during teardown as I couldn't find any relevant calls to
devm_memunmap(), memunmap(), devm_release_mem_region() or
release_mem_region(). So it appears that ZONE_DEVICE pages are not being
properly freed during module unload, unless I'm missing something?
---
 drivers/gpu/drm/nouveau/nouveau_dmem.c | 48 +++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+)

diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 66ebbd4..3b247b8 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -369,6 +369,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
 	mutex_unlock(&drm->dmem->mutex);
 }
 
+/*
+ * Evict all pages mapping a chunk.
+ */
+void
+nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
+{
+	unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
+	unsigned long *src_pfns, *dst_pfns;
+	dma_addr_t *dma_addrs;
+	struct nouveau_fence *fence;
+
+	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
+	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+	dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
+
+	migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
+			npages);
+
+	for (i = 0; i < npages; i++) {
+		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
+			struct page *dpage;
+
+			/*
+			 * _GFP_NOFAIL because the GPU is going away and there
+			 * is nothing sensible we can do if we can't copy the
+			 * data back.
+			 */
+			dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
+			dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
+			nouveau_dmem_copy_one(chunk->drm,
+					migrate_pfn_to_page(src_pfns[i]), dpage,
+					&dma_addrs[i]);
+		}
+	}
+
+	nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
+	migrate_device_pages(src_pfns, dst_pfns, npages);
+	nouveau_dmem_fence_done(&fence);
+	migrate_device_finalize(src_pfns, dst_pfns, npages);
+	kfree(src_pfns);
+	kfree(dst_pfns);
+	for (i = 0; i < npages; i++)
+		dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
+	kfree(dma_addrs);
+}
+
 void
 nouveau_dmem_fini(struct nouveau_drm *drm)
 {
@@ -380,8 +426,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
 	mutex_lock(&drm->dmem->mutex);
 
 	list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
+		nouveau_dmem_evict_chunk(chunk);
 		nouveau_bo_unpin(chunk->bo);
 		nouveau_bo_ref(NULL, &chunk->bo);
+		WARN_ON(chunk->callocated);
 		list_del(&chunk->list);
 		memunmap_pages(&chunk->pagemap);
 		release_mem_region(chunk->pagemap.range.start,
-- 
git-series 0.9.1

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ