[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <15b38c90ef1eb0825b7492d633d46d901e428f7d.1729760996.git.qinyuntan@linux.alibaba.com>
Date: Thu, 24 Oct 2024 17:34:44 +0800
From: Qinyun Tan <qinyuntan@...ux.alibaba.com>
To: Andrew Morton <akpm@...ux-foundation.org>,
Alex Williamson <alex.williamson@...hat.com>
Cc: linux-mm@...ck.org,
kvm@...r.kernel.org,
linux-kernel@...r.kernel.org,
Qinyun Tan <qinyuntan@...ux.alibaba.com>,
Guanghui Feng <guanghuifeng@...ux.alibaba.com>,
Xunlei Pang <xlpang@...ux.alibaba.com>
Subject: [PATCH v1: vfio: avoid unnecessary pin memory when dma map io address space 2/2] vfio: avoid unnecessary pin memory when dma map io address space
When user application call ioctl(VFIO_IOMMU_MAP_DMA) to map a dma address,
the general handler 'vfio_pin_map_dma' attempts to pin the memory and
then create the mapping in the iommu.
However, some mappings aren't backed by a struct page, for example an
mmap'd MMIO range for our own or another device. In this scenario, a vma
with flag VM_IO | VM_PFNMAP, the pin operation will fail. Moreover, the
pin operation incurs a large overhead which will result in a longer
startup time for the VM. We don't actually need a pin in this scenario.
To address this issue, we introduce a new DMA MAP flag
'VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN' to skip the 'vfio_pin_pages_remote'
operation in the DMA map process for mmio memory. Additionally, we add
the 'VM_PGOFF_IS_PFN' flag for vfio_pci_mmap address, ensuring that we can
directly obtain the pfn through vma->vm_pgoff.
This approach allows us to avoid unnecessary memory pinning operations,
which would otherwise introduce additional overhead during DMA mapping.
In my tests, using vfio to pass through an 8-card AMD GPU which with a
large bar size (128GB*8), the time mapping the 192GB*8 bar was reduced
from about 50.79s to 1.57s.
Signed-off-by: Qinyun Tan <qinyuntan@...ux.alibaba.com>
Signed-off-by: Guanghui Feng <guanghuifeng@...ux.alibaba.com>
Reviewed-by: Xunlei Pang <xlpang@...ux.alibaba.com>
---
drivers/vfio/pci/vfio_pci_core.c | 2 +-
drivers/vfio/vfio_iommu_type1.c | 64 +++++++++++++++++++++++++-------
include/uapi/linux/vfio.h | 11 ++++++
3 files changed, 62 insertions(+), 15 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 1ab58da9f38a6..9e8743429e490 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1802,7 +1802,7 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
* the VMA flags.
*/
vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
- VM_DONTEXPAND | VM_DONTDUMP);
+ VM_DONTEXPAND | VM_DONTDUMP | VM_PGOFF_IS_PFN);
vma->vm_ops = &vfio_pci_mmap_ops;
return 0;
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bf391b40e576f..156e668de117d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1439,7 +1439,7 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
}
static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
- size_t map_size)
+ size_t map_size, unsigned int map_flags)
{
dma_addr_t iova = dma->iova;
unsigned long vaddr = dma->vaddr;
@@ -1448,27 +1448,61 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
long npage;
unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
int ret = 0;
+ struct mm_struct *mm = current->mm;
+ bool mmio_dont_pin = map_flags & VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN;
+
+ /* This code path is only user initiated */
+ if (!mm) {
+ ret = -ENODEV;
+ goto out;
+ }
vfio_batch_init(&batch);
while (size) {
- /* Pin a contiguous chunk of memory */
- npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
- size >> PAGE_SHIFT, &pfn, limit,
- &batch);
- if (npage <= 0) {
- WARN_ON(!npage);
- ret = (int)npage;
- break;
+ struct vm_area_struct *vma;
+ unsigned long start = vaddr + dma->size;
+ bool do_pin_pages = true;
+
+ if (mmio_dont_pin) {
+ mmap_read_lock(mm);
+
+ vma = find_vma_intersection(mm, start, start+1);
+
+ /*
+ * If this dma address rang belongs to the IO address space with VMA flags
+ * VM_IO | VM_PFNMAP | VM_PGOFF_IS_PFN, it doesn't need to be pinned.
+ * Simply skip the pin operation to avoid unnecessary overhead.
+ */
+ if (vma && (vma->vm_flags & VM_PFNMAP) && (vma->vm_flags & VM_IO)
+ && (vma->vm_flags & VM_PGOFF_IS_PFN)) {
+ pfn = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ npage = min_t(long, (vma->vm_end - start), size) >> PAGE_SHIFT;
+ do_pin_pages = false;
+ }
+ mmap_read_unlock(mm);
+ }
+
+ if (do_pin_pages) {
+ /* Pin a contiguous chunk of memory */
+ npage = vfio_pin_pages_remote(dma, start, size >> PAGE_SHIFT, &pfn,
+ limit, &batch);
+ if (npage <= 0) {
+ WARN_ON(!npage);
+ ret = (int)npage;
+ break;
+ }
}
/* Map it! */
ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
dma->prot);
if (ret) {
- vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
- npage, true);
- vfio_batch_unpin(&batch, dma);
+ if (do_pin_pages) {
+ vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
+ npage, true);
+ vfio_batch_unpin(&batch, dma);
+ }
break;
}
@@ -1479,6 +1513,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
vfio_batch_fini(&batch);
dma->iommu_mapped = true;
+out:
if (ret)
vfio_remove_dma(iommu, dma);
@@ -1645,7 +1680,7 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (list_empty(&iommu->domain_list))
dma->size = size;
else
- ret = vfio_pin_map_dma(iommu, dma, size);
+ ret = vfio_pin_map_dma(iommu, dma, size, map->flags);
if (!ret && iommu->dirty_page_tracking) {
ret = vfio_dma_bitmap_alloc(dma, pgsize);
@@ -2639,6 +2674,7 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
case VFIO_TYPE1_IOMMU:
case VFIO_TYPE1v2_IOMMU:
case VFIO_TYPE1_NESTING_IOMMU:
+ case VFIO_DMA_MAP_MMIO_DONT_PIN:
case VFIO_UNMAP_ALL:
return 1;
case VFIO_UPDATE_VADDR:
@@ -2811,7 +2847,7 @@ static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
struct vfio_iommu_type1_dma_map map;
unsigned long minsz;
uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
- VFIO_DMA_MAP_FLAG_VADDR;
+ VFIO_DMA_MAP_FLAG_VADDR | VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN;
minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 2b68e6cdf1902..ca391ec41b3c3 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -56,6 +56,16 @@
*/
#define VFIO_UPDATE_VADDR 10
+/*
+ * Support VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN for DMA mapping. For MMIO addresses,
+ * we do not need to pin the pages or establish address mapping in the MMU
+ * at this stage. We only need to establish the address mapping in the IOMMU for
+ * the ioctl(VFIO_IOMMU_MAP_DMA). The page table mapping in the MMU will be
+ * dynamically established through the page fault mechanism when the page
+ * is accessed in the future.
+ */
+#define VFIO_DMA_MAP_MMIO_DONT_PIN 11
+
/*
* The IOCTL interface is designed for extensibility by embedding the
* structure length (argsz) and flags into structures passed between
@@ -1560,6 +1570,7 @@ struct vfio_iommu_type1_dma_map {
#define VFIO_DMA_MAP_FLAG_READ (1 << 0) /* readable from device */
#define VFIO_DMA_MAP_FLAG_WRITE (1 << 1) /* writable from device */
#define VFIO_DMA_MAP_FLAG_VADDR (1 << 2)
+#define VFIO_DMA_MAP_FLAG_MMIO_DONT_PIN (1 << 3) /* MMIO doesn't need pin page */
__u64 vaddr; /* Process virtual address */
__u64 iova; /* IO virtual address */
__u64 size; /* Size of mapping (bytes) */
--
2.43.5
Powered by blists - more mailing lists