[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250613062920.68801-1-lizhe.67@bytedance.com>
Date: Fri, 13 Jun 2025 14:29:20 +0800
From: lizhe.67@...edance.com
To: alex.williamson@...hat.com
Cc: david@...hat.com,
kvm@...r.kernel.org,
linux-kernel@...r.kernel.org,
lizhe.67@...edance.com,
peterx@...hat.com
Subject: Re: [RFC v2] vfio/type1: optimize vfio_unpin_pages_remote() for large folio
On Thu, 12 Jun 2025 16:32:39 -0600, alex.williamson@...hat.com wrote:
> > drivers/vfio/vfio_iommu_type1.c | 53 +++++++++++++++++++++++++--------
> > 1 file changed, 41 insertions(+), 12 deletions(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 28ee4b8d39ae..2f6c0074d7b3 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -469,17 +469,28 @@ static bool is_invalid_reserved_pfn(unsigned long pfn)
> > return true;
> > }
> >
> > -static int put_pfn(unsigned long pfn, int prot)
> > +static inline void _put_pfns(struct page *page, int npages, int prot)
> > {
> > - if (!is_invalid_reserved_pfn(pfn)) {
> > - struct page *page = pfn_to_page(pfn);
> > + unpin_user_page_range_dirty_lock(page, npages, prot & IOMMU_WRITE);
> > +}
> >
> > - unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
> > - return 1;
> > +/*
> > + * The caller must ensure that these npages PFNs belong to the same folio.
> > + */
> > +static inline int put_pfns(unsigned long pfn, int npages, int prot)
> > +{
> > + if (!is_invalid_reserved_pfn(pfn)) {
> > + _put_pfns(pfn_to_page(pfn), npages, prot);
> > + return npages;
> > }
> > return 0;
> > }
> >
> > +static inline int put_pfn(unsigned long pfn, int prot)
> > +{
> > + return put_pfns(pfn, 1, prot);
> > +}
> > +
> > #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
> >
> > static void __vfio_batch_init(struct vfio_batch *batch, bool single)
> > @@ -805,15 +816,33 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> > unsigned long pfn, unsigned long npage,
> > bool do_accounting)
> > {
> > - long unlocked = 0, locked = 0;
> > - long i;
> > + long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
> >
> > - for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > - if (put_pfn(pfn++, dma->prot)) {
> > - unlocked++;
> > - if (vfio_find_vpfn(dma, iova))
> > - locked++;
> > + while (npage) {
> > + struct folio *folio;
> > + struct page *page;
> > + long step = 1;
> > +
> > + if (is_invalid_reserved_pfn(pfn))
> > + goto next;
> > +
> > + page = pfn_to_page(pfn);
> > + folio = page_folio(page);
> > +
> > + if (!folio_test_large(folio)) {
> > + _put_pfns(page, 1, dma->prot);
> > + } else {
> > + step = min_t(long, npage,
> > + folio_nr_pages(folio) -
> > + folio_page_idx(folio, page));
> > + _put_pfns(page, step, dma->prot);
> > }
> > +
> > + unlocked += step;
> > +next:
>
> Usage of @step is inconsistent, goto isn't really necessary either, how
> about:
>
> while (npage) {
> unsigned long step = 1;
>
> if (!is_invalid_reserved_pfn(pfn)) {
> struct page *page = pfn_to_page(pfn);
> struct folio *folio = page_folio(page);
> long nr_pages = folio_nr_pages(folio);
>
> if (nr_pages > 1)
> step = min_t(long, npage,
> nr_pages -
> folio_page_idx(folio, page));
>
> _put_pfns(page, step, dma->prot);
> unlocked += step;
> }
>
That's great. This implementation is much better.
I'm a bit uncertain about the best type to use for the 'step'
variable here. I've been trying to keep things consistent with the
put_pfn() function, so I set the type of the second parameter in
_put_pfns() to 'int'(we pass 'step' as the second argument to
_put_pfns()).
Using unsigned long for 'step' should definitely work here, as the
number of pages in a large folio currently falls within the range
that can be represented by an int. However, there is still a
potential risk of truncation that we need to be mindful of.
> > + pfn += step;
> > + iova += PAGE_SIZE * step;
> > + npage -= step;
> > }
> >
> > if (do_accounting)
>
> AIUI, the idea is that we know we have npage contiguous pfns and we
> currently test invalid/reserved, call pfn_to_page(), call
> unpin_user_pages_dirty_lock(), and test vpfn for each individually.
>
> This instead wants to batch the vpfn accounted pfns using the range
> helper added for the mapping patch,
Yes. We use vpfn_pages() just to track the locked pages.
> infer that continuous pfns have the
> same invalid/reserved state, the pages are sequential, and that we can
> use the end of the folio to mark any inflections in those assumptions
> otherwise. Do I have that correct?
Yes. I think we're definitely on the same page here.
> I think this could be split into two patches, one simply batching the
> vpfn accounting and the next introducing the folio dependency. The
> contributions of each to the overall performance improvement would be
> interesting.
I've made an initial attempt, and here are the two patches after
splitting them up.
1. batch-vpfn-accounting-patch:
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 28ee4b8d39ae..c8ddcee5aa68 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -805,16 +805,12 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
unsigned long pfn, unsigned long npage,
bool do_accounting)
{
- long unlocked = 0, locked = 0;
+ long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
long i;
- for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
- if (put_pfn(pfn++, dma->prot)) {
+ for (i = 0; i < npage; i++, iova += PAGE_SIZE)
+ if (put_pfn(pfn++, dma->prot))
unlocked++;
- if (vfio_find_vpfn(dma, iova))
- locked++;
- }
- }
if (do_accounting)
vfio_lock_acct(dma, locked - unlocked, true);
-----------------
2. large-folio-optimization-patch:
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index c8ddcee5aa68..48c2ba4ba4eb 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -469,17 +469,28 @@ static bool is_invalid_reserved_pfn(unsigned long pfn)
return true;
}
-static int put_pfn(unsigned long pfn, int prot)
+static inline void _put_pfns(struct page *page, int npages, int prot)
{
- if (!is_invalid_reserved_pfn(pfn)) {
- struct page *page = pfn_to_page(pfn);
+ unpin_user_page_range_dirty_lock(page, npages, prot & IOMMU_WRITE);
+}
- unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
- return 1;
+/*
+ * The caller must ensure that these npages PFNs belong to the same folio.
+ */
+static inline int put_pfns(unsigned long pfn, int npages, int prot)
+{
+ if (!is_invalid_reserved_pfn(pfn)) {
+ _put_pfns(pfn_to_page(pfn), npages, prot);
+ return npages;
}
return 0;
}
+static inline int put_pfn(unsigned long pfn, int prot)
+{
+ return put_pfns(pfn, 1, prot);
+}
+
#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
static void __vfio_batch_init(struct vfio_batch *batch, bool single)
@@ -806,11 +817,28 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
bool do_accounting)
{
long unlocked = 0, locked = vpfn_pages(dma, iova, npage);
- long i;
- for (i = 0; i < npage; i++, iova += PAGE_SIZE)
- if (put_pfn(pfn++, dma->prot))
- unlocked++;
+ while (npage) {
+ long step = 1;
+
+ if (!is_invalid_reserved_pfn(pfn)) {
+ struct page *page = pfn_to_page(pfn);
+ struct folio *folio = page_folio(page);
+ long nr_pages = folio_nr_pages(folio);
+
+ if (nr_pages > 1)
+ step = min_t(long, npage,
+ nr_pages -
+ folio_page_idx(folio, page));
+
+ _put_pfns(page, step, dma->prot);
+ unlocked += step;
+ }
+
+ pfn += step;
+ iova += PAGE_SIZE * step;
+ npage -= step;
+ }
if (do_accounting)
vfio_lock_acct(dma, locked - unlocked, true);
-----------------
Here are the results of the performance tests.
Base(v6.15):
./vfio-pci-mem-dma-map 0000:03:00.0 16
------- AVERAGE (MADV_HUGEPAGE) --------
VFIO MAP DMA in 0.048 s (333.5 GB/s)
VFIO UNMAP DMA in 0.139 s (115.1 GB/s)
------- AVERAGE (MAP_POPULATE) --------
VFIO MAP DMA in 0.273 s (58.6 GB/s)
VFIO UNMAP DMA in 0.302 s (52.9 GB/s)
------- AVERAGE (HUGETLBFS) --------
VFIO MAP DMA in 0.052 s (305.3 GB/s)
VFIO UNMAP DMA in 0.141 s (113.8 GB/s)
Base + Map + batch-vpfn-accounting-patch:
------- AVERAGE (MADV_HUGEPAGE) --------
VFIO MAP DMA in 0.027 s (591.1 GB/s)
VFIO UNMAP DMA in 0.138 s (115.7 GB/s)
------- AVERAGE (MAP_POPULATE) --------
VFIO MAP DMA in 0.292 s (54.8 GB/s)
VFIO UNMAP DMA in 0.308 s (52.0 GB/s)
------- AVERAGE (HUGETLBFS) --------
VFIO MAP DMA in 0.032 s (505.5 GB/s)
VFIO UNMAP DMA in 0.140 s (114.1 GB/s)
Base + Map + batch-vpfn-accounting-patch + large-folio-optimization-patch:
------- AVERAGE (MADV_HUGEPAGE) --------
VFIO MAP DMA in 0.027 s (591.2 GB/s)
VFIO UNMAP DMA in 0.049 s (327.6 GB/s)
------- AVERAGE (MAP_POPULATE) --------
VFIO MAP DMA in 0.291 s (55.0 GB/s)
VFIO UNMAP DMA in 0.306 s (52.3 GB/s)
------- AVERAGE (HUGETLBFS) --------
VFIO MAP DMA in 0.032 s (498.3 GB/s)
VFIO UNMAP DMA in 0.049 s (326.2 GB/s)
It seems that batching the vpfn accounting doesn't seem to have much
of an impact in my environment. Perhaps this is because the rbtree
for vfpn is empty, allowing vfio_find_vpfn to execute quickly?
Thanks,
Zhe
Powered by blists - more mailing lists