Message-ID: <20250520062031.3023-1-lizhe.67@bytedance.com>
Date: Tue, 20 May 2025 14:20:31 +0800
From: lizhe.67@...edance.com
To: alex.williamson@...hat.com
Cc: kvm@...r.kernel.org,
linux-kernel@...r.kernel.org,
lizhe.67@...edance.com,
muchun.song@...ux.dev,
peterx@...hat.com
Subject: Re: [PATCH v2] vfio/type1: optimize vfio_pin_pages_remote() for hugetlbfs folio
On Mon, 19 May 2025 10:07:24 -0600, alex.williamson@...hat.com wrote:
>> /*
>> - * Helper Functions for host iova-pfn list
>> + * Find the first vfio_pfn that overlaps the range
>> + * [iova, iova + PAGE_SIZE * npage) in rb tree
>> */
>> -static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
>> +static struct vfio_pfn *vfio_find_vpfn_range(struct vfio_dma *dma,
>> + dma_addr_t iova, unsigned long npage)
>> {
>> struct vfio_pfn *vpfn;
>> struct rb_node *node = dma->pfn_list.rb_node;
>> + dma_addr_t end_iova = iova + PAGE_SIZE * npage;
>>
>> while (node) {
>> vpfn = rb_entry(node, struct vfio_pfn, node);
>>
>> - if (iova < vpfn->iova)
>> + if (end_iova <= vpfn->iova)
>> node = node->rb_left;
>> else if (iova > vpfn->iova)
>> node = node->rb_right;
>> @@ -337,6 +340,14 @@ static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
>> return NULL;
>> }
>>
>> +/*
>> + * Helper Functions for host iova-pfn list
>> + */
>
>This comment should still precede the renamed function above, it's in
>reference to this section of code related to searching, inserting, and
>removing entries from the pfn list.
Thanks!
I will move this comment so that it again precedes vfio_find_vpfn_range()
in the v3 patch.
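Concretely, something like the sketch below (the lookup body just restates
the hunk above plus the unchanged lines of the existing vfio_find_vpfn();
the exact v3 code may still change):

/*
 * Helper Functions for host iova-pfn list
 */

/*
 * Find the first vfio_pfn that overlaps the range
 * [iova, iova + PAGE_SIZE * npage) in rb tree
 */
static struct vfio_pfn *vfio_find_vpfn_range(struct vfio_dma *dma,
		dma_addr_t iova, unsigned long npage)
{
	struct vfio_pfn *vpfn;
	struct rb_node *node = dma->pfn_list.rb_node;
	dma_addr_t end_iova = iova + PAGE_SIZE * npage;

	while (node) {
		vpfn = rb_entry(node, struct vfio_pfn, node);

		if (end_iova <= vpfn->iova)	/* range ends before this entry */
			node = node->rb_left;
		else if (iova > vpfn->iova)	/* range starts after this entry */
			node = node->rb_right;
		else
			return vpfn;		/* overlap found */
	}
	return NULL;
}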
>> @@ -681,32 +692,67 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>> * and rsvd here, and therefore continues to use the batch.
>> */
>> while (true) {
>> + int page_step = 1;
>> + long lock_acct_step = 1;
>> + struct folio *folio = page_folio(batch->pages[batch->offset]);
>> + bool found_vpfn;
>> +
>> if (pfn != *pfn_base + pinned ||
>> rsvd != is_invalid_reserved_pfn(pfn))
>> goto out;
>>
>> + /* Handle hugetlbfs page */
>> + if (folio_test_hugetlb(folio)) {
>
>Why do we care to specifically test for hugetlb vs
>folio_large_nr_pages(), at which point we can just use folio_nr_pages()
>directly here.
>
>> + unsigned long start_pfn = PHYS_PFN(vaddr);
>
>Using this macro on a vaddr looks wrong.
>
>> +
>> + /*
>> + * Note: The current page_step does not achieve the optimal
>> + * performance in scenarios where folio_nr_pages() exceeds
>> + * batch->capacity. It is anticipated that future enhancements
>> + * will address this limitation.
>> + */
>> + page_step = min(batch->size,
>> + ALIGN(start_pfn + 1, folio_nr_pages(folio)) - start_pfn);
>
>Why do we assume start_pfn is the beginning of the folio?
>
>> + found_vpfn = !!vfio_find_vpfn_range(dma, iova, page_step);
>> + if (rsvd || !found_vpfn) {
>> + lock_acct_step = page_step;
>> + } else {
>> + dma_addr_t tmp_iova = iova;
>> + int i;
>> +
>> + lock_acct_step = 0;
>> + for (i = 0; i < page_step; ++i, tmp_iova += PAGE_SIZE)
>> + if (!vfio_find_vpfn(dma, tmp_iova))
>> + lock_acct_step++;
>> + if (lock_acct_step)
>> + found_vpfn = false;
>
>Why are we making this so complicated versus falling back to iterating
>at page per page?
>
>> + }
>> + } else {
>> + found_vpfn = vfio_find_vpfn(dma, iova);
>> + }
>> +
>> /*
>> * Reserved pages aren't counted against the user,
>> * externally pinned pages are already counted against
>> * the user.
>> */
>> - if (!rsvd && !vfio_find_vpfn(dma, iova)) {
>> + if (!rsvd && !found_vpfn) {
>> if (!dma->lock_cap &&
>> - mm->locked_vm + lock_acct + 1 > limit) {
>> + mm->locked_vm + lock_acct + lock_acct_step > limit) {
>> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>> __func__, limit << PAGE_SHIFT);
>> ret = -ENOMEM;
>> goto unpin_out;
>> }
>> - lock_acct++;
>> + lock_acct += lock_acct_step;
>> }
>>
>> - pinned++;
>> - npage--;
>> - vaddr += PAGE_SIZE;
>> - iova += PAGE_SIZE;
>> - batch->offset++;
>> - batch->size--;
>> + pinned += page_step;
>> + npage -= page_step;
>> + vaddr += PAGE_SIZE * page_step;
>> + iova += PAGE_SIZE * page_step;
>> + batch->offset += page_step;
>> + batch->size -= page_step;
>>
>> if (!batch->size)
>> break;
>
>Why is something like below (untested) not sufficient?
>
>NB. (vaddr - folio_address()) still needs some scrutiny to determine if
>it's valid.
>
>@@ -681,32 +692,40 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> * and rsvd here, and therefore continues to use the batch.
> */
> while (true) {
>+ struct folio *folio = page_folio(batch->pages[batch->offset]);
>+ long nr_pages;
>+
> if (pfn != *pfn_base + pinned ||
> rsvd != is_invalid_reserved_pfn(pfn))
> goto out;
>
>+ nr_pages = min(batch->size, folio_nr_pages(folio) -
>+ folio_page_idx(folio, batch->pages[batch->offset]));
>+ if (nr_pages > 1 && vfio_find_vpfn_range(dma, iova, nr_pages))
>+ nr_pages = 1;
>+
> /*
> * Reserved pages aren't counted against the user,
> * externally pinned pages are already counted against
> * the user.
> */
>- if (!rsvd && !vfio_find_vpfn(dma, iova)) {
>+ if (!rsvd && (nr_pages > 1 || !vfio_find_vpfn(dma, iova))) {
> if (!dma->lock_cap &&
>- mm->locked_vm + lock_acct + 1 > limit) {
>+ mm->locked_vm + lock_acct + nr_pages > limit) {
> pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> __func__, limit << PAGE_SHIFT);
> ret = -ENOMEM;
> goto unpin_out;
> }
>- lock_acct++;
>+ lock_acct += nr_pages;
> }
>
>- pinned++;
>- npage--;
>- vaddr += PAGE_SIZE;
>- iova += PAGE_SIZE;
>- batch->offset++;
>- batch->size--;
>+ pinned += nr_pages;
>+ npage -= nr_pages;
>+ vaddr += PAGE_SIZE * nr_pages;
>+ iova += PAGE_SIZE * nr_pages;
>+ batch->offset += nr_pages;
>+ batch->size -= nr_pages;
>
> if (!batch->size)
> break;
The input parameter vaddr is a user-space address, while folio_address()
returns a kernel-space address, so (vaddr - folio_address()) is not a valid
way to locate the page within the folio. I agree that using folio_page_idx()
is the best approach.
This approach indeed simplifies things a lot. I have run basic functionality
tests on it and have not found any issues. Please allow me to integrate it
into the v3 patch. Thanks!
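To spell out how I read the offset computation (sketch only, not the final
v3 code; I have assumed min_t() is needed because batch->size and
folio_nr_pages() do not share a type, and will double-check that):

	struct page *page = batch->pages[batch->offset];
	struct folio *folio = page_folio(page);
	long nr_pages;

	/*
	 * folio_page_idx() works on the struct page pointer alone, so it
	 * does not depend on vaddr at all; it gives the position of this
	 * page within its folio without any user/kernel address arithmetic.
	 */
	nr_pages = min_t(long, batch->size,
			 folio_nr_pages(folio) - folio_page_idx(folio, page));

	/*
	 * If any iova in the range is already tracked in the pfn list,
	 * fall back to stepping a single page so that the existing
	 * per-page accounting below still applies.
	 */
	if (nr_pages > 1 && vfio_find_vpfn_range(dma, iova, nr_pages))
		nr_pages = 1;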