[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251201221355.wvsm6qxwxax46v4t@amd.com>
Date: Mon, 1 Dec 2025 16:13:55 -0600
From: Michael Roth <michael.roth@....com>
To: Yan Zhao <yan.y.zhao@...el.com>
CC: <kvm@...r.kernel.org>, <linux-coco@...ts.linux.dev>, <linux-mm@...ck.org>,
<linux-kernel@...r.kernel.org>, <thomas.lendacky@....com>,
<pbonzini@...hat.com>, <seanjc@...gle.com>, <vbabka@...e.cz>,
<ashish.kalra@....com>, <liam.merwick@...cle.com>, <david@...hat.com>,
<vannapurve@...gle.com>, <ackerleytng@...gle.com>, <aik@....com>,
<ira.weiny@...el.com>
Subject: Re: [PATCH 3/3] KVM: guest_memfd: GUP source pages prior to
populating guest memory
On Mon, Nov 24, 2025 at 05:31:46PM +0800, Yan Zhao wrote:
> On Fri, Nov 21, 2025 at 07:01:44AM -0600, Michael Roth wrote:
> > On Thu, Nov 20, 2025 at 05:11:48PM +0800, Yan Zhao wrote:
> > > On Thu, Nov 13, 2025 at 05:07:59PM -0600, Michael Roth wrote:
> > > > Currently the post-populate callbacks handle copying source pages into
> > > > private GPA ranges backed by guest_memfd, where kvm_gmem_populate()
> > > > acquires the filemap invalidate lock, then calls a post-populate
> > > > callback which may issue a get_user_pages() on the source pages prior to
> > > > copying them into the private GPA (e.g. TDX).
> > > >
> > > > This will not be compatible with in-place conversion, where the
> > > > userspace page fault path will attempt to acquire filemap invalidate
> > > > lock while holding the mm->mmap_lock, leading to a potential ABBA
> > > > deadlock[1].
> > > >
> > > > Address this by hoisting the GUP above the filemap invalidate lock so
> > > > that these page faults path can be taken early, prior to acquiring the
> > > > filemap invalidate lock.
> > > >
> > > > It's not currently clear whether this issue is reachable with the
> > > > current implementation of guest_memfd, which doesn't support in-place
> > > > conversion, however it does provide a consistent mechanism to provide
> > > > stable source/target PFNs to callbacks rather than punting to
> > > > vendor-specific code, which allows for more commonality across
> > > > architectures, which may be worthwhile even without in-place conversion.
> > > >
> > > > Suggested-by: Sean Christopherson <seanjc@...gle.com>
> > > > Signed-off-by: Michael Roth <michael.roth@....com>
> > > > ---
> > > > arch/x86/kvm/svm/sev.c | 40 ++++++++++++++++++++++++++------------
> > > > arch/x86/kvm/vmx/tdx.c | 21 +++++---------------
> > > > include/linux/kvm_host.h | 3 ++-
> > > > virt/kvm/guest_memfd.c | 42 ++++++++++++++++++++++++++++++++++------
> > > > 4 files changed, 71 insertions(+), 35 deletions(-)
> > > >
> > > > diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> > > > index 0835c664fbfd..d0ac710697a2 100644
> > > > --- a/arch/x86/kvm/svm/sev.c
> > > > +++ b/arch/x86/kvm/svm/sev.c
> > > > @@ -2260,7 +2260,8 @@ struct sev_gmem_populate_args {
> > > > };
> > > >
> > > > static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
> > > > - void __user *src, int order, void *opaque)
> > > > + struct page **src_pages, loff_t src_offset,
> > > > + int order, void *opaque)
> > > > {
> > > > struct sev_gmem_populate_args *sev_populate_args = opaque;
> > > > struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
> > > > @@ -2268,7 +2269,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
> > > > int npages = (1 << order);
> > > > gfn_t gfn;
> > > >
> > > > - if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
> > > > + if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_pages))
> > > > return -EINVAL;
> > > >
> > > > for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
> > > > @@ -2284,14 +2285,21 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
> > > > goto err;
> > > > }
> > > >
> > > > - if (src) {
> > > > - void *vaddr = kmap_local_pfn(pfn + i);
> > > > + if (src_pages) {
> > > > + void *src_vaddr = kmap_local_pfn(page_to_pfn(src_pages[i]));
> > > > + void *dst_vaddr = kmap_local_pfn(pfn + i);
> > > >
> > > > - if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
> > > > - ret = -EFAULT;
> > > > - goto err;
> > > > + memcpy(dst_vaddr, src_vaddr + src_offset, PAGE_SIZE - src_offset);
> > > > + kunmap_local(src_vaddr);
> > > > +
> > > > + if (src_offset) {
> > > > + src_vaddr = kmap_local_pfn(page_to_pfn(src_pages[i + 1]));
> > > > +
> > > > + memcpy(dst_vaddr + PAGE_SIZE - src_offset, src_vaddr, src_offset);
> > > > + kunmap_local(src_vaddr);
> > > IIUC, src_offset is the src's offset from the first page. e.g.,
> > > src could be 0x7fea82684100, with src_offset=0x100, while npages could be 512.
> > >
> > > Then it looks like the two memcpy() calls here only work when npages == 1 ?
> >
> > src_offset ends up being the offset into the pair of src pages that we
> > are using to fully populate a single dest page with each iteration. So
> > if we start at src_offset, read a page worth of data, then we are now at
> > src_offset in the next src page and the loop continues that way even if
> > npages > 1.
> >
> > If src_offset is 0 we never have to bother with straddling 2 src pages so
> > the 2nd memcpy is skipped on every iteration.
> >
> > That's the intent at least. Is there a flaw in the code/reasoning that I
> > missed?
> Oh, I got you. SNP expects a single src_offset applies for each src page.
>
> So if npages = 2, there're 4 memcpy() calls.
>
> src: |---------|---------|---------| (VA contiguous)
> ^ ^ ^
> | | |
> dst: |---------|---------| (PA contiguous)
>
>
> I previously incorrectly thought kvm_gmem_populate() should pass in src_offset
> as 0 for the 2nd src page.
>
> Would you consider checking if params.uaddr is PAGE_ALIGNED() in
> snp_launch_update() to simplify the design?
This was an option mentioned in the cover letter and during PUCK. I am
not opposed if that's the direction we decide, but I also don't think
it makes big difference since:
int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
struct page **src_pages, loff_t src_offset,
int order, void *opaque);
basically reduces to Sean's originally proposed:
int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
struct page *src_pages, int order,
void *opaque);
for any platform that enforces that the src is page-aligned, which
doesn't seem like a huge technical burden, IMO, despite me initially
thinking it would be gross when I brought this up during the PUCK call
that preceeding this posting.
>
> > >
> > > > }
> > > > - kunmap_local(vaddr);
> > > > +
> > > > + kunmap_local(dst_vaddr);
> > > > }
> > > >
> > > > ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
> > > > @@ -2331,12 +2339,20 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
> > > > if (!snp_page_reclaim(kvm, pfn + i) &&
> > > > sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
> > > > sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
> > > > - void *vaddr = kmap_local_pfn(pfn + i);
> > > > + void *src_vaddr = kmap_local_pfn(page_to_pfn(src_pages[i]));
> > > > + void *dst_vaddr = kmap_local_pfn(pfn + i);
> > > >
> > > > - if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
> > > > - pr_debug("Failed to write CPUID page back to userspace\n");
> > > > + memcpy(src_vaddr + src_offset, dst_vaddr, PAGE_SIZE - src_offset);
> > > > + kunmap_local(src_vaddr);
> > > > +
> > > > + if (src_offset) {
> > > > + src_vaddr = kmap_local_pfn(page_to_pfn(src_pages[i + 1]));
> > > > +
> > > > + memcpy(src_vaddr, dst_vaddr + PAGE_SIZE - src_offset, src_offset);
> > > > + kunmap_local(src_vaddr);
> > > > + }
> > > >
> > > > - kunmap_local(vaddr);
> > > > + kunmap_local(dst_vaddr);
> > > > }
> > > >
> > > > /* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
> > > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > > index 57ed101a1181..dd5439ec1473 100644
> > > > --- a/arch/x86/kvm/vmx/tdx.c
> > > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > > @@ -3115,37 +3115,26 @@ struct tdx_gmem_post_populate_arg {
> > > > };
> > > >
> > > > static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> > > > - void __user *src, int order, void *_arg)
> > > > + struct page **src_pages, loff_t src_offset,
> > > > + int order, void *_arg)
> > > > {
> > > > struct tdx_gmem_post_populate_arg *arg = _arg;
> > > > struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > > u64 err, entry, level_state;
> > > > gpa_t gpa = gfn_to_gpa(gfn);
> > > > - struct page *src_page;
> > > > int ret, i;
> > > >
> > > > if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
> > > > return -EIO;
> > > >
> > > > - if (KVM_BUG_ON(!PAGE_ALIGNED(src), kvm))
> > > > + /* Source should be page-aligned, in which case src_offset will be 0. */
> > > > + if (KVM_BUG_ON(src_offset))
> > > if (KVM_BUG_ON(src_offset, kvm))
> > >
> > > > return -EINVAL;
> > > >
> > > > - /*
> > > > - * Get the source page if it has been faulted in. Return failure if the
> > > > - * source page has been swapped out or unmapped in primary memory.
> > > > - */
> > > > - ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
> > > > - if (ret < 0)
> > > > - return ret;
> > > > - if (ret != 1)
> > > > - return -ENOMEM;
> > > > -
> > > > - kvm_tdx->page_add_src = src_page;
> > > > + kvm_tdx->page_add_src = src_pages[i];
> > > src_pages[0] ? i is not initialized.
> >
> > Sorry, I switched on TDX options for compile testing but I must have done a
> > sloppy job confirming it actually built. I'll re-test push these and squash
> > in the fixes in the github tree.
> >
> > >
> > > Should there also be a KVM_BUG_ON(order > 0, kvm) ?
> >
> > Seems reasonable, but I'm not sure this is the right patch. Maybe I
> > could squash it into the preceeding documentation patch so as to not
> > give the impression this patch changes those expectations in any way.
> I don't think it should be documented as a user requirement.
I didn't necessarily mean in the documentation, but mainly some patch
other than this. If we add that check here as part of this patch, we
give the impression that the order expectations are changing as a result
of the changes here, when in reality they are exactly the same as
before.
If not the documentation patch here, then I don't think it really fits
in this series at all and would be more of a standalone patch against
kvm/next.
The change here:
- if (KVM_BUG_ON(!PAGE_ALIGNED(src), kvm))
+ /* Source should be page-aligned, in which case src_offset will be 0. */
+ if (KVM_BUG_ON(src_offset))
made sense as part of this patch, because now that we are passing struct
page *src_pages, we can no longer infer alignment from 'src' field, and
instead need to infer it from src_offset being 0.
>
> However, we need to comment out that this assertion is due to that
> tdx_vcpu_init_mem_region() passes npages as 1 to kvm_gmem_populate().
You mean for the KVM_BUG_ON(order > 0, kvm) you're proposing to add?
Again, if feels awkward to address this as part of this series since it
is an existing/unchanged behavior and not really the intent of this
patchset.
>
> > >
> > > > ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
> > > > kvm_tdx->page_add_src = NULL;
> > > >
> > > > - put_page(src_page);
> > > > -
> > > > if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
> > > > return ret;
> > > >
> > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > index d93f75b05ae2..7e9d2403c61f 100644
> > > > --- a/include/linux/kvm_host.h
> > > > +++ b/include/linux/kvm_host.h
> > > > @@ -2581,7 +2581,8 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
> > > > * Returns the number of pages that were populated.
> > > > */
> > > > typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> > > > - void __user *src, int order, void *opaque);
> > > > + struct page **src_pages, loff_t src_offset,
> > > > + int order, void *opaque);
> > > >
> > > > long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> > > > kvm_gmem_populate_cb post_populate, void *opaque);
> > > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > > > index 9160379df378..e9ac3fd4fd8f 100644
> > > > --- a/virt/kvm/guest_memfd.c
> > > > +++ b/virt/kvm/guest_memfd.c
> > > > @@ -814,14 +814,17 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
> > > >
> > > > #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
> > > > +
> > > > +#define GMEM_GUP_NPAGES (1UL << PMD_ORDER)
> > > Limiting GMEM_GUP_NPAGES to PMD_ORDER may only work when the max_order of a huge
> > > folio is 2MB. What if the max_order returned from __kvm_gmem_get_pfn() is 1GB
> > > when src_pages[] can only hold up to 512 pages?
> >
> > This was necessarily chosen in prep for hugepages, but more about my
> > unease at letting userspace GUP arbitrarilly large ranges. PMD_ORDER
> > happens to align with 2MB hugepages while seeming like a reasonable
> > batching value so that's why I chose it.
> >
> > Even with 1GB support, I wasn't really planning to increase it. SNP
> > doesn't really make use of RMP sizes >2MB, and it sounds like TDX
> > handles promotion in a completely different path. So atm I'm leaning
> > toward just letting GMEM_GUP_NPAGES be the cap for the max page size we
> > support for kvm_gmem_populate() path and not bothering to change it
> > until a solid use-case arises.
> The problem is that with hugetlb-based guest_memfd, the folio itself could be
> of 1GB, though SNP and TDX can force mapping at only 4KB.
If TDX wants to unload handling of page-clearing to its per-page
post-populate callback and tie that its shared/private tracking that's
perfectly fine by me.
*How* TDX tells gmem it wants this different behavior is a topic for a
follow-up patchset, Vishal suggested kernel-internal flags to
kvm_gmem_create(), which seemed reasonable to me. In that case, uptodate
flag would probably just default to set and punt to post-populate/prep
hooks, because we absolutely *do not* want to have to re-introduce per-4K
tracking of this type of state within gmem, since getting rid of that sort
of tracking requirement within gmem is the entire motivation of this
series. And since, within this series, the uptodate flag and
prep-tracking both have the same 4K granularity, it seems unecessary to
address this here.
If you were to send a patchset on top of this (or even independently) that
introduces said kernel-internal gmem flag to offload uptodate-tracking to
post-populate/prep hooks, and utilize it to optimize the current 4K-only
TDX implementation by letting TDX module handle the initial
page-clearing, then I think that change/discussion can progress without
being blocked in any major way by this series.
But I don't think we need to flesh all that out here, so long as we are
aware of this as a future change/requirement and have reasonable
indication that it is compatible with this series.
>
> Then since max_order = folio_order(folio) (at least in the tree for [1]),
> WARN_ON() in kvm_gmem_populate() could still be hit:
>
> folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
> WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> (npages - i) < (1 << max_order));
Yes, in the SNP implementation of hugetlb I ended up removing this
warning, and in that case I also ended up forcing kvm_gmem_populate() to
be 4K-only:
https://github.com/AMDESE/linux/blob/snp-hugetlb-v2-wip0/virt/kvm/guest_memfd.c#L2372
but it makes a lot more sense to make those restrictions and changes in
the context of hugepage support, rather than this series which is trying
very hard to not do hugepage enablement, but simply keep what's partially
there intact while reworking other things that have proven to be
continued impediments to both in-place conversion and hugepage
enablement.
Also, there's talk now of enabling hugepages even without in-place
conversion for hugetlbfs, and that will likely be the same path we
follow for THP to remain in alignment. Rather than anticipating what all
these changes will mean WRT hugepage implementation/requirements, I
think it will be fruitful to remove some of the baggage that will
complicate that process/discussion like this patchset attempts.
-Mike
>
> TDX is even easier to hit this warning because it always passes npages as 1.
>
> [1] https://lore.kernel.org/all/cover.1747264138.git.ackerleytng@google.com
>
>
> > > Increasing GMEM_GUP_NPAGES to (1UL << PUD_ORDER) is probabaly not a good idea.
> > >
> > > Given both TDX/SNP map at 4KB granularity, why not just invoke post_populate()
> > > per 4KB while removing the max_order from post_populate() parameters, as done
> > > in Sean's sketch patch [1]?
> >
> > That's an option too, but SNP can make use of 2MB pages in the
> > post-populate callback so I don't want to shut the door on that option
> > just yet if it's not too much of a pain to work in. Given the guest BIOS
> > lives primarily in 1 or 2 of these 2MB regions the benefits might be
> > worthwhile, and SNP doesn't have a post-post-populate promotion path
> > like TDX (at least, not one that would help much for guest boot times)
> I see.
>
> So, what about below change?
>
> --- a/virt/kvm/guest_memfd.c
> +++ b/virt/kvm/guest_memfd.c
> @@ -878,11 +878,10 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> }
>
> folio_unlock(folio);
> - WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> - (npages - i) < (1 << max_order));
>
> ret = -EINVAL;
> - while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
> + while (!IS_ALIGNED(gfn, 1 << max_order) || (npages - i) < (1 << max_order) ||
> + !kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
> KVM_MEMORY_ATTRIBUTE_PRIVATE,
> KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
> if (!max_order)
>
>
>
> >
> > >
> > > Then the WARN_ON() in kvm_gmem_populate() can be removed, which would be easily
> > > triggered by TDX when max_order > 0 && npages == 1:
> > >
> > > WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> > > (npages - i) < (1 << max_order));
> > >
> > >
> > > [1] https://lore.kernel.org/all/aHEwT4X0RcfZzHlt@google.com/
> > >
> > > > long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
> > > > kvm_gmem_populate_cb post_populate, void *opaque)
> > > > {
> > > > struct kvm_memory_slot *slot;
> > > > - void __user *p;
> > > > -
> > > > + struct page **src_pages;
> > > > int ret = 0, max_order;
> > > > - long i;
> > > > + loff_t src_offset = 0;
> > > > + long i, src_npages;
> > > >
> > > > lockdep_assert_held(&kvm->slots_lock);
> > > >
> > > > @@ -836,9 +839,28 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > > if (!file)
> > > > return -EFAULT;
> > > >
> > > > + npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
> > > > + npages = min_t(ulong, npages, GMEM_GUP_NPAGES);
> > > > +
> > > > + if (src) {
> > > > + src_npages = IS_ALIGNED((unsigned long)src, PAGE_SIZE) ? npages : npages + 1;
> > > > +
> > > > + src_pages = kmalloc_array(src_npages, sizeof(struct page *), GFP_KERNEL);
> > > > + if (!src_pages)
> > > > + return -ENOMEM;
> > > > +
> > > > + ret = get_user_pages_fast((unsigned long)src, src_npages, 0, src_pages);
> > > > + if (ret < 0)
> > > > + return ret;
> > > > +
> > > > + if (ret != src_npages)
> > > > + return -ENOMEM;
> > > > +
> > > > + src_offset = (loff_t)(src - PTR_ALIGN_DOWN(src, PAGE_SIZE));
> > > > + }
> > > > +
> > > > filemap_invalidate_lock(file->f_mapping);
> > > >
> > > > - npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
> > > > for (i = 0; i < npages; i += (1 << max_order)) {
> > > > struct folio *folio;
> > > > gfn_t gfn = start_gfn + i;
> > > > @@ -869,8 +891,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > > max_order--;
> > > > }
> > > >
> > > > - p = src ? src + i * PAGE_SIZE : NULL;
> > > > - ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> > > > + ret = post_populate(kvm, gfn, pfn, src ? &src_pages[i] : NULL,
> > > > + src_offset, max_order, opaque);
> > > Why src_offset is not 0 starting from the 2nd page?
> > >
> > > > if (!ret)
> > > > folio_mark_uptodate(folio);
> > > >
> > > > @@ -882,6 +904,14 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > >
> > > > filemap_invalidate_unlock(file->f_mapping);
> > > >
> > > > + if (src) {
> > > > + long j;
> > > > +
> > > > + for (j = 0; j < src_npages; j++)
> > > > + put_page(src_pages[j]);
> > > > + kfree(src_pages);
> > > > + }
> > > > +
> > > > return ret && !i ? ret : i;
> > > > }
> > > > EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
> > > > --
> > > > 2.25.1
> > > >
Powered by blists - more mailing lists