linux-kernel - Re: [PATCH 3/3] KVM: guest_memfd: GUP source pages prior to populating guest memory

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20251203142648.trx6sslxvxr26yzd@amd.com>
Date: Wed, 3 Dec 2025 08:26:48 -0600
From: Michael Roth <michael.roth@....com>
To: Yan Zhao <yan.y.zhao@...el.com>
CC: <kvm@...r.kernel.org>, <linux-coco@...ts.linux.dev>, <linux-mm@...ck.org>,
	<linux-kernel@...r.kernel.org>, <thomas.lendacky@....com>,
	<pbonzini@...hat.com>, <seanjc@...gle.com>, <vbabka@...e.cz>,
	<ashish.kalra@....com>, <liam.merwick@...cle.com>, <david@...hat.com>,
	<vannapurve@...gle.com>, <ackerleytng@...gle.com>, <aik@....com>,
	<ira.weiny@...el.com>
Subject: Re: [PATCH 3/3] KVM: guest_memfd: GUP source pages prior to
 populating guest memory

On Wed, Dec 03, 2025 at 10:46:27AM +0800, Yan Zhao wrote:
> On Mon, Dec 01, 2025 at 04:13:55PM -0600, Michael Roth wrote:
> > On Mon, Nov 24, 2025 at 05:31:46PM +0800, Yan Zhao wrote:
> > > On Fri, Nov 21, 2025 at 07:01:44AM -0600, Michael Roth wrote:
> > > > On Thu, Nov 20, 2025 at 05:11:48PM +0800, Yan Zhao wrote:
> > > > > On Thu, Nov 13, 2025 at 05:07:59PM -0600, Michael Roth wrote:
> > > > > > Currently the post-populate callbacks handle copying source pages into
> > > > > > private GPA ranges backed by guest_memfd, where kvm_gmem_populate()
> > > > > > acquires the filemap invalidate lock, then calls a post-populate
> > > > > > callback which may issue a get_user_pages() on the source pages prior to
> > > > > > copying them into the private GPA (e.g. TDX).
> > > > > > 
> > > > > > This will not be compatible with in-place conversion, where the
> > > > > > userspace page fault path will attempt to acquire filemap invalidate
> > > > > > lock while holding the mm->mmap_lock, leading to a potential ABBA
> > > > > > deadlock[1].
> > > > > > 
> > > > > > Address this by hoisting the GUP above the filemap invalidate lock so
> > > > > > > that these page fault paths can be taken early, prior to acquiring the
> > > > > > filemap invalidate lock.
> > > > > > 
> > > > > > It's not currently clear whether this issue is reachable with the
> > > > > > current implementation of guest_memfd, which doesn't support in-place
> > > > > > conversion, however it does provide a consistent mechanism to provide
> > > > > > stable source/target PFNs to callbacks rather than punting to
> > > > > > vendor-specific code, which allows for more commonality across
> > > > > > architectures, which may be worthwhile even without in-place conversion.
> > > > > > 
> > > > > > Suggested-by: Sean Christopherson <seanjc@...gle.com>
> > > > > > Signed-off-by: Michael Roth <michael.roth@....com>
> > > > > > ---
> > > > > >  arch/x86/kvm/svm/sev.c   | 40 ++++++++++++++++++++++++++------------
> > > > > >  arch/x86/kvm/vmx/tdx.c   | 21 +++++---------------
> > > > > >  include/linux/kvm_host.h |  3 ++-
> > > > > >  virt/kvm/guest_memfd.c   | 42 ++++++++++++++++++++++++++++++++++------
> > > > > >  4 files changed, 71 insertions(+), 35 deletions(-)
> > > > > > 
> > > > > > diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
> > > > > > index 0835c664fbfd..d0ac710697a2 100644
> > > > > > --- a/arch/x86/kvm/svm/sev.c
> > > > > > +++ b/arch/x86/kvm/svm/sev.c
> > > > > > @@ -2260,7 +2260,8 @@ struct sev_gmem_populate_args {
> > > > > >  };
> > > > > >  
> > > > > >  static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pfn,
> > > > > > -				  void __user *src, int order, void *opaque)
> > > > > > +				  struct page **src_pages, loff_t src_offset,
> > > > > > +				  int order, void *opaque)
> > > > > >  {
> > > > > >  	struct sev_gmem_populate_args *sev_populate_args = opaque;
> > > > > >  	struct kvm_sev_info *sev = to_kvm_sev_info(kvm);
> > > > > > @@ -2268,7 +2269,7 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
> > > > > >  	int npages = (1 << order);
> > > > > >  	gfn_t gfn;
> > > > > >  
> > > > > > -	if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src))
> > > > > > +	if (WARN_ON_ONCE(sev_populate_args->type != KVM_SEV_SNP_PAGE_TYPE_ZERO && !src_pages))
> > > > > >  		return -EINVAL;
> > > > > >  
> > > > > >  	for (gfn = gfn_start, i = 0; gfn < gfn_start + npages; gfn++, i++) {
> > > > > > @@ -2284,14 +2285,21 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
> > > > > >  			goto err;
> > > > > >  		}
> > > > > >  
> > > > > > -		if (src) {
> > > > > > -			void *vaddr = kmap_local_pfn(pfn + i);
> > > > > > +		if (src_pages) {
> > > > > > +			void *src_vaddr = kmap_local_pfn(page_to_pfn(src_pages[i]));
> > > > > > +			void *dst_vaddr = kmap_local_pfn(pfn + i);
> > > > > >  
> > > > > > -			if (copy_from_user(vaddr, src + i * PAGE_SIZE, PAGE_SIZE)) {
> > > > > > -				ret = -EFAULT;
> > > > > > -				goto err;
> > > > > > +			memcpy(dst_vaddr, src_vaddr + src_offset, PAGE_SIZE - src_offset);
> > > > > > +			kunmap_local(src_vaddr);
> > > > > > +
> > > > > > +			if (src_offset) {
> > > > > > +				src_vaddr = kmap_local_pfn(page_to_pfn(src_pages[i + 1]));
> > > > > > +
> > > > > > +				memcpy(dst_vaddr + PAGE_SIZE - src_offset, src_vaddr, src_offset);
> > > > > > +				kunmap_local(src_vaddr);
> > > > > IIUC, src_offset is the src's offset from the first page. e.g.,
> > > > > src could be 0x7fea82684100, with src_offset=0x100, while npages could be 512.
> > > > > 
> > > > > Then it looks like the two memcpy() calls here only work when npages == 1 ?
> > > > 
> > > > src_offset ends up being the offset into the pair of src pages that we
> > > > are using to fully populate a single dest page with each iteration. So
> > > > if we start at src_offset, read a page worth of data, then we are now at
> > > > src_offset in the next src page and the loop continues that way even if
> > > > npages > 1.
> > > > 
> > > > If src_offset is 0 we never have to bother with straddling 2 src pages so
> > > > the 2nd memcpy is skipped on every iteration.
> > > > 
> > > > That's the intent at least. Is there a flaw in the code/reasoning that I
> > > > missed?
> > > Oh, I got you. SNP expects a single src_offset applies for each src page.
> > > 
> > > So if npages = 2, there're 4 memcpy() calls.
> > > 
> > > src:  |---------|---------|---------|  (VA contiguous)
> > >           ^         ^         ^
> > >           |         |         |
> > > dst:      |---------|---------|   (PA contiguous)
> > > 
> > > 
> > > I previously incorrectly thought kvm_gmem_populate() should pass in src_offset
> > > as 0 for the 2nd src page.
> > > 
> > > Would you consider checking if params.uaddr is PAGE_ALIGNED() in
> > > snp_launch_update() to simplify the design?
> > 
> > This was an option mentioned in the cover letter and during PUCK. I am
> > not opposed if that's the direction we decide, but I also don't think
> > it makes big difference since:
> > 
> >    int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> >                                struct page **src_pages, loff_t src_offset,
> >                                int order, void *opaque);
> > 
> > basically reduces to Sean's originally proposed:
> > 
> >    int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> >                                struct page *src_pages, int order,
> >                                void *opaque);
> 
> Hmm, the requirement of having each copy to dst_page account for src_offset
> (which actually results in 2 copies) is quite confusing. I initially thought the
> src_offset only applied to the first dst_page.

What I'm wondering though is if I'd done a better job of documenting
this aspect, e.g. with the following comment added above
kvm_gmem_populate_cb:

  /*
   * ...
   * 'src_pages': array of GUP'd struct page pointers corresponding to
   *              the pages that store the data that is to be copied
   *              into the HPA corresponding to 'pfn'
   * 'src_offset': byte offset, relative to the first page in the array
   *               of pages pointed to by 'src_pages', to begin copying
   *               the data from.
   *
   * NOTE: if the caller of kvm_gmem_populate() enforces that 'src' is
   * page-aligned, then 'src_offset' will always be zero, and src_pages
   * will contain only 1 page to copy from, beginning at byte offset 0.
   * In this case, callers can assume src_offset is 0.
   */
  int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
                              struct page **src_pages, loff_t src_offset,
                              int order, void *opaque);

could the confusion have been avoided, or is it still unwieldy?

I don't mind that users like SNP need to deal with the extra bits, but
I'm hoping for users like TDX it isn't too kludgy.

> 
> This will also cause kvm_gmem_populate() to allocate 1 more src_npages than
> npages for dst pages.

That's more of a decision on the part of userspace deciding to use
a non-page-aligned 'src' pointer to begin with.

> 
> > for any platform that enforces that the src is page-aligned, which
> > doesn't seem like a huge technical burden, IMO, despite me initially
> > thinking it would be gross when I brought this up during the PUCK call
> > that preceded this posting.
> > > 
> > > > > 
> > > > > >  			}
> > > > > > -			kunmap_local(vaddr);
> > > > > > +
> > > > > > +			kunmap_local(dst_vaddr);
> > > > > >  		}
> > > > > >  
> > > > > >  		ret = rmp_make_private(pfn + i, gfn << PAGE_SHIFT, PG_LEVEL_4K,
> > > > > > @@ -2331,12 +2339,20 @@ static int sev_gmem_post_populate(struct kvm *kvm, gfn_t gfn_start, kvm_pfn_t pf
> > > > > >  	if (!snp_page_reclaim(kvm, pfn + i) &&
> > > > > >  	    sev_populate_args->type == KVM_SEV_SNP_PAGE_TYPE_CPUID &&
> > > > > >  	    sev_populate_args->fw_error == SEV_RET_INVALID_PARAM) {
> > > > > > -		void *vaddr = kmap_local_pfn(pfn + i);
> > > > > > +		void *src_vaddr = kmap_local_pfn(page_to_pfn(src_pages[i]));
> > > > > > +		void *dst_vaddr = kmap_local_pfn(pfn + i);
> > > > > >  
> > > > > > -		if (copy_to_user(src + i * PAGE_SIZE, vaddr, PAGE_SIZE))
> > > > > > -			pr_debug("Failed to write CPUID page back to userspace\n");
> > > > > > +		memcpy(src_vaddr + src_offset, dst_vaddr, PAGE_SIZE - src_offset);
> > > > > > +		kunmap_local(src_vaddr);
> > > > > > +
> > > > > > +		if (src_offset) {
> > > > > > +			src_vaddr = kmap_local_pfn(page_to_pfn(src_pages[i + 1]));
> > > > > > +
> > > > > > +			memcpy(src_vaddr, dst_vaddr + PAGE_SIZE - src_offset, src_offset);
> > > > > > +			kunmap_local(src_vaddr);
> > > > > > +		}
> > > > > >  
> > > > > > -		kunmap_local(vaddr);
> > > > > > +		kunmap_local(dst_vaddr);
> > > > > >  	}
> > > > > >  
> > > > > >  	/* pfn + i is hypervisor-owned now, so skip below cleanup for it. */
> > > > > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> > > > > > index 57ed101a1181..dd5439ec1473 100644
> > > > > > --- a/arch/x86/kvm/vmx/tdx.c
> > > > > > +++ b/arch/x86/kvm/vmx/tdx.c
> > > > > > @@ -3115,37 +3115,26 @@ struct tdx_gmem_post_populate_arg {
> > > > > >  };
> > > > > >  
> > > > > >  static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> > > > > > -				  void __user *src, int order, void *_arg)
> > > > > > +				  struct page **src_pages, loff_t src_offset,
> > > > > > +				  int order, void *_arg)
> > > > > >  {
> > > > > >  	struct tdx_gmem_post_populate_arg *arg = _arg;
> > > > > >  	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> > > > > >  	u64 err, entry, level_state;
> > > > > >  	gpa_t gpa = gfn_to_gpa(gfn);
> > > > > > -	struct page *src_page;
> > > > > >  	int ret, i;
> > > > > >  
> > > > > >  	if (KVM_BUG_ON(kvm_tdx->page_add_src, kvm))
> > > > > >  		return -EIO;
> > > > > >  
> > > > > > -	if (KVM_BUG_ON(!PAGE_ALIGNED(src), kvm))
> > > > > > +	/* Source should be page-aligned, in which case src_offset will be 0. */
> > > > > > +	if (KVM_BUG_ON(src_offset))
> > > > > 	if (KVM_BUG_ON(src_offset, kvm))
> > > > > 
> > > > > >  		return -EINVAL;
> > > > > >  
> > > > > > -	/*
> > > > > > -	 * Get the source page if it has been faulted in. Return failure if the
> > > > > > -	 * source page has been swapped out or unmapped in primary memory.
> > > > > > -	 */
> > > > > > -	ret = get_user_pages_fast((unsigned long)src, 1, 0, &src_page);
> > > > > > -	if (ret < 0)
> > > > > > -		return ret;
> > > > > > -	if (ret != 1)
> > > > > > -		return -ENOMEM;
> > > > > > -
> > > > > > -	kvm_tdx->page_add_src = src_page;
> > > > > > +	kvm_tdx->page_add_src = src_pages[i];
> > > > > src_pages[0] ? i is not initialized. 
> > > > 
> > > > Sorry, I switched on TDX options for compile testing but I must have done a
> > > > sloppy job confirming it actually built. I'll re-test, push these, and squash
> > > > in the fixes in the github tree.
> > > > 
> > > > > 
> > > > > Should there also be a KVM_BUG_ON(order > 0, kvm) ?
> > > > 
> > > > Seems reasonable, but I'm not sure this is the right patch. Maybe I
> > > > could squash it into the preceding documentation patch so as to not
> > > > give the impression this patch changes those expectations in any way.
> > > I don't think it should be documented as a user requirement.
> > 
> > I didn't necessarily mean in the documentation, but mainly some patch
> > other than this. If we add that check here as part of this patch, we
> > give the impression that the order expectations are changing as a result
> > of the changes here, when in reality they are exactly the same as
> > before.
> > 
> > If not the documentation patch here, then I don't think it really fits
> > in this series at all and would be more of a standalone patch against
> > kvm/next.
> > 
> > The change here:
> > 
> >  -	if (KVM_BUG_ON(!PAGE_ALIGNED(src), kvm))
> >  +	/* Source should be page-aligned, in which case src_offset will be 0. */
> >  +	if (KVM_BUG_ON(src_offset))
> > 
> > made sense as part of this patch, because now that we are passing struct
> > page *src_pages, we can no longer infer alignment from 'src' field, and
> > instead need to infer it from src_offset being 0.
> > 
> > > 
> > > However, we need to add a comment noting that this assertion holds because
> > > tdx_vcpu_init_mem_region() passes npages as 1 to kvm_gmem_populate().
> > 
> > You mean for the KVM_BUG_ON(order > 0, kvm) you're proposing to add?
> > Again, it feels awkward to address this as part of this series since it
> > is an existing/unchanged behavior and not really the intent of this
> > patchset.
> That's true. src_pages[0] just makes it more eye-catching.
> What about just adding a comment for src_pages[0] instead of KVM_BUG_ON()?

That seems fair/relevant for this series.

> 
> > > > > >  	ret = kvm_tdp_mmu_map_private_pfn(arg->vcpu, gfn, pfn);
> > > > > >  	kvm_tdx->page_add_src = NULL;
> > > > > >  
> > > > > > -	put_page(src_page);
> > > > > > -
> > > > > >  	if (ret || !(arg->flags & KVM_TDX_MEASURE_MEMORY_REGION))
> > > > > >  		return ret;
> > > > > >  
> > > > > > diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> > > > > > index d93f75b05ae2..7e9d2403c61f 100644
> > > > > > --- a/include/linux/kvm_host.h
> > > > > > +++ b/include/linux/kvm_host.h
> > > > > > @@ -2581,7 +2581,8 @@ int kvm_arch_gmem_prepare(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn, int max_ord
> > > > > >   * Returns the number of pages that were populated.
> > > > > >   */
> > > > > >  typedef int (*kvm_gmem_populate_cb)(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
> > > > > > -				    void __user *src, int order, void *opaque);
> > > > > > +				    struct page **src_pages, loff_t src_offset,
> > > > > > +				    int order, void *opaque);
> > > > > >  
> > > > > >  long kvm_gmem_populate(struct kvm *kvm, gfn_t gfn, void __user *src, long npages,
> > > > > >  		       kvm_gmem_populate_cb post_populate, void *opaque);
> > > > > > diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
> > > > > > index 9160379df378..e9ac3fd4fd8f 100644
> > > > > > --- a/virt/kvm/guest_memfd.c
> > > > > > +++ b/virt/kvm/guest_memfd.c
> > > > > > @@ -814,14 +814,17 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
> > > > > >  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
> > > > > >  
> > > > > >  #ifdef CONFIG_HAVE_KVM_ARCH_GMEM_POPULATE
> > > > > > +
> > > > > > +#define GMEM_GUP_NPAGES (1UL << PMD_ORDER)
> > > > > Limiting GMEM_GUP_NPAGES to PMD_ORDER may only work when the max_order of a huge
> > > > > folio is 2MB. What if the max_order returned from  __kvm_gmem_get_pfn() is 1GB
> > > > > when src_pages[] can only hold up to 512 pages?
> > > > 
> > > > This wasn't necessarily chosen in prep for hugepages, but more about my
> > > > unease at letting userspace GUP arbitrarily large ranges. PMD_ORDER
> > > > happens to align with 2MB hugepages while seeming like a reasonable
> > > > batching value so that's why I chose it.
> > > >
> > > > Even with 1GB support, I wasn't really planning to increase it. SNP
> > > > doesn't really make use of RMP sizes >2MB, and it sounds like TDX
> > > > handles promotion in a completely different path. So atm I'm leaning
> > > > toward just letting GMEM_GUP_NPAGES be the cap for the max page size we
> > > > support for kvm_gmem_populate() path and not bothering to change it
> > > > until a solid use-case arises.
> > > The problem is that with hugetlb-based guest_memfd, the folio itself could be
> > > of 1GB, though SNP and TDX can force mapping at only 4KB.
> > 
> > If TDX wants to offload handling of page-clearing to its per-page
> > post-populate callback and tie that to its shared/private tracking, that's
> > perfectly fine by me.
> > 
> > *How* TDX tells gmem it wants this different behavior is a topic for a
> > follow-up patchset, Vishal suggested kernel-internal flags to
> > kvm_gmem_create(), which seemed reasonable to me. In that case, uptodate
> Not sure which flag you are referring to with "Vishal suggested kernel-internal
> flags to kvm_gmem_create()".
> 
> However, my point is that when the backend folio is 1GB in size (leading to
> max_order being PUD_ORDER), even if SNP only maps at 2MB to RMP, it may hit the
> warning of "!IS_ALIGNED(gfn, 1 << max_order)".

I think I've had to remove that warning every time I start working on
some new spin of THP/hugetlbfs-based SNP. I'm not objecting to that. But it's
obvious there, in those contexts, and I can explain exactly why it's being
removed.

It's not obvious in this series, where all we have are hand-wavy thoughts
about what hugepages will look like. For all we know we might decide that
kvm_gmem_populate() path should just pre-split hugepages to make all the
logic easier, or we decide to lock it in at 4K-only and just strip all the
hugepage stuff out. I don't really know, and this doesn't seem like the place
to try to hash all that out when nothing in this series will cause this
existing WARN_ON to be tripped.

> 
> For TDX, it's worse because it always passes npages as 1, so it will also hit
> the warning of "(npages - i) < (1 << max_order)".
> 
> Given that this patch already considers huge pages for SNP, it feels half-baked
> to leave the WARN_ON() for future handling.
>     WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
>             (npages - i) < (1 << max_order));
> 
> > flag would probably just default to set and punt to post-populate/prep
> > hooks, because we absolutely *do not* want to have to re-introduce per-4K
> > tracking of this type of state within gmem, since getting rid of that sort
> > of tracking requirement within gmem is the entire motivation of this
> > series. And since, within this series, the uptodate flag and
> > prep-tracking both have the same 4K granularity, it seems unnecessary to
> > address this here.
> > 
> > If you were to send a patchset on top of this (or even independently) that
> > introduces said kernel-internal gmem flag to offload uptodate-tracking to
> > post-populate/prep hooks, and utilize it to optimize the current 4K-only
> > TDX implementation by letting TDX module handle the initial
> > page-clearing, then I think that change/discussion can progress without
> > being blocked in any major way by this series.
> > 
> > But I don't think we need to flesh all that out here, so long as we are
> > aware of this as a future change/requirement and have reasonable
> > indication that it is compatible with this series.
> > 
> > > 
> > > Then since max_order = folio_order(folio) (at least in the tree for [1]), 
> > > WARN_ON() in kvm_gmem_populate() could still be hit:
> > > 
> > > folio = __kvm_gmem_get_pfn(file, slot, index, &pfn, &max_order);
> > > WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> > >         (npages - i) < (1 << max_order));
> > 
> > Yes, in the SNP implementation of hugetlb I ended up removing this
> > warning, and in that case I also ended up forcing kvm_gmem_populate() to
> > be 4K-only:
> > 
> >   https://github.com/AMDESE/linux/blob/snp-hugetlb-v2-wip0/virt/kvm/guest_memfd.c#L2372
> 
> For 1G (aka HugeTLB) page, this fix is also needed, which was missed in [1] and
> I pointed out to Ackerley at [2].
> 
> [1] https://github.com/googleprodkernel/linux-cc/tree/gmem-1g-page-support-rfc-v2
> [2] https://lore.kernel.org/all/aFPGPVbzo92t565h@yzhao56-desk.sh.intel.com/

Yes, we'll likely need some kind of change here.

I think, if we're trying to find common ground to build hugepage support
on, you can assume this will be removed. But I just don't think we need
to squash that into this series in order to make progress on those ends.

> 
> > but it makes a lot more sense to make those restrictions and changes in
> > the context of hugepage support, rather than this series which is trying
> > very hard to not do hugepage enablement, but simply keep what's partially
> > there intact while reworking other things that have proven to be
> > continued impediments to both in-place conversion and hugepage
> > enablement.
> Not sure how fixing the warning in this series could impede hugepage enabling :)
> 
> But if you prefer, I don't mind keeping it locally for longer.

It's the whole burden of needing to anticipate hugepage design, while it
is in a state of potentially massive flux just before LPC, in order to
make tiny incremental progress toward enabling in-place conversion,
which is something I think we can get upstream much sooner. Look at your
changelog for the change above, for instance: it has no relevance in the
context of this series. What do I put in its place? Bug reports about
my experimental tree? It's just not the right place to try to justify
these changes.

And most of this weirdness stems from the fact that we prematurely added
partial hugepage enablement to begin with. Let's not repeat these mistakes,
and address changes in the proper context where we know they make sense.

I considered stripping out the existing hugepage support as a pre-patch
to avoid leaving these uncertainties in place while we are reworking
things, but it felt like needless churn. But that's where I'm coming
from with this series: let's get in-place conversion landed, get the API
fleshed out, get it working, and then re-assess hugepages with all these
common/intersecting bits out of the way. If we can remove some obstacles
for hugepages as part of that, great, but that is not the main intent
here.

-Mike

> 
> > Also, there's talk now of enabling hugepages even without in-place
> > conversion for hugetlbfs, and that will likely be the same path we
> > follow for THP to remain in alignment. Rather than anticipating what all
> > these changes will mean WRT hugepage implementation/requirements, I
> > think it will be fruitful to remove some of the baggage that will
> > complicate that process/discussion like this patchset attempts.
> > 
> > -Mike
> > 
> > > 
> > > TDX is even easier to hit this warning because it always passes npages as 1.
> > > 
> > > [1] https://lore.kernel.org/all/cover.1747264138.git.ackerleytng@google.com
> > > 
> > >  
> > > > > > Increasing GMEM_GUP_NPAGES to (1UL << PUD_ORDER) is probably not a good idea.
> > > > > 
> > > > > Given both TDX/SNP map at 4KB granularity, why not just invoke post_populate()
> > > > > per 4KB while removing the max_order from post_populate() parameters, as done
> > > > > in Sean's sketch patch [1]?
> > > > 
> > > > That's an option too, but SNP can make use of 2MB pages in the
> > > > post-populate callback so I don't want to shut the door on that option
> > > > just yet if it's not too much of a pain to work in. Given the guest BIOS
> > > > lives primarily in 1 or 2 of these 2MB regions the benefits might be
> > > > worthwhile, and SNP doesn't have a post-post-populate promotion path
> > > > like TDX (at least, not one that would help much for guest boot times)
> > > I see.
> > > 
> > > So, what about below change?
> > > 
> > > --- a/virt/kvm/guest_memfd.c
> > > +++ b/virt/kvm/guest_memfd.c
> > > @@ -878,11 +878,10 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > >                 }
> > > 
> > >                 folio_unlock(folio);
> > > -               WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> > > -                       (npages - i) < (1 << max_order));
> > > 
> > >                 ret = -EINVAL;
> > > -               while (!kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
> > > +               while (!IS_ALIGNED(gfn, 1 << max_order) || (npages - i) < (1 << max_order) ||
> > > +                      !kvm_range_has_memory_attributes(kvm, gfn, gfn + (1 << max_order),
> > >                                                         KVM_MEMORY_ATTRIBUTE_PRIVATE,
> > >                                                         KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
> > >                         if (!max_order)
> > > 
> > > 
> > > 
> > > > 
> > > > > 
> > > > > Then the WARN_ON() in kvm_gmem_populate() can be removed, which would be easily
> > > > > triggered by TDX when max_order > 0 && npages == 1:
> > > > > 
> > > > >       WARN_ON(!IS_ALIGNED(gfn, 1 << max_order) ||
> > > > >               (npages - i) < (1 << max_order));
> > > > > 
> > > > > 
> > > > > [1] https://lore.kernel.org/all/aHEwT4X0RcfZzHlt@google.com/
> > > > > 
> > > > > >  long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
> > > > > >  		       kvm_gmem_populate_cb post_populate, void *opaque)
> > > > > >  {
> > > > > >  	struct kvm_memory_slot *slot;
> > > > > > -	void __user *p;
> > > > > > -
> > > > > > +	struct page **src_pages;
> > > > > >  	int ret = 0, max_order;
> > > > > > -	long i;
> > > > > > +	loff_t src_offset = 0;
> > > > > > +	long i, src_npages;
> > > > > >  
> > > > > >  	lockdep_assert_held(&kvm->slots_lock);
> > > > > >  
> > > > > > @@ -836,9 +839,28 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > > > >  	if (!file)
> > > > > >  		return -EFAULT;
> > > > > >  
> > > > > > +	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
> > > > > > +	npages = min_t(ulong, npages, GMEM_GUP_NPAGES);
> > > > > > +
> > > > > > +	if (src) {
> > > > > > +		src_npages = IS_ALIGNED((unsigned long)src, PAGE_SIZE) ? npages : npages + 1;
> > > > > > +
> > > > > > +		src_pages = kmalloc_array(src_npages, sizeof(struct page *), GFP_KERNEL);
> > > > > > +		if (!src_pages)
> > > > > > +			return -ENOMEM;
> > > > > > +
> > > > > > +		ret = get_user_pages_fast((unsigned long)src, src_npages, 0, src_pages);
> > > > > > +		if (ret < 0)
> > > > > > +			return ret;
> > > > > > +
> > > > > > +		if (ret != src_npages)
> > > > > > +			return -ENOMEM;
> > > > > > +
> > > > > > +		src_offset = (loff_t)(src - PTR_ALIGN_DOWN(src, PAGE_SIZE));
> > > > > > +	}
> > > > > > +
> > > > > >  	filemap_invalidate_lock(file->f_mapping);
> > > > > >  
> > > > > > -	npages = min_t(ulong, slot->npages - (start_gfn - slot->base_gfn), npages);
> > > > > >  	for (i = 0; i < npages; i += (1 << max_order)) {
> > > > > >  		struct folio *folio;
> > > > > >  		gfn_t gfn = start_gfn + i;
> > > > > > @@ -869,8 +891,8 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > > > >  			max_order--;
> > > > > >  		}
> > > > > >  
> > > > > > -		p = src ? src + i * PAGE_SIZE : NULL;
> > > > > > -		ret = post_populate(kvm, gfn, pfn, p, max_order, opaque);
> > > > > > +		ret = post_populate(kvm, gfn, pfn, src ? &src_pages[i] : NULL,
> > > > > > +				    src_offset, max_order, opaque);
> > > > > Why src_offset is not 0 starting from the 2nd page?
> > > > > 
> > > > > >  		if (!ret)
> > > > > >  			folio_mark_uptodate(folio);
> > > > > >  
> > > > > > @@ -882,6 +904,14 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
> > > > > >  
> > > > > >  	filemap_invalidate_unlock(file->f_mapping);
> > > > > >  
> > > > > > +	if (src) {
> > > > > > +		long j;
> > > > > > +
> > > > > > +		for (j = 0; j < src_npages; j++)
> > > > > > +			put_page(src_pages[j]);
> > > > > > +		kfree(src_pages);
> > > > > > +	}
> > > > > > +
> > > > > >  	return ret && !i ? ret : i;
> > > > > >  }
> > > > > >  EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
> > > > > > -- 
> > > > > > 2.25.1
> > > > > >