Message-ID: <20251007221420.344669-6-seanjc@google.com>
Date: Tue, 7 Oct 2025 15:14:13 -0700
From: Sean Christopherson <seanjc@...gle.com>
To: Marc Zyngier <maz@...nel.org>, Oliver Upton <oliver.upton@...ux.dev>,
Paolo Bonzini <pbonzini@...hat.com>, Sean Christopherson <seanjc@...gle.com>
Cc: linux-arm-kernel@...ts.infradead.org, kvmarm@...ts.linux.dev,
kvm@...r.kernel.org, linux-kernel@...r.kernel.org,
David Hildenbrand <david@...hat.com>, Fuad Tabba <tabba@...gle.com>,
Ackerley Tng <ackerleytng@...gle.com>, Shivank Garg <shivankg@....com>,
Ashish Kalra <ashish.kalra@....com>, Vlastimil Babka <vbabka@...e.cz>
Subject: [PATCH v12 05/12] KVM: guest_memfd: Enforce NUMA mempolicy using
shared policy
From: Shivank Garg <shivankg@....com>

Previously, guest_memfd allocations followed the local NUMA node id in the
absence of a process mempolicy, resulting in arbitrary memory placement.
Moreover, mbind() couldn't be used by the VMM because guest memory wasn't
mapped into userspace when the allocation occurred.

Enable NUMA policy support by implementing vm_ops for the guest_memfd mmap
operation. This allows the VMM to map the memory and use mbind() to set the
desired NUMA policy. The policy is stored in the inode structure via
struct gmem_inode, as the memory policy is a property of the memory (struct
inode) itself. The policy is then retrieved via mpol_shared_policy_lookup()
and passed to __filemap_get_folio_mpol() to ensure that allocations follow
the specified memory policy.

This enables the VMM to control guest memory NUMA placement by calling
mbind() on the mapped memory regions, providing fine-grained control over
guest memory allocation across NUMA nodes.
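
For illustration only, a minimal userspace sketch of that flow, assuming a
guest_memfd fd created with mmap support enabled (per this series); the
helper name and the choice of NUMA node 1 are made up for the example:

	#include <sys/mman.h>
	#include <numaif.h>	/* mbind(), MPOL_BIND (libnuma headers) */

	/*
	 * Map a guest_memfd range and bind it to NUMA node 1; only
	 * allocations made after this call honor the policy (see below).
	 */
	static int bind_gmem_to_node1(int gmem_fd, size_t size)
	{
		unsigned long nodemask = 1UL << 1;	/* node 1 */
		void *mem;

		mem = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED,
			   gmem_fd, 0);
		if (mem == MAP_FAILED)
			return -1;

		/* No MPOL_MF_MOVE*: guest_memfd memory is unmovable. */
		return mbind(mem, size, MPOL_BIND, &nodemask,
			     sizeof(nodemask) * 8, 0);
	}
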
The policy change only affects future allocations and does not migrate
existing memory. This matches mbind(2)'s default behavior, which affects
only new allocations unless overridden with the MPOL_MF_MOVE/MPOL_MF_MOVE_ALL
flags, which are not supported for guest_memfd as its memory is unmovable.

Suggested-by: David Hildenbrand <david@...hat.com>
Acked-by: David Hildenbrand <david@...hat.com>
Acked-by: Vlastimil Babka <vbabka@...e.cz>
Signed-off-by: Shivank Garg <shivankg@....com>
Tested-by: Ashish Kalra <ashish.kalra@....com>
[sean: document the ABI impact of the ->get_policy() hook]
Signed-off-by: Sean Christopherson <seanjc@...gle.com>
---
virt/kvm/guest_memfd.c | 69 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 67 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index cc3b25155726..95267c92983b 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -4,6 +4,7 @@
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/mempolicy.h>
#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
@@ -27,6 +28,7 @@ struct gmem_file {
};
struct gmem_inode {
+ struct shared_policy policy;
struct inode vfs_inode;
};
@@ -112,6 +114,19 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
return r;
}
+static struct mempolicy *kvm_gmem_get_folio_policy(struct gmem_inode *gi,
+ pgoff_t index)
+{
+#ifdef CONFIG_NUMA
+ struct mempolicy *mpol;
+
+ mpol = mpol_shared_policy_lookup(&gi->policy, index);
+ return mpol ? mpol : get_task_policy(current);
+#else
+ return NULL;
+#endif
+}
+
/*
* Returns a locked folio on success. The caller is responsible for
* setting the up-to-date flag before the memory is mapped into the guest.
@@ -124,7 +139,25 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
/* TODO: Support huge pages. */
- return filemap_grab_folio(inode->i_mapping, index);
+ struct mempolicy *policy;
+ struct folio *folio;
+
+ /*
+ * Fast-path: See if folio is already present in mapping to avoid
+ * policy_lookup.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (!IS_ERR(folio))
+ return folio;
+
+ policy = kvm_gmem_get_folio_policy(GMEM_I(inode), index);
+ folio = __filemap_get_folio_mpol(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(inode->i_mapping), policy);
+ mpol_cond_put(policy);
+
+ return folio;
}
static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
@@ -413,8 +446,38 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
return ret;
}
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
+}
+
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *pgoff)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+
+ /*
+ * Note! Directly return whatever the lookup returns, do NOT return
+ * the current task's policy as is done when looking up the policy for
+ * a specific folio. Kernel ABI for get_mempolicy() is to return
+ * MPOL_DEFAULT when there is no defined policy, not whatever the
+ * default policy resolves to.
+ */
+ return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
+}
+#endif /* CONFIG_NUMA */
+
static const struct vm_operations_struct kvm_gmem_vm_ops = {
- .fault = kvm_gmem_fault_user_mapping,
+ .fault = kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_NUMA
+ .get_policy = kvm_gmem_get_policy,
+ .set_policy = kvm_gmem_set_policy,
+#endif
};
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
@@ -867,11 +930,13 @@ static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
if (!gi)
return NULL;
+ mpol_shared_policy_init(&gi->policy, NULL);
return &gi->vfs_inode;
}
static void kvm_gmem_destroy_inode(struct inode *inode)
{
+ mpol_free_shared_policy(&GMEM_I(inode)->policy);
}
static void kvm_gmem_free_inode(struct inode *inode)
--
2.51.0.710.ga91ca5db03-goog