[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com>
Date: Tue, 10 Sep 2024 23:43:57 +0000
From: Ackerley Tng <ackerleytng@...gle.com>
To: tabba@...gle.com, quic_eberman@...cinc.com, roypat@...zon.co.uk,
jgg@...dia.com, peterx@...hat.com, david@...hat.com, rientjes@...gle.com,
fvdl@...gle.com, jthoughton@...gle.com, seanjc@...gle.com,
pbonzini@...hat.com, zhiquan1.li@...el.com, fan.du@...el.com,
jun.miao@...el.com, isaku.yamahata@...el.com, muchun.song@...ux.dev,
mike.kravetz@...cle.com
Cc: erdemaktas@...gle.com, vannapurve@...gle.com, ackerleytng@...gle.com,
qperret@...gle.com, jhubbard@...dia.com, willy@...radead.org,
shuah@...nel.org, brauner@...nel.org, bfoster@...hat.com,
kent.overstreet@...ux.dev, pvorel@...e.cz, rppt@...nel.org,
richard.weiyang@...il.com, anup@...infault.org, haibo1.xu@...el.com,
ajones@...tanamicro.com, vkuznets@...hat.com, maciej.wieczor-retman@...el.com,
pgonda@...gle.com, oliver.upton@...ux.dev, linux-kernel@...r.kernel.org,
linux-mm@...ck.org, kvm@...r.kernel.org, linux-kselftest@...r.kernel.org,
linux-fsdevel@...ck.org
Subject: [RFC PATCH 26/39] KVM: guest_memfd: Track faultability within a
struct kvm_gmem_private
The faultability xarray is stored on the inode since faultability is a
property of the guest_memfd's memory contents.
In this RFC, presence of an entry in the xarray indicates faultable,
but this could be flipped so that presence indicates unfaultable. For
flexibility, a special value "FAULT" is used instead of a simple
boolean.
Storing entries only for faultable pages saves memory when faultable
(shared) pages are the minority. However, at some stages of a VM's
lifecycle there could be more private pages, and at other stages there
could be more shared pages, so neither polarity is always optimal.
This is likely to be replaced by a better data structure in a future
revision to better support ranges.
Also, struct kvm_gmem_hugetlb is now stored in struct
kvm_gmem_inode_private as a pointer, and struct kvm_gmem_inode_private
is stored at inode->i_mapping->i_private_data.
Co-developed-by: Fuad Tabba <tabba@...gle.com>
Signed-off-by: Fuad Tabba <tabba@...gle.com>
Co-developed-by: Ackerley Tng <ackerleytng@...gle.com>
Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
Co-developed-by: Vishal Annapurve <vannapurve@...gle.com>
Signed-off-by: Vishal Annapurve <vannapurve@...gle.com>
---
virt/kvm/guest_memfd.c | 105 ++++++++++++++++++++++++++++++++++++-----
1 file changed, 94 insertions(+), 11 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 8151df2c03e5..b603518f7b62 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -26,11 +26,21 @@ struct kvm_gmem_hugetlb {
struct hugepage_subpool *spool;
};
-static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+struct kvm_gmem_inode_private {
+ struct xarray faultability;
+ struct kvm_gmem_hugetlb *hgmem;
+};
+
+static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
{
return inode->i_mapping->i_private_data;
}
+static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+{
+ return kvm_gmem_private(inode)->hgmem;
+}
+
static bool is_kvm_gmem_hugetlb(struct inode *inode)
{
u64 flags = (u64)inode->i_private;
@@ -38,6 +48,57 @@ static bool is_kvm_gmem_hugetlb(struct inode *inode)
return flags & KVM_GUEST_MEMFD_HUGETLB;
}
+#define KVM_GMEM_FAULTABILITY_VALUE 0x4641554c54 /* FAULT */
+
+/**
+ * Set faultability of given range of inode indices [@start, @end) to
+ * @faultable. Return 0 if attributes were successfully updated or negative
+ * errno on error.
+ */
+static int kvm_gmem_set_faultable(struct inode *inode, pgoff_t start, pgoff_t end,
+ bool faultable)
+{
+ struct xarray *faultability;
+ void *val;
+ pgoff_t i;
+
+	/*
+	 * The expectation is that fewer pages are faultable, hence, to save
+	 * memory, entries are created for faultable pages as opposed to
+	 * creating entries for non-faultable pages.
+	 */
+ val = faultable ? xa_mk_value(KVM_GMEM_FAULTABILITY_VALUE) : NULL;
+ faultability = &kvm_gmem_private(inode)->faultability;
+
+ /*
+ * TODO replace this with something else (maybe interval
+ * tree?). store_range doesn't quite do what we expect if overlapping
+ * ranges are specified: if we store_range(5, 10, val) and then
+ * store_range(7, 12, NULL), the entire range [5, 12] will be NULL. For
+ * now, use the slower xa_store() to store individual entries on indices
+ * to avoid this.
+ */
+ for (i = start; i < end; i++) {
+ int r;
+
+ r = xa_err(xa_store(faultability, i, val, GFP_KERNEL_ACCOUNT));
+ if (r)
+ return r;
+ }
+
+ return 0;
+}
+
+/**
+ * Return true if the page at @index is allowed to be faulted in.
+ */
+static bool kvm_gmem_is_faultable(struct inode *inode, pgoff_t index)
+{
+ struct xarray *faultability = &kvm_gmem_private(inode)->faultability;
+
+ return xa_to_value(xa_load(faultability, index)) == KVM_GMEM_FAULTABILITY_VALUE;
+}
+
/**
* folio_file_pfn - like folio_file_page, but return a pfn.
* @folio: The folio which contains this index.
@@ -895,11 +956,21 @@ static void kvm_gmem_hugetlb_teardown(struct inode *inode)
static void kvm_gmem_evict_inode(struct inode *inode)
{
+ struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
+
+ /*
+ * .evict_inode can be called before faultability is set up if there are
+ * issues during inode creation.
+ */
+ if (private)
+ xa_destroy(&private->faultability);
+
if (is_kvm_gmem_hugetlb(inode))
kvm_gmem_hugetlb_teardown(inode);
else
truncate_inode_pages_final(inode->i_mapping);
+ kfree(private);
clear_inode(inode);
}
@@ -1028,7 +1099,9 @@ static const struct inode_operations kvm_gmem_iops = {
.setattr = kvm_gmem_setattr,
};
-static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
+static int kvm_gmem_hugetlb_setup(struct inode *inode,
+ struct kvm_gmem_inode_private *private,
+ loff_t size, u64 flags)
{
struct kvm_gmem_hugetlb *hgmem;
struct hugepage_subpool *spool;
@@ -1036,6 +1109,10 @@ static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
struct hstate *h;
long hpages;
+ hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
+ if (!hgmem)
+ return -ENOMEM;
+
page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) & KVM_GUEST_MEMFD_HUGE_MASK;
h = hstate_sizelog(page_size_log);
@@ -1046,21 +1123,16 @@ static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
if (!spool)
goto err;
- hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
- if (!hgmem)
- goto err_subpool;
-
inode->i_blkbits = huge_page_shift(h);
hgmem->h = h;
hgmem->spool = spool;
- inode->i_mapping->i_private_data = hgmem;
+ private->hgmem = hgmem;
return 0;
-err_subpool:
- kfree(spool);
err:
+ kfree(hgmem);
return -ENOMEM;
}
@@ -1068,6 +1140,7 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
loff_t size, u64 flags)
{
const struct qstr qname = QSTR_INIT(name, strlen(name));
+ struct kvm_gmem_inode_private *private;
struct inode *inode;
int err;
@@ -1079,12 +1152,20 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
if (err)
goto out;
+ err = -ENOMEM;
+ private = kzalloc(sizeof(*private), GFP_KERNEL);
+ if (!private)
+ goto out;
+
if (flags & KVM_GUEST_MEMFD_HUGETLB) {
- err = kvm_gmem_hugetlb_setup(inode, size, flags);
+ err = kvm_gmem_hugetlb_setup(inode, private, size, flags);
if (err)
- goto out;
+ goto free_private;
}
+ xa_init(&private->faultability);
+ inode->i_mapping->i_private_data = private;
+
inode->i_private = (void *)(unsigned long)flags;
inode->i_op = &kvm_gmem_iops;
inode->i_mapping->a_ops = &kvm_gmem_aops;
@@ -1097,6 +1178,8 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
return inode;
+free_private:
+ kfree(private);
out:
iput(inode);
--
2.46.0.598.g6f2099f65c-goog
Powered by blists - more mailing lists