lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <bd163de3118b626d1005aa88e71ef2fb72f0be0f.1726009989.git.ackerleytng@google.com>
Date: Tue, 10 Sep 2024 23:43:57 +0000
From: Ackerley Tng <ackerleytng@...gle.com>
To: tabba@...gle.com, quic_eberman@...cinc.com, roypat@...zon.co.uk, 
	jgg@...dia.com, peterx@...hat.com, david@...hat.com, rientjes@...gle.com, 
	fvdl@...gle.com, jthoughton@...gle.com, seanjc@...gle.com, 
	pbonzini@...hat.com, zhiquan1.li@...el.com, fan.du@...el.com, 
	jun.miao@...el.com, isaku.yamahata@...el.com, muchun.song@...ux.dev, 
	mike.kravetz@...cle.com
Cc: erdemaktas@...gle.com, vannapurve@...gle.com, ackerleytng@...gle.com, 
	qperret@...gle.com, jhubbard@...dia.com, willy@...radead.org, 
	shuah@...nel.org, brauner@...nel.org, bfoster@...hat.com, 
	kent.overstreet@...ux.dev, pvorel@...e.cz, rppt@...nel.org, 
	richard.weiyang@...il.com, anup@...infault.org, haibo1.xu@...el.com, 
	ajones@...tanamicro.com, vkuznets@...hat.com, maciej.wieczor-retman@...el.com, 
	pgonda@...gle.com, oliver.upton@...ux.dev, linux-kernel@...r.kernel.org, 
	linux-mm@...ck.org, kvm@...r.kernel.org, linux-kselftest@...r.kernel.org, 
	linux-fsdevel@...ck.org
Subject: [RFC PATCH 26/39] KVM: guest_memfd: Track faultability within a
 struct kvm_gmem_inode_private

The faultability xarray is stored on the inode since faultability is a
property of the guest_memfd's memory contents.

In this RFC, presence of an entry in the xarray indicates faultable,
but this could be flipped so that presence indicates unfaultable. For
flexibility, a special value "FAULT" is used instead of a simple
boolean.

However, neither choice is always the more compact one: at some stages
of a VM's lifecycle there could be more private pages, and at other
stages there could be more shared pages.

This is likely to be replaced by a better data structure in a future
revision to better support ranges.

Also store struct kvm_gmem_hugetlb as a pointer in struct
kvm_gmem_inode_private, which is itself stored in
inode->i_mapping->i_private_data.

Co-developed-by: Fuad Tabba <tabba@...gle.com>
Signed-off-by: Fuad Tabba <tabba@...gle.com>
Co-developed-by: Ackerley Tng <ackerleytng@...gle.com>
Signed-off-by: Ackerley Tng <ackerleytng@...gle.com>
Co-developed-by: Vishal Annapurve <vannapurve@...gle.com>
Signed-off-by: Vishal Annapurve <vannapurve@...gle.com>

---
 virt/kvm/guest_memfd.c | 105 ++++++++++++++++++++++++++++++++++++-----
 1 file changed, 94 insertions(+), 11 deletions(-)

diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index 8151df2c03e5..b603518f7b62 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -26,11 +26,21 @@ struct kvm_gmem_hugetlb {
 	struct hugepage_subpool *spool;
 };
 
-static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+struct kvm_gmem_inode_private {
+	struct xarray faultability;
+	struct kvm_gmem_hugetlb *hgmem;
+};
+
+static struct kvm_gmem_inode_private *kvm_gmem_private(struct inode *inode)
 {
 	return inode->i_mapping->i_private_data;
 }
 
+static struct kvm_gmem_hugetlb *kvm_gmem_hgmem(struct inode *inode)
+{
+	return kvm_gmem_private(inode)->hgmem;
+}
+
 static bool is_kvm_gmem_hugetlb(struct inode *inode)
 {
 	u64 flags = (u64)inode->i_private;
@@ -38,6 +48,57 @@ static bool is_kvm_gmem_hugetlb(struct inode *inode)
 	return flags & KVM_GUEST_MEMFD_HUGETLB;
 }
 
+#define KVM_GMEM_FAULTABILITY_VALUE 0x4641554c54  /* FAULT */
+
+/**
+ * Set faultability of given range of inode indices [@start, @end) to
+ * @faultable. Return 0 if attributes were successfully updated or negative
+ * errno on error.
+ */
+static int kvm_gmem_set_faultable(struct inode *inode, pgoff_t start, pgoff_t end,
+				  bool faultable)
+{
+	struct xarray *faultability;
+	void *val;
+	pgoff_t i;
+
+	/*
+	 * The expectation is that fewer pages are faultable, hence, to save
+	 * memory, entries are created for faultable pages as opposed to
+	 * creating entries for non-faultable pages.
+	 */
+	val = faultable ? xa_mk_value(KVM_GMEM_FAULTABILITY_VALUE) : NULL;
+	faultability = &kvm_gmem_private(inode)->faultability;
+
+	/*
+	 * TODO replace this with something else (maybe interval
+	 * tree?). store_range doesn't quite do what we expect if overlapping
+	 * ranges are specified: if we store_range(5, 10, val) and then
+	 * store_range(7, 12, NULL), the entire range [5, 12] will be NULL.  For
+	 * now, use the slower xa_store() to store individual entries on indices
+	 * to avoid this.
+	 */
+	for (i = start; i < end; i++) {
+		int r;
+
+		r = xa_err(xa_store(faultability, i, val, GFP_KERNEL_ACCOUNT));
+		if (r)
+			return r;
+	}
+
+	return 0;
+}
+
+/**
+ * Return true if the page at @index is allowed to be faulted in.
+ */
+static bool kvm_gmem_is_faultable(struct inode *inode, pgoff_t index)
+{
+	struct xarray *faultability = &kvm_gmem_private(inode)->faultability;
+
+	return xa_to_value(xa_load(faultability, index)) == KVM_GMEM_FAULTABILITY_VALUE;
+}
+
 /**
  * folio_file_pfn - like folio_file_page, but return a pfn.
  * @folio: The folio which contains this index.
@@ -895,11 +956,21 @@ static void kvm_gmem_hugetlb_teardown(struct inode *inode)
 
 static void kvm_gmem_evict_inode(struct inode *inode)
 {
+	struct kvm_gmem_inode_private *private = kvm_gmem_private(inode);
+
+	/*
+	 * .evict_inode can be called before faultability is set up if there are
+	 * issues during inode creation.
+	 */
+	if (private)
+		xa_destroy(&private->faultability);
+
 	if (is_kvm_gmem_hugetlb(inode))
 		kvm_gmem_hugetlb_teardown(inode);
 	else
 		truncate_inode_pages_final(inode->i_mapping);
 
+	kfree(private);
 	clear_inode(inode);
 }
 
@@ -1028,7 +1099,9 @@ static const struct inode_operations kvm_gmem_iops = {
 	.setattr	= kvm_gmem_setattr,
 };
 
-static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
+static int kvm_gmem_hugetlb_setup(struct inode *inode,
+				  struct kvm_gmem_inode_private *private,
+				  loff_t size, u64 flags)
 {
 	struct kvm_gmem_hugetlb *hgmem;
 	struct hugepage_subpool *spool;
@@ -1036,6 +1109,10 @@ static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
 	struct hstate *h;
 	long hpages;
 
+	hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
+	if (!hgmem)
+		return -ENOMEM;
+
 	page_size_log = (flags >> KVM_GUEST_MEMFD_HUGE_SHIFT) & KVM_GUEST_MEMFD_HUGE_MASK;
 	h = hstate_sizelog(page_size_log);
 
@@ -1046,21 +1123,16 @@ static int kvm_gmem_hugetlb_setup(struct inode *inode, loff_t size, u64 flags)
 	if (!spool)
 		goto err;
 
-	hgmem = kzalloc(sizeof(*hgmem), GFP_KERNEL);
-	if (!hgmem)
-		goto err_subpool;
-
 	inode->i_blkbits = huge_page_shift(h);
 
 	hgmem->h = h;
 	hgmem->spool = spool;
-	inode->i_mapping->i_private_data = hgmem;
 
+	private->hgmem = hgmem;
 	return 0;
 
-err_subpool:
-	kfree(spool);
 err:
+	kfree(hgmem);
 	return -ENOMEM;
 }
 
@@ -1068,6 +1140,7 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 						      loff_t size, u64 flags)
 {
 	const struct qstr qname = QSTR_INIT(name, strlen(name));
+	struct kvm_gmem_inode_private *private;
 	struct inode *inode;
 	int err;
 
@@ -1079,12 +1152,20 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 	if (err)
 		goto out;
 
+	err = -ENOMEM;
+	private = kzalloc(sizeof(*private), GFP_KERNEL);
+	if (!private)
+		goto out;
+
 	if (flags & KVM_GUEST_MEMFD_HUGETLB) {
-		err = kvm_gmem_hugetlb_setup(inode, size, flags);
+		err = kvm_gmem_hugetlb_setup(inode, private, size, flags);
 		if (err)
-			goto out;
+			goto free_private;
 	}
 
+	xa_init(&private->faultability);
+	inode->i_mapping->i_private_data = private;
+
 	inode->i_private = (void *)(unsigned long)flags;
 	inode->i_op = &kvm_gmem_iops;
 	inode->i_mapping->a_ops = &kvm_gmem_aops;
@@ -1097,6 +1178,8 @@ static struct inode *kvm_gmem_inode_make_secure_inode(const char *name,
 
 	return inode;
 
+free_private:
+	kfree(private);
 out:
 	iput(inode);
 
-- 
2.46.0.598.g6f2099f65c-goog


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ