Message-ID: <20250307005830.65293-5-ptyadav@amazon.de>
Date: Fri, 7 Mar 2025 00:57:38 +0000
From: Pratyush Yadav <ptyadav@...zon.de>
To: <linux-kernel@...r.kernel.org>
CC: Pratyush Yadav <ptyadav@...zon.de>, Jonathan Corbet <corbet@....net>,
"Eric Biederman" <ebiederm@...ssion.com>, Arnd Bergmann <arnd@...db.de>,
"Greg Kroah-Hartman" <gregkh@...uxfoundation.org>, Alexander Viro
<viro@...iv.linux.org.uk>, Christian Brauner <brauner@...nel.org>, Jan Kara
<jack@...e.cz>, Hugh Dickins <hughd@...gle.com>, Alexander Graf
<graf@...zon.com>, Benjamin Herrenschmidt <benh@...nel.crashing.org>, "David
Woodhouse" <dwmw2@...radead.org>, James Gowans <jgowans@...zon.com>, "Mike
Rapoport" <rppt@...nel.org>, Paolo Bonzini <pbonzini@...hat.com>, "Pasha
Tatashin" <tatashin@...gle.com>, Anthony Yznaga <anthony.yznaga@...cle.com>,
Dave Hansen <dave.hansen@...el.com>, David Hildenbrand <david@...hat.com>,
Jason Gunthorpe <jgg@...dia.com>, Matthew Wilcox <willy@...radead.org>, "Wei
Yang" <richard.weiyang@...il.com>, Andrew Morton <akpm@...ux-foundation.org>,
<linux-fsdevel@...r.kernel.org>, <linux-doc@...r.kernel.org>,
<linux-mm@...ck.org>, <kexec@...ts.infradead.org>
Subject: [RFC PATCH 4/5] mm: shmem: allow preserving file over FDBOX + KHO

For applications that hold large amounts of in-memory state which takes a
long time to rebuild, rebooting to consume a kernel upgrade can be very
expensive. FDBox allows preserving file descriptors over kexec using KHO.
Combining that with memfd gives such applications reboot-persistent memory
that they can use to quickly save and reconstruct their state.

Since memfd is backed by either hugetlbfs or shmem, use shmem as the
first memfd backend that is FDBOX + KHO capable.
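
For context, a minimal userspace sketch of the intended flow is shown
below. memfd_create() and the standard file syscalls are existing
interfaces; the step that actually hands the fd to FDBox uses the
interface added earlier in this series and is only hinted at in a
comment, since it is not part of this patch.

  #define _GNU_SOURCE
  #include <sys/mman.h>
  #include <unistd.h>

  static int save_state(const void *state, size_t len)
  {
          int fd = memfd_create("app-state", 0); /* no MFD_HUGETLB: shmem-backed */

          if (fd < 0)
                  return -1;
          if (ftruncate(fd, len) < 0 || write(fd, state, len) != (ssize_t)len) {
                  close(fd);
                  return -1;
          }
          /* Hand 'fd' to FDBox here (interface added elsewhere in this series). */
          return fd;
  }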

To preserve the file's contents during KHO activation, the file's page
cache must be walked, all entries removed, and their indices recorded.
Use the newly introduced shmem_undo_range_ops to achieve this. Walk each
entry and, before truncating it, take a reference on the folio so it does
not get freed, and store its physical address and index in the kho_mem
and indices arrays.
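
Condensed, the per-folio save step in the undo_folio callback below
boils down to the following (error handling omitted):

  /* Keep the folio alive past truncation and record where it lives. */
  folio_get(folio);
  mem->addr = PFN_PHYS(folio_pfn(folio));
  mem->size = PAGE_SIZE;
  arg->indices[arg->idx] = folio_index(folio);
  arg->idx++;
  truncate_inode_folio(mapping, folio);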

Swap pages, partial folios, and huge folios are not supported yet.
Encountering any of them results in an error.

On the restore side, an empty file is created and the mems array is then
walked to insert the pages back into the page cache. The logic roughly
follows shmem_alloc_and_add_folio().
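
Condensed, the per-folio restore sequence mirrors
shmem_alloc_and_add_folio() and looks roughly like this (error handling
omitted):

  folio_mark_uptodate(folio);
  folio_mark_dirty(folio);
  __folio_set_locked(folio);
  __folio_set_swapbacked(folio);
  mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
  shmem_add_to_page_cache(folio, mapping, indices[i], NULL,
                          mapping_gfp_mask(mapping));
  shmem_inode_acct_blocks(inode, 1);
  shmem_recalc_inode(inode, 1, 0);
  folio_add_lru(folio);
  folio_unlock(folio);
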
Signed-off-by: Pratyush Yadav <ptyadav@...zon.de>
---
include/linux/shmem_fs.h | 6 +
mm/shmem.c | 334 +++++++++++++++++++++++++++++++++
2 files changed, 340 insertions(+)
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 0b273a7b9f01d..263416f357fe1 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -205,6 +205,12 @@ extern int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
#endif /* CONFIG_SHMEM */
#endif /* CONFIG_USERFAULTFD */
+#if defined(CONFIG_FDBOX) && defined(CONFIG_KEXEC_HANDOVER)
+bool is_node_shmem(const void *fdt, int offset);
+int shmem_fdbox_kho_write(struct fdbox_fd *ffd, void *fdt);
+struct file *shmem_fdbox_kho_recover(const void *fdt, int offset);
+#endif
+
/*
* Used space is stored as unsigned 64-bit value in bytes but
* quota core supports only signed 64-bit values so use that
diff --git a/mm/shmem.c b/mm/shmem.c
index d6d9266b27b75..c2efdb34a1a18 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -41,6 +41,12 @@
#include <linux/swapfile.h>
#include <linux/iversion.h>
#include <linux/unicode.h>
+#include <linux/libfdt.h>
+#include <linux/fdbox.h>
+#include <linux/vmalloc.h>
+#include <linux/kexec.h>
+#include <linux/kexec_handover.h>
+#include <linux/cleanup.h>
#include "swap.h"
static struct vfsmount *shm_mnt __ro_after_init;
@@ -5283,6 +5289,334 @@ static int shmem_error_remove_folio(struct address_space *mapping,
return 0;
}
+#if defined(CONFIG_FDBOX) && defined(CONFIG_KEXEC_HANDOVER)
+static const char fdbox_kho_compatible[] = "fdbox,shmem-v1";
+
+bool is_node_shmem(const void *fdt, int offset)
+{
+ return fdt_node_check_compatible(fdt, offset, fdbox_kho_compatible) == 0;
+}
+
+struct shmem_fdbox_put_arg {
+ struct kho_mem *mems;
+ unsigned long *indices;
+ unsigned long nr_mems;
+ unsigned long idx;
+};
+
+static long shmem_fdbox_undo_swap(struct address_space *mapping, pgoff_t index,
+ void *old, void *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static int shmem_fdbox_undo_folio(struct address_space *mapping,
+ struct folio *folio, void *__arg)
+{
+ struct shmem_fdbox_put_arg *arg = __arg;
+ struct kho_mem *mem;
+
+ if (arg->idx == arg->nr_mems)
+ return -ENOSPC;
+
+ if (folio_nr_pages(folio) != 1)
+ return -EOPNOTSUPP;
+
+ /*
+ * Grab an extra refcount to the folio so it sticks around after
+ * truncation.
+ */
+ folio_get(folio);
+
+ mem = arg->mems + arg->idx;
+
+ mem->addr = PFN_PHYS(folio_pfn(folio));
+ mem->size = PAGE_SIZE;
+ arg->indices[arg->idx] = folio_index(folio);
+ arg->idx++;
+
+ truncate_inode_folio(mapping, folio);
+ return 0;
+}
+
+static int shmem_fdbox_undo_partial_folio(struct folio *folio, pgoff_t lstart,
+ pgoff_t lend, void *arg)
+{
+ return -EOPNOTSUPP;
+}
+
+static const struct shmem_undo_range_ops shmem_fdbox_undo_ops = {
+ .undo_swap = shmem_fdbox_undo_swap,
+ .undo_folio = shmem_fdbox_undo_folio,
+ .undo_partial_folio = shmem_fdbox_undo_partial_folio,
+};
+
+static struct kho_mem *shmem_fdbox_kho_get_mems(struct inode *inode,
+ unsigned long **indicesp,
+ unsigned long *nr)
+{
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ unsigned long *indices __free(kvfree) = NULL;
+ struct kho_mem *mems __free(kvfree) = NULL;
+ struct shmem_fdbox_put_arg arg;
+ unsigned long nr_mems;
+ int ret, i;
+
+ scoped_guard(spinlock, &info->lock) {
+ /* TODO: Support swapped pages. Perhaps swap them back in? */
+ if (info->swapped)
+ return ERR_PTR(-EOPNOTSUPP);
+
+ /*
+ * Estimate the size of the array using the size of the inode,
+ * assuming there are no contiguous pages.
+ */
+ nr_mems = info->alloced;
+ }
+
+ mems = kvmalloc_array(nr_mems, sizeof(*mems), GFP_KERNEL);
+ if (!mems)
+ return ERR_PTR(-ENOMEM);
+
+ indices = kvmalloc_array(nr_mems, sizeof(*indices), GFP_KERNEL);
+ if (!indices)
+ return ERR_PTR(-ENOMEM);
+
+ arg.mems = mems;
+ arg.indices = indices;
+ arg.nr_mems = nr_mems;
+ arg.idx = 0;
+
+ ret = shmem_undo_range(inode, 0, -1, false, &shmem_fdbox_undo_ops, &arg);
+ if (ret < 0) {
+ pr_err("shmem: failed to undo fdbox range: %d\n", ret);
+ goto err;
+ }
+
+ *nr = arg.idx;
+ *indicesp = no_free_ptr(indices);
+ return_ptr(mems);
+
+err:
+ /*
+ * TODO: This kills the whole file on failure to KHO. We should keep the
+ * contents around for another try later. The problem is, if re-adding
+ * pages fails, there would be no recovery at that point. Ideally, we
+ * should first serialize the whole file, and only then remove things
+ * from page cache so we are sure to never fail.
+ */
+ for (i = 0; i < arg.idx; i++) {
+ struct folio *folio = page_folio(phys_to_page(mems[i].addr));
+
+ folio_put(folio);
+ }
+
+ /* Undo the rest of the file. This should not fail. */
+ WARN_ON(shmem_undo_range(inode, 0, -1, false, &shmem_default_undo_ops, NULL));
+ return ERR_PTR(ret);
+}
+
+int shmem_fdbox_kho_write(struct fdbox_fd *box_fd, void *fdt)
+{
+ struct inode *inode = box_fd->file->f_inode;
+ unsigned long *indices __free(kvfree) = NULL;
+ struct kho_mem *mems __free(kvfree) = NULL;
+ u64 pos = box_fd->file->f_pos, size = inode->i_size;
+ unsigned long nr_mems, i;
+ int ret = 0;
+
+ /*
+ * mems can be larger than sizeof(*mems) * nr_mems, but we should only
+ * look at things in the range of 0 to nr_mems.
+ */
+ mems = shmem_fdbox_kho_get_mems(inode, &indices, &nr_mems);
+ if (IS_ERR(mems))
+ return PTR_ERR(mems);
+
+ /*
+ * fdbox should have already started the node. We can start adding
+ * properties directly.
+ */
+ ret |= fdt_property(fdt, "compatible", fdbox_kho_compatible,
+ sizeof(fdbox_kho_compatible));
+ ret |= fdt_property(fdt, "pos", &pos, sizeof(u64));
+ ret |= fdt_property(fdt, "size", &size, sizeof(u64));
+ ret |= fdt_property(fdt, "mem", mems, sizeof(*mems) * nr_mems);
+ ret |= fdt_property(fdt, "indices", indices, sizeof(*indices) * nr_mems);
+
+ if (ret) {
+ pr_err("shmem: failed to add properties to FDT!\n");
+ ret = -EINVAL;
+ goto err;
+ }
+
+ return 0;
+
+err:
+ /*
+ * TODO: This kills the whole file on failure to KHO. We should keep the
+ * contents around for another try later. The problem is, if re-adding
+ * pages fails, there would be no recovery at that point. Ideally, we
+ * should first serialize the whole file, and only then remove things
+ * from page cache so we are sure to never fail.
+ */
+ for (i = 0; i < nr_mems; i++) {
+ struct folio *folio = page_folio(phys_to_page(mems[i].addr));
+
+ folio_put(folio);
+ }
+ return ret;
+}
+
+struct file *shmem_fdbox_kho_recover(const void *fdt, int offset)
+{
+ struct address_space *mapping;
+ char pathbuf[1024] = "", *path;
+ const unsigned long *indices;
+ const struct kho_mem *mems;
+ unsigned long nr_mems, i = 0;
+ const u64 *pos, *size;
+ struct inode *inode;
+ struct file *file;
+ int len, ret;
+
+ ret = fdt_node_check_compatible(fdt, offset, fdbox_kho_compatible);
+ if (ret) {
+ pr_err("shmem: invalid compatible\n");
+ goto err;
+ }
+
+ mems = fdt_getprop(fdt, offset, "mem", &len);
+ if (!mems || len % sizeof(*mems)) {
+ pr_err("shmem: invalid mems property\n");
+ goto err;
+ }
+ nr_mems = len / sizeof(*mems);
+
+ indices = fdt_getprop(fdt, offset, "indices", &len);
+ if (!indices || len % sizeof(unsigned long)) {
+ pr_err("shmem: invalid indices property\n");
+ goto err_return;
+ }
+ if (len / sizeof(unsigned long) != nr_mems) {
+ pr_err("shmem: number of indices and mems do not match\n");
+ goto err_return;
+ }
+
+ size = fdt_getprop(fdt, offset, "size", &len);
+ if (!size || len != sizeof(u64)) {
+ pr_err("shmem: invalid size property\n");
+ goto err_return;
+ }
+
+ pos = fdt_getprop(fdt, offset, "pos", &len);
+ if (!pos || len != sizeof(u64)) {
+ pr_err("shmem: invalid pos property\n");
+ goto err_return;
+ }
+
+ /*
+ * TODO: This sets UID/GID, cgroup accounting to root. Should this
+ * be given to the first user that maps the FD instead?
+ */
+ file = shmem_file_setup(fdt_get_name(fdt, offset, NULL), 0,
+ VM_NORESERVE);
+ if (IS_ERR(file)) {
+ pr_err("shmem: failed to setup file\n");
+ goto err_return;
+ }
+
+ inode = file->f_inode;
+ mapping = inode->i_mapping;
+ vfs_setpos(file, *pos, MAX_LFS_FILESIZE);
+
+ for (; i < nr_mems; i++) {
+ struct folio *folio;
+ void *va;
+
+ if (mems[i].size != PAGE_SIZE) {
+ fput(file);
+ pr_err("shmem: unknown kho_mem size %llx. Expected %lx\n",
+ mems[i].size, PAGE_SIZE);
+ goto err_return;
+ }
+
+ va = kho_claim_mem(&mems[i]);
+ folio = virt_to_folio(va);
+
+ /* Set up the folio for insertion. */
+
+ /*
+ * TODO: This breaks falloc-ed folios since now they get marked
+ * uptodate when they might not actually be zeroed out yet. Need
+ * a way to distinguish falloc-ed folios.
+ */
+ folio_mark_uptodate(folio);
+ folio_mark_dirty(folio);
+
+ /*
+ * TODO: Should find a way to unify this and
+ * shmem_alloc_and_add_folio().
+ */
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+
+ ret = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
+ if (ret) {
+ folio_unlock(folio);
+ folio_put(folio);
+ fput(file);
+ pr_err("shmem: failed to charge folio index %lu\n", i);
+ goto err_return_next;
+ }
+
+ ret = shmem_add_to_page_cache(folio, mapping, indices[i], NULL,
+ mapping_gfp_mask(mapping));
+ if (ret) {
+ folio_unlock(folio);
+ folio_put(folio);
+ fput(file);
+ pr_err("shmem: failed to add to page cache folio index %lu\n", i);
+ goto err_return_next;
+ }
+
+ ret = shmem_inode_acct_blocks(inode, 1);
+ if (ret) {
+ folio_unlock(folio);
+ folio_put(folio);
+ fput(file);
+ pr_err("shmem: failed to account folio index %lu\n", i);
+ goto err_return_next;
+ }
+
+ shmem_recalc_inode(inode, 1, 0);
+ folio_add_lru(folio);
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+
+ inode->i_size = *size;
+
+ return file;
+
+err_return:
+ kho_return_mem(mems + i);
+err_return_next:
+ for (i = i + 1; i < nr_mems; i++)
+ kho_return_mem(mems + i);
+err:
+ ret = fdt_get_path(fdt, offset, pathbuf, sizeof(pathbuf));
+ if (ret)
+ path = "unknown";
+ else
+ path = pathbuf;
+
+ pr_err("shmem: error when recovering KHO node '%s'\n", path);
+ return NULL;
+}
+
+#endif /* CONFIG_FDBOX && CONFIG_KEXEC_HANDOVER */
+
static const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.dirty_folio = noop_dirty_folio,
--
2.47.1