[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250307005830.65293-6-ptyadav@amazon.de>
Date: Fri, 7 Mar 2025 00:57:39 +0000
From: Pratyush Yadav <ptyadav@...zon.de>
To: <linux-kernel@...r.kernel.org>
CC: Pratyush Yadav <ptyadav@...zon.de>, Jonathan Corbet <corbet@....net>,
"Eric Biederman" <ebiederm@...ssion.com>, Arnd Bergmann <arnd@...db.de>,
"Greg Kroah-Hartman" <gregkh@...uxfoundation.org>, Alexander Viro
<viro@...iv.linux.org.uk>, Christian Brauner <brauner@...nel.org>, Jan Kara
<jack@...e.cz>, Hugh Dickins <hughd@...gle.com>, Alexander Graf
<graf@...zon.com>, Benjamin Herrenschmidt <benh@...nel.crashing.org>, "David
Woodhouse" <dwmw2@...radead.org>, James Gowans <jgowans@...zon.com>, "Mike
Rapoport" <rppt@...nel.org>, Paolo Bonzini <pbonzini@...hat.com>, "Pasha
Tatashin" <tatashin@...gle.com>, Anthony Yznaga <anthony.yznaga@...cle.com>,
Dave Hansen <dave.hansen@...el.com>, David Hildenbrand <david@...hat.com>,
Jason Gunthorpe <jgg@...dia.com>, Matthew Wilcox <willy@...radead.org>, "Wei
Yang" <richard.weiyang@...il.com>, Andrew Morton <akpm@...ux-foundation.org>,
<linux-fsdevel@...r.kernel.org>, <linux-doc@...r.kernel.org>,
<linux-mm@...ck.org>, <kexec@...ts.infradead.org>
Subject: [RFC PATCH 5/5] mm/memfd: allow preserving FD over FDBOX + KHO
For applications that hold a large amount of in-memory state which takes
time to rebuild, rebooting to pick up a kernel upgrade can be very
expensive. FDBox allows preserving file descriptors over kexec using
KHO. Combining that with memfd gives those applications
reboot-persistent memory that they can use to quickly save and
reconstruct their state.
While a memfd can be backed by either hugetlbfs or shmem, only the shmem
backing is supported for now. Allow saving and restoring shmem-backed
memfds over FDBOX + KHO.
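The memfd FDT node itself does not contain much information: just a
"compatible" property and a single subnode, which is handed over to
shmem to fill in. The restore side mirrors this: it checks the
compatible string, expects exactly one subnode, and passes that subnode
to shmem to recover the file.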
Since there are now two paths for obtaining the backing file
(memfd_create() and FDBox recovery), refactor the common file setup out
of alloc_file() into its own function called memfd_setup_file(). It sets
up the file mode, flags and seals, and sets the fdbox ops if enabled.
Signed-off-by: Pratyush Yadav <ptyadav@...zon.de>
---
mm/memfd.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 116 insertions(+), 12 deletions(-)
diff --git a/mm/memfd.c b/mm/memfd.c
index 37f7be57c2f50..1c32e66197f6d 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -7,6 +7,8 @@
* This file is released under the GPL.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/pagemap.h>
@@ -19,8 +21,12 @@
#include <linux/shmem_fs.h>
#include <linux/memfd.h>
#include <linux/pid_namespace.h>
+#include <linux/fdbox.h>
+#include <linux/libfdt.h>
#include <uapi/linux/memfd.h>
+static const struct fdbox_file_ops memfd_fdbox_fops;
+
/*
* We need a tag: a new tag would expand every xa_node by 8 bytes,
* so reuse a tag which we firmly believe is never set or cleared on tmpfs
@@ -418,21 +424,10 @@ static char *alloc_name(const char __user *uname)
return ERR_PTR(error);
}
-static struct file *alloc_file(const char *name, unsigned int flags)
+static void memfd_setup_file(struct file *file, unsigned int flags)
{
unsigned int *file_seals;
- struct file *file;
- if (flags & MFD_HUGETLB) {
- file = hugetlb_file_setup(name, 0, VM_NORESERVE,
- HUGETLB_ANONHUGE_INODE,
- (flags >> MFD_HUGE_SHIFT) &
- MFD_HUGE_MASK);
- } else {
- file = shmem_file_setup(name, 0, VM_NORESERVE);
- }
- if (IS_ERR(file))
- return file;
file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
file->f_flags |= O_LARGEFILE;
@@ -452,6 +447,27 @@ static struct file *alloc_file(const char *name, unsigned int flags)
*file_seals &= ~F_SEAL_SEAL;
}
+#if defined(CONFIG_FDBOX) && defined(CONFIG_KEXEC_HANDOVER)
+ file->f_fdbox_op = &memfd_fdbox_fops;
+#endif
+}
+
+static struct file *alloc_file(const char *name, unsigned int flags)
+{
+ struct file *file;
+
+ if (flags & MFD_HUGETLB) {
+ file = hugetlb_file_setup(name, 0, VM_NORESERVE,
+ HUGETLB_ANONHUGE_INODE,
+ (flags >> MFD_HUGE_SHIFT) &
+ MFD_HUGE_MASK);
+ } else {
+ file = shmem_file_setup(name, 0, VM_NORESERVE);
+ }
+ if (IS_ERR(file))
+ return file;
+
+ memfd_setup_file(file, flags);
return file;
}
@@ -493,3 +509,91 @@ SYSCALL_DEFINE2(memfd_create,
kfree(name);
return error;
}
+
+#if defined(CONFIG_FDBOX) && defined(CONFIG_KEXEC_HANDOVER)
+static const char memfd_fdbox_compatible[] = "fdbox,memfd-v1";
+
+static struct file *memfd_fdbox_kho_recover(const void *fdt, int offset)
+{
+ struct file *file;
+ int ret, subnode;
+
+ ret = fdt_node_check_compatible(fdt, offset, memfd_fdbox_compatible);
+ if (ret) {
+ pr_err("kho: invalid compatible\n");
+ return NULL;
+ }
+
+ /* Make sure there is exactly one subnode. */
+ subnode = fdt_first_subnode(fdt, offset);
+ if (subnode < 0) {
+ pr_err("kho: no subnode for underlying storage found!\n");
+ return NULL;
+ }
+ if (fdt_next_subnode(fdt, subnode) >= 0) {
+ pr_err("kho: too many subnodes. Expected only 1.\n");
+ return NULL;
+ }
+
+ if (is_node_shmem(fdt, subnode)) {
+ file = shmem_fdbox_kho_recover(fdt, subnode);
+ if (!file)
+ return NULL;
+
+ memfd_setup_file(file, 0);
+ return file;
+ }
+
+ return NULL;
+}
+
+static int memfd_fdbox_kho_write(struct fdbox_fd *box_fd, void *fdt)
+{
+ int ret = 0;
+
+ ret |= fdt_property(fdt, "compatible", memfd_fdbox_compatible,
+ sizeof(memfd_fdbox_compatible));
+
+ /* TODO: Track seals on the file as well. */
+
+ ret |= fdt_begin_node(fdt, "");
+ if (ret) {
+ pr_err("kho: failed to set up memfd node\n");
+ return -EINVAL;
+ }
+
+ if (shmem_file(box_fd->file))
+ ret = shmem_fdbox_kho_write(box_fd, fdt);
+ else
+ /* TODO: HugeTLB support. */
+ ret = -EOPNOTSUPP;
+
+ if (ret)
+ return ret;
+
+ ret = fdt_end_node(fdt);
+ if (ret) {
+ pr_err("kho: failed to end memfd node!\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static const struct fdbox_file_ops memfd_fdbox_fops = {
+ .kho_write = memfd_fdbox_kho_write,
+};
+
+static int __init memfd_fdbox_init(void)
+{
+ int error;
+
+ error = fdbox_register_handler(memfd_fdbox_compatible,
+ memfd_fdbox_kho_recover);
+ if (error)
+ pr_err("Could not register fdbox handler: %d\n", error);
+
+ return 0;
+}
+late_initcall(memfd_fdbox_init);
+#endif /* CONFIG_FDBOX && CONFIG_KEXEC_HANDOVER */
--
2.47.1
Powered by blists - more mailing lists