[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20240805093245.889357-8-jgowans@amazon.com>
Date: Mon, 5 Aug 2024 11:32:42 +0200
From: James Gowans <jgowans@...zon.com>
To: <linux-kernel@...r.kernel.org>
CC: James Gowans <jgowans@...zon.com>, Sean Christopherson
<seanjc@...gle.com>, Paolo Bonzini <pbonzini@...hat.com>, Alexander Viro
<viro@...iv.linux.org.uk>, Steve Sistare <steven.sistare@...cle.com>,
Christian Brauner <brauner@...nel.org>, Jan Kara <jack@...e.cz>, "Anthony
Yznaga" <anthony.yznaga@...cle.com>, Mike Rapoport <rppt@...nel.org>, "Andrew
Morton" <akpm@...ux-foundation.org>, <linux-mm@...ck.org>, Jason Gunthorpe
<jgg@...pe.ca>, <linux-fsdevel@...r.kernel.org>, Usama Arif
<usama.arif@...edance.com>, <kvm@...r.kernel.org>, Alexander Graf
<graf@...zon.com>, David Woodhouse <dwmw@...zon.co.uk>, Paul Durrant
<pdurrant@...zon.co.uk>, Nicolas Saenz Julienne <nsaenz@...zon.es>
Subject: [PATCH 07/10] guestmemfs: Persist filesystem metadata via KHO
Filesystem metadata consists of: physical memory extents, superblock,
inodes block and allocation bitmap. Here serialisation and
deserialisation of all of these is done via the KHO framework.
A serialisation callback is added which is run when KHO activate is
triggered. This creates the device tree blob for the metadata and marks
the memory as persistent via struct kho_mem(s).
When the filesystem is mounted it attempts to re-hydrate metadata from
KHO. Only if this fails (on first boot, for example) does it allocate
fresh metadata pages.
The private data struct is switched from holding a reference to the
persistent superblock to now referencing the regular struct super_block.
This is necessary for the serialisation code. Better would be to be able
to define callback private data, if that were possible.
Signed-off-by: James Gowans <jgowans@...zon.com>
---
fs/guestmemfs/Makefile | 2 +
fs/guestmemfs/guestmemfs.c | 72 ++++++---
fs/guestmemfs/guestmemfs.h | 8 +
fs/guestmemfs/serialise.c | 296 +++++++++++++++++++++++++++++++++++++
4 files changed, 355 insertions(+), 23 deletions(-)
create mode 100644 fs/guestmemfs/serialise.c
diff --git a/fs/guestmemfs/Makefile b/fs/guestmemfs/Makefile
index e93e43ba274b..8b95cac34564 100644
--- a/fs/guestmemfs/Makefile
+++ b/fs/guestmemfs/Makefile
@@ -4,3 +4,5 @@
#
obj-y += guestmemfs.o inode.o dir.o allocator.o file.o
+
+obj-$(CONFIG_KEXEC_KHO) += serialise.o
diff --git a/fs/guestmemfs/guestmemfs.c b/fs/guestmemfs/guestmemfs.c
index 38f20ad25286..cf47e5100504 100644
--- a/fs/guestmemfs/guestmemfs.c
+++ b/fs/guestmemfs/guestmemfs.c
@@ -3,6 +3,7 @@
#include "guestmemfs.h"
#include <linux/dcache.h>
#include <linux/fs.h>
+#include <linux/kexec.h>
#include <linux/module.h>
#include <linux/fs_context.h>
#include <linux/io.h>
@@ -10,7 +11,7 @@
#include <linux/statfs.h>
phys_addr_t guestmemfs_base, guestmemfs_size;
-struct guestmemfs_sb *psb;
+struct super_block *guestmemfs_sb;
static int statfs(struct dentry *root, struct kstatfs *buf)
{
@@ -33,26 +34,39 @@ static int guestmemfs_fill_super(struct super_block *sb, struct fs_context *fc)
struct inode *inode;
struct dentry *dentry;
- psb = kzalloc(sizeof(*psb), GFP_KERNEL);
- psb->inodes = kzalloc(2 << 20, GFP_KERNEL);
- if (!psb->inodes)
- return -ENOMEM;
- psb->allocator_bitmap = kzalloc(1 << 20, GFP_KERNEL);
- if (!psb->allocator_bitmap)
- return -ENOMEM;
-
/*
* Keep a reference to the persistent super block in the
* ephemeral super block.
*/
- sb->s_fs_info = psb;
- spin_lock_init(&psb->allocation_lock);
- guestmemfs_initialise_inode_store(sb);
- guestmemfs_zero_allocations(sb);
- guestmemfs_get_persisted_inode(sb, 1)->flags = GUESTMEMFS_INODE_FLAG_DIR;
- strscpy(guestmemfs_get_persisted_inode(sb, 1)->filename, ".",
- GUESTMEMFS_FILENAME_LEN);
- psb->next_free_ino = 2;
+ sb->s_fs_info = guestmemfs_restore_from_kho();
+
+ if (GUESTMEMFS_PSB(sb)) {
+ pr_info("Restored super block from KHO\n");
+ } else {
+ struct guestmemfs_sb *psb;
+
+ pr_info("Did not restore from KHO - allocating free\n");
+ psb = kzalloc(sizeof(*psb), GFP_KERNEL);
+ psb->inodes = kzalloc(2 << 20, GFP_KERNEL);
+ if (!psb->inodes)
+ return -ENOMEM;
+ psb->allocator_bitmap = kzalloc(1 << 20, GFP_KERNEL);
+ if (!psb->allocator_bitmap)
+ return -ENOMEM;
+ sb->s_fs_info = psb;
+ spin_lock_init(&psb->allocation_lock);
+ guestmemfs_initialise_inode_store(sb);
+ guestmemfs_zero_allocations(sb);
+ guestmemfs_get_persisted_inode(sb, 1)->flags = GUESTMEMFS_INODE_FLAG_DIR;
+ strscpy(guestmemfs_get_persisted_inode(sb, 1)->filename, ".",
+ GUESTMEMFS_FILENAME_LEN);
+ GUESTMEMFS_PSB(sb)->next_free_ino = 2;
+ }
+ /*
+ * Keep a reference to this sb; the serialise callback needs it
+ * and has no other way to get it.
+ */
+ guestmemfs_sb = sb;
sb->s_op = &guestmemfs_super_ops;
@@ -98,11 +112,18 @@ static struct file_system_type guestmemfs_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
+
+/*
+ * KHO notifier block. guestmemfs_serialise_to_kho() is called with the KHO
+ * command and the FDT that is being built.
+ */
+static struct notifier_block guestmemfs_kho_nb = {
+	.notifier_call = guestmemfs_serialise_to_kho,
+};
+
static int __init guestmemfs_init(void)
{
int ret;
ret = register_filesystem(&guestmemfs_fs_type);
+	/* Do not hook into KHO if the filesystem itself is unavailable. */
+	if (ret)
+		return ret;
+
+	/*
+	 * The serialisation code (serialise.o) is built under
+	 * CONFIG_KEXEC_KHO, so gate the notifier on that same option.
+	 *
+	 * NOTE(review): IS_ENABLED() only removes the call; the notifier
+	 * block above still references guestmemfs_serialise_to_kho(), so a
+	 * !CONFIG_KEXEC_KHO build presumably needs stubs in guestmemfs.h.
+	 */
+	if (IS_ENABLED(CONFIG_KEXEC_KHO))
+		register_kho_notifier(&guestmemfs_kho_nb);
return ret;
}
@@ -120,13 +141,18 @@ early_param("guestmemfs", parse_guestmemfs_extents);
void __init guestmemfs_reserve_mem(void)
{
- guestmemfs_base = memblock_phys_alloc(guestmemfs_size, 4 << 10);
- if (guestmemfs_base) {
- memblock_reserved_mark_noinit(guestmemfs_base, guestmemfs_size);
- memblock_mark_nomap(guestmemfs_base, guestmemfs_size);
- } else {
- pr_warn("Failed to alloc %llu bytes for guestmemfs\n", guestmemfs_size);
- }
+	/*
+	 * No size configured: leave memblock untouched.
+	 * (presumably guestmemfs_size is set by the "guestmemfs" early param
+	 * - confirm against parse_guestmemfs_extents)
+	 */
+	if (!guestmemfs_size)
+		return;
+
+	/* Physical allocation of the whole region, 4 KiB aligned. */
+	guestmemfs_base = memblock_phys_alloc(guestmemfs_size, 4 << 10);
+	if (!guestmemfs_base) {
+		pr_warn("Failed to alloc %llu bytes for guestmemfs\n", guestmemfs_size);
+		return;
+	}
+
+	memblock_reserved_mark_noinit(guestmemfs_base, guestmemfs_size);
+	memblock_mark_nomap(guestmemfs_base, guestmemfs_size);
+	pr_debug("guestmemfs reserved base=%llu from memblocks\n", guestmemfs_base);
}
MODULE_ALIAS_FS("guestmemfs");
diff --git a/fs/guestmemfs/guestmemfs.h b/fs/guestmemfs/guestmemfs.h
index 0f2788ce740e..263d995b75ed 100644
--- a/fs/guestmemfs/guestmemfs.h
+++ b/fs/guestmemfs/guestmemfs.h
@@ -10,11 +10,14 @@
/* Units of bytes */
extern phys_addr_t guestmemfs_base, guestmemfs_size;
+extern struct super_block *guestmemfs_sb;
struct guestmemfs_sb {
/* Inode number */
unsigned long next_free_ino;
unsigned long allocated_inodes;
+
+ /* Ephemeral fields - must be updated on deserialise */
struct guestmemfs_inode *inodes;
void *allocator_bitmap;
spinlock_t allocation_lock;
@@ -46,6 +49,11 @@ long guestmemfs_alloc_block(struct super_block *sb);
struct inode *guestmemfs_inode_get(struct super_block *sb, unsigned long ino);
struct guestmemfs_inode *guestmemfs_get_persisted_inode(struct super_block *sb, int ino);
+int guestmemfs_serialise_to_kho(struct notifier_block *self,
+ unsigned long cmd,
+ void *v);
+struct guestmemfs_sb *guestmemfs_restore_from_kho(void);
+
extern const struct file_operations guestmemfs_dir_fops;
extern const struct file_operations guestmemfs_file_fops;
extern const struct inode_operations guestmemfs_file_inode_operations;
diff --git a/fs/guestmemfs/serialise.c b/fs/guestmemfs/serialise.c
new file mode 100644
index 000000000000..eb70d496a3eb
--- /dev/null
+++ b/fs/guestmemfs/serialise.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include "guestmemfs.h"
+#include <linux/kexec.h>
+#include <linux/memblock.h>
+
+/*
+ * Responsible for serialisation and deserialisation of filesystem metadata
+ * to and from KHO so that it survives kexec. The deserialisation logic needs
+ * to mirror serialisation, so both are kept in the same file.
+ *
+ * The format of the device tree structure is:
+ *
+ * /guestmemfs
+ *   compatible = "guestmemfs-v1"
+ *   fs_mem {
+ *     mem = [ ... ]
+ *   };
+ *   superblock {
+ *     mem = [
+ *       persistent super block,
+ *       inodes,
+ *       allocator_bitmap,
+ *     ]
+ *   };
+ *   mappings_blocks {
+ *     mem = [ ... ]
+ *   };
+ *   // For every mappings_blocks mem, which inode it belongs to.
+ *   mappings_to_inode {
+ *     num_inodes,
+ *     mem = [ ... ],
+ *   }
+ */
+
+/*
+ * Write the "superblock" node into the FDT under construction.
+ *
+ * The node carries one "mem" property made of three struct kho_mem entries,
+ * in this order: the persistent super block, the inode store and the
+ * allocator bitmap. deserialise_superblock() expects exactly three entries
+ * in that order.
+ *
+ * Returns 0 on success; otherwise the OR of the libfdt return codes, which is
+ * only meaningful as zero / non-zero.
+ */
+static int serialise_superblock(struct super_block *sb, void *fdt)
+{
+	enum { REGION_PSB, REGION_INODES, REGION_BITMAP, NUM_REGIONS };
+	struct guestmemfs_sb *persistent = sb->s_fs_info;
+	struct kho_mem regions[NUM_REGIONS];
+	int rc;
+
+	regions[REGION_PSB].addr = virt_to_phys(persistent);
+	regions[REGION_PSB].len = sizeof(*persistent);
+
+	/* Same length as the inode store allocation in guestmemfs_fill_super(). */
+	regions[REGION_INODES].addr = virt_to_phys(persistent->inodes);
+	regions[REGION_INODES].len = 2 << 20;
+
+	/* Same length as the bitmap allocation in guestmemfs_fill_super(). */
+	regions[REGION_BITMAP].addr = virt_to_phys(persistent->allocator_bitmap);
+	regions[REGION_BITMAP].len = 1 << 20;
+
+	rc = fdt_begin_node(fdt, "superblock");
+	rc |= fdt_property(fdt, "mem", regions, sizeof(regions));
+	rc |= fdt_end_node(fdt);
+
+	return rc;
+}
+
+/*
+ * Write the "mappings_blocks" and "mappings_to_inode" nodes.
+ *
+ * "mappings_blocks" holds one struct kho_mem per file inode, describing that
+ * inode's mappings block (one page). "mappings_to_inode" points at a single
+ * page containing an int array: entry i is the inode number which owns entry
+ * i of "mappings_blocks". Its "num_inodes" property is the number of valid
+ * entries in both arrays, which is what the deserialiser iterates over.
+ *
+ * The index page is handed to the next kernel by physical address, so it is
+ * deliberately not freed on success.
+ * NOTE(review): it is therefore leaked if KHO is later aborted or the dump
+ * is triggered again - confirm whether that needs handling.
+ *
+ * Returns 0 on success, -ENOSPC if there are more file inodes than fit in
+ * the index page, -ENOMEM on allocation failure, or the OR of the libfdt
+ * return codes (meaningful only as zero / non-zero).
+ */
+static int serialise_mappings_blocks(struct super_block *sb, void *fdt)
+{
+	const int max_inodes = PMD_SIZE / sizeof(struct guestmemfs_inode);
+	const unsigned long max_entries = PAGE_SIZE / sizeof(int);
+	struct kho_mem mappings_to_inode_mem;
+	struct kho_mem *mappings_mems;
+	struct guestmemfs_inode *inode;
+	unsigned long num_files = 0;
+	unsigned long num_mappings = 0;
+	int *mappings_to_inode;
+	int inode_idx;
+	int err = 0;
+
+	/* First pass: count file inodes so both arrays can be sized safely. */
+	for (inode_idx = 1; inode_idx < max_inodes; ++inode_idx) {
+		inode = guestmemfs_get_persisted_inode(sb, inode_idx);
+		if (inode->flags & GUESTMEMFS_INODE_FLAG_FILE)
+			num_files++;
+	}
+
+	/* The inode index array is a single page; refuse to overrun it. */
+	if (num_files > max_entries) {
+		pr_err("Too many files to serialise: %lu (max %lu)\n",
+		       num_files, max_entries);
+		return -ENOSPC;
+	}
+
+	mappings_to_inode = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!mappings_to_inode)
+		return -ENOMEM;
+
+	mappings_mems = kcalloc(num_files, sizeof(*mappings_mems), GFP_KERNEL);
+	if (!mappings_mems) {
+		kfree(mappings_to_inode);
+		return -ENOMEM;
+	}
+
+	/*
+	 * Second pass: record each file's mappings block. The extra bound
+	 * keeps the writes inside the arrays even if a file appeared since
+	 * the first pass.
+	 */
+	for (inode_idx = 1; inode_idx < max_inodes && num_mappings < num_files;
+	     ++inode_idx) {
+		inode = guestmemfs_get_persisted_inode(sb, inode_idx);
+		if (!(inode->flags & GUESTMEMFS_INODE_FLAG_FILE))
+			continue;
+
+		mappings_mems[num_mappings].addr = virt_to_phys(inode->mappings);
+		mappings_mems[num_mappings].len = PAGE_SIZE;
+		mappings_to_inode[num_mappings] = inode_idx;
+		num_mappings++;
+	}
+
+	err |= fdt_begin_node(fdt, "mappings_blocks");
+	err |= fdt_property(fdt, "mem", mappings_mems,
+			    sizeof(struct kho_mem) * num_mappings);
+	err |= fdt_end_node(fdt);
+
+	err |= fdt_begin_node(fdt, "mappings_to_inode");
+	mappings_to_inode_mem.addr = virt_to_phys(mappings_to_inode);
+	mappings_to_inode_mem.len = PAGE_SIZE;
+	err |= fdt_property(fdt, "mem", &mappings_to_inode_mem,
+			    sizeof(mappings_to_inode_mem));
+	/* Number of entries actually emitted above, not allocated inodes. */
+	err |= fdt_property(fdt, "num_inodes", &num_mappings,
+			    sizeof(num_mappings));
+	err |= fdt_end_node(fdt);
+
+	/* fdt_property() copied the array into the FDT; drop our copy. */
+	kfree(mappings_mems);
+
+	/*
+	 * On failure the FDT is presumably discarded by the caller, so
+	 * nothing will reference the index page - TODO confirm.
+	 */
+	if (err)
+		kfree(mappings_to_inode);
+
+	return err;
+}
+
+/*
+ * KHO notifier callback: serialise guestmemfs metadata into the KHO FDT.
+ *
+ * @self: notifier block (unused).
+ * @cmd:  KEXEC_KHO_DUMP to serialise, KEXEC_KHO_ABORT to roll back.
+ * @v:    the FDT being built.
+ *
+ * Emits the "guestmemfs" node with its compatible string, the "fs_mem" extent
+ * (flagged KHO_MEM_ADDR_FLAG_NOINIT), the superblock node and the mappings
+ * nodes.
+ *
+ * Returns NOTIFY_DONE on success, on abort, or when no filesystem has been
+ * mounted yet; NOTIFY_BAD on an unknown command or any serialisation failure.
+ */
+int guestmemfs_serialise_to_kho(struct notifier_block *self,
+				unsigned long cmd,
+				void *v)
+{
+	static const char compatible[] = "guestmemfs-v1";
+	struct kho_mem mem;
+	void *fdt = v;
+	int err = 0;
+
+	switch (cmd) {
+	case KEXEC_KHO_ABORT:
+		/* No rollback action needed. */
+		return NOTIFY_DONE;
+	case KEXEC_KHO_DUMP:
+		/* Handled below */
+		break;
+	default:
+		return NOTIFY_BAD;
+	}
+
+	/*
+	 * guestmemfs_sb is only set once a mount has filled the super block.
+	 * Without one there is no metadata to persist.
+	 * NOTE(review): nothing visible here clears guestmemfs_sb on unmount,
+	 * so a stale pointer is not detected by this check.
+	 */
+	if (!guestmemfs_sb || !guestmemfs_sb->s_fs_info) {
+		pr_info("guestmemfs not mounted; nothing to serialise\n");
+		return NOTIFY_DONE;
+	}
+
+	err |= fdt_begin_node(fdt, "guestmemfs");
+	err |= fdt_property(fdt, "compatible", compatible, sizeof(compatible));
+
+	err |= fdt_begin_node(fdt, "fs_mem");
+	mem.addr = guestmemfs_base | KHO_MEM_ADDR_FLAG_NOINIT;
+	mem.len = guestmemfs_size;
+	err |= fdt_property(fdt, "mem", &mem, sizeof(mem));
+	err |= fdt_end_node(fdt);
+
+	err |= serialise_superblock(guestmemfs_sb, fdt);
+	err |= serialise_mappings_blocks(guestmemfs_sb, fdt);
+
+	err |= fdt_end_node(fdt);
+
+	pr_info("Serialised extents [0x%llx + 0x%llx] via KHO: %i\n",
+		guestmemfs_base, guestmemfs_size, err);
+
+	/* Notifier callbacks must return NOTIFY_* codes, not raw error values. */
+	return err ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+/*
+ * Restore the persistent super block from the "superblock" node.
+ *
+ * @fdt:      KHO device tree.
+ * @root_off: offset of the /guestmemfs node.
+ *
+ * The node's "mem" property must contain exactly three struct kho_mem
+ * entries: persistent super block, inode store, allocator bitmap. Each is
+ * claimed from KHO and the super block's ephemeral fields (pointers and the
+ * allocation lock) are re-established for this kernel.
+ *
+ * Returns the restored super block, or NULL if the node is missing or
+ * malformed or a region could not be claimed.
+ * NOTE(review): regions already claimed are not handed back on failure.
+ */
+static struct guestmemfs_sb *deserialise_superblock(const void *fdt, int root_off)
+{
+	const struct kho_mem *mem;
+	struct guestmemfs_sb *old_sb;
+	int mem_len = 0;
+	int off;
+
+	off = fdt_subnode_offset(fdt, root_off, "superblock");
+	if (off < 0) {
+		pr_err("No superblock node in KHO: %i\n", off);
+		return NULL;
+	}
+
+	mem = fdt_getprop(fdt, off, "mem", &mem_len);
+	if (!mem || mem_len != 3 * sizeof(struct kho_mem)) {
+		pr_err("Incorrect mem_len; got %i\n", mem_len);
+		return NULL;
+	}
+
+	/* The serialiser records sizeof(struct guestmemfs_sb) for entry 0. */
+	if (mem[0].len != sizeof(*old_sb)) {
+		pr_err("Persisted super block size does not match this kernel\n");
+		return NULL;
+	}
+
+	old_sb = kho_claim_mem(&mem[0]);
+	if (!old_sb) {
+		pr_err("Unable to claim persisted super block\n");
+		return NULL;
+	}
+
+	old_sb->inodes = kho_claim_mem(&mem[1]);
+	old_sb->allocator_bitmap = kho_claim_mem(&mem[2]);
+	if (!old_sb->inodes || !old_sb->allocator_bitmap) {
+		pr_err("Unable to claim persisted inodes or allocator bitmap\n");
+		return NULL;
+	}
+
+	/* The lock state from the previous kernel is meaningless here. */
+	spin_lock_init(&old_sb->allocation_lock);
+
+	return old_sb;
+}
+
+/*
+ * Re-attach every persisted mappings block to its inode.
+ *
+ * @fdt:      KHO device tree.
+ * @root_off: offset of the /guestmemfs node.
+ * @sb:       restored persistent super block whose inodes are updated.
+ *
+ * "mappings_blocks/mem" is an array of struct kho_mem, one per mappings
+ * block. "mappings_to_inode/mem" describes a page holding an int array that
+ * maps each index of that array to an inode number, and
+ * "mappings_to_inode/num_inodes" is the number of entries to restore.
+ *
+ * Everything read from the FDT is validated before use: the entry count is
+ * limited to what the two arrays actually hold and every inode number must
+ * lie inside the inode store.
+ *
+ * Returns 0 on success, -EINVAL on any missing or inconsistent data.
+ * NOTE(review): the claimed index page is never released after use.
+ */
+static int deserialise_mappings_blocks(const void *fdt, int root_off,
+				       struct guestmemfs_sb *sb)
+{
+	const int max_inodes = PMD_SIZE / sizeof(struct guestmemfs_inode);
+	const struct kho_mem *mappings_blocks_mems;
+	const struct kho_mem *mappings_to_inode_mem;
+	const unsigned long *num_inodes;
+	unsigned long num_blocks;
+	unsigned long mappings_block;
+	int *mappings_to_inode;
+	int blocks_len = 0;
+	int len = 0;
+	int off;
+
+	/*
+	 * Array of struct kho_mem - one for each persisted mappings
+	 * block.
+	 */
+	off = fdt_subnode_offset(fdt, root_off, "mappings_blocks");
+	if (off < 0) {
+		pr_warn("No mappings_blocks available in KHO\n");
+		return -EINVAL;
+	}
+	mappings_blocks_mems = fdt_getprop(fdt, off, "mem", &blocks_len);
+	if (!mappings_blocks_mems || blocks_len < 0 ||
+	    (size_t)blocks_len % sizeof(*mappings_blocks_mems)) {
+		pr_warn("Invalid mappings_blocks mem len: %i\n", blocks_len);
+		return -EINVAL;
+	}
+
+	/*
+	 * Array specifying which inode a specific index into the
+	 * mappings_blocks kho_mem array corresponds to. num_inodes
+	 * indicates the size of the array which is the number of mappings
+	 * blocks which need to be restored.
+	 */
+	off = fdt_subnode_offset(fdt, root_off, "mappings_to_inode");
+	if (off < 0) {
+		pr_warn("No mappings_to_inode available in KHO\n");
+		return -EINVAL;
+	}
+	num_inodes = fdt_getprop(fdt, off, "num_inodes", &len);
+	if (!num_inodes || len != sizeof(*num_inodes)) {
+		pr_warn("Invalid num_inodes len: %i\n", len);
+		return -EINVAL;
+	}
+	mappings_to_inode_mem = fdt_getprop(fdt, off, "mem", &len);
+	if (!mappings_to_inode_mem || len != sizeof(*mappings_to_inode_mem)) {
+		pr_warn("Invalid mappings_to_inode_mem len: %i\n", len);
+		return -EINVAL;
+	}
+
+	/*
+	 * Never walk past the end of either array: restore at most as many
+	 * entries as "mappings_blocks" really contains, and make sure the
+	 * index page is large enough to hold that many inode numbers.
+	 */
+	num_blocks = (size_t)blocks_len / sizeof(*mappings_blocks_mems);
+	if (*num_inodes < num_blocks)
+		num_blocks = *num_inodes;
+	if (mappings_to_inode_mem->len < num_blocks * sizeof(int)) {
+		pr_warn("mappings_to_inode too small for %lu entries\n", num_blocks);
+		return -EINVAL;
+	}
+
+	mappings_to_inode = kho_claim_mem(mappings_to_inode_mem);
+	if (!mappings_to_inode) {
+		pr_warn("Unable to claim mappings_to_inode\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Re-assign the mappings blocks to the inodes. Each entry of
+	 * mappings_to_inode specifies which inode the mappings block at the
+	 * same index belongs to.
+	 */
+	for (mappings_block = 0; mappings_block < num_blocks; ++mappings_block) {
+		int inode = mappings_to_inode[mappings_block];
+
+		/* Inode numbers start at 1 and must fit the inode store. */
+		if (inode < 1 || inode >= max_inodes) {
+			pr_warn("Invalid inode %i for mappings block %lu\n",
+				inode, mappings_block);
+			return -EINVAL;
+		}
+
+		sb->inodes[inode].mappings =
+			kho_claim_mem(&mappings_blocks_mems[mappings_block]);
+		if (!sb->inodes[inode].mappings) {
+			pr_warn("Unable to claim mappings block %lu\n", mappings_block);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Restore the filesystem's physical memory extent from the "fs_mem" node.
+ *
+ * @fdt:      KHO device tree.
+ * @root_off: offset of the /guestmemfs node.
+ *
+ * On success guestmemfs_base (with the KHO flag bits masked off) and
+ * guestmemfs_size are set from the single struct kho_mem in the node's "mem"
+ * property.
+ *
+ * Returns 0 on success, -EINVAL if the node or property is missing or has
+ * the wrong size.
+ */
+static int deserialise_fs_mem(const void *fdt, int root_off)
+{
+	const struct kho_mem *mem;
+	int len = 0;
+	/* Offset into the KHO DT */
+	int off;
+
+	off = fdt_subnode_offset(fdt, root_off, "fs_mem");
+	if (off < 0) {
+		pr_info("No fs_mem available in KHO\n");
+		return -EINVAL;
+	}
+
+	mem = fdt_getprop(fdt, off, "mem", &len);
+	if (!mem || len != sizeof(*mem)) {
+		pr_err("KHO did not contain a guestmemfs base address and size\n");
+		return -EINVAL;
+	}
+
+	guestmemfs_base = mem->addr & ~KHO_MEM_ADDR_FLAG_MASK;
+	guestmemfs_size = mem->len;
+
+	pr_info("Reclaimed [%llx + %llx] via KHO\n", guestmemfs_base, guestmemfs_size);
+	return 0;
+}
+
+/*
+ * Try to re-hydrate the persistent super block from KHO.
+ *
+ * Looks up /guestmemfs in the KHO device tree, checks that it is compatible
+ * with "guestmemfs-v1" and then restores, in order, the superblock, the
+ * per-inode mappings blocks and the filesystem memory extent.
+ *
+ * Returns the restored super block, or NULL if there is no KHO data or any
+ * step fails; guestmemfs_fill_super() then allocates fresh metadata.
+ * NOTE(review): memory claimed before a later step fails is not handed back.
+ */
+struct guestmemfs_sb *guestmemfs_restore_from_kho(void)
+{
+	const void *fdt = kho_get_fdt();
+	struct guestmemfs_sb *old_sb;
+	int err;
+	/* Offset into the KHO DT */
+	int off;
+
+	if (!fdt) {
+		pr_err("Unable to get KHO DT after KHO boot?\n");
+		return NULL;
+	}
+
+	off = fdt_path_offset(fdt, "/guestmemfs");
+	pr_info("guestmemfs offset: %i\n", off);
+
+	/* libfdt reports a missing node as a negative error, never as 0. */
+	if (off < 0) {
+		pr_info("No guestmemfs data available in KHO\n");
+		return NULL;
+	}
+	err = fdt_node_check_compatible(fdt, off, "guestmemfs-v1");
+	if (err) {
+		pr_err("Existing KHO superblock format is not compatible with this kernel\n");
+		return NULL;
+	}
+
+	old_sb = deserialise_superblock(fdt, off);
+	if (!old_sb) {
+		pr_warn("Failed to restore superblock\n");
+		return NULL;
+	}
+
+	err = deserialise_mappings_blocks(fdt, off, old_sb);
+	if (err) {
+		pr_warn("Failed to restore mappings blocks\n");
+		return NULL;
+	}
+
+	err = deserialise_fs_mem(fdt, off);
+	if (err) {
+		pr_warn("Failed to restore filesystem memory extents\n");
+		return NULL;
+	}
+
+	return old_sb;
+}
--
2.34.1
Powered by blists - more mailing lists