Message-Id: <20250923153146.365015-4-fam.zheng@bytedance.com>
Date: Tue, 23 Sep 2025 15:31:44 +0000
From: Fam Zheng <fam.zheng@...edance.com>
To: linux-kernel@...r.kernel.org
Cc: Lukasz Luba <lukasz.luba@....com>,
linyongting@...edance.com,
songmuchun@...edance.com,
satish.kumar@...edance.com,
Borislav Petkov <bp@...en8.de>,
Thomas Gleixner <tglx@...utronix.de>,
yuanzhu@...edance.com,
Ingo Molnar <mingo@...hat.com>,
Daniel Lezcano <daniel.lezcano@...aro.org>,
fam.zheng@...edance.com,
Zhang Rui <rui.zhang@...el.com>,
fam@...hon.net,
"H. Peter Anvin" <hpa@...or.com>,
x86@...nel.org,
liangma@...edance.com,
Dave Hansen <dave.hansen@...ux.intel.com>,
"Rafael J. Wysocki" <rafael@...nel.org>,
guojinhui.liam@...edance.com,
linux-pm@...r.kernel.org,
Thom Hughes <thom.hughes@...edance.com>
Subject: [RFC 3/5] x86/parker: Introduce parker kernfs interface
From: Thom Hughes <thom.hughes@...edance.com>
These are the control knobs exposed in the boot kernel in order to start
the secondary kernels.
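A rough sketch of the intended usage flow (the kernel directory name
"kernel0" below is only an example):
  # boot the host kernel with e.g. parker_cma=4G, then:
  mount -t parker none /sys/fs/parker
  mkdir /sys/fs/parker/kernel0
  echo 4-7 > /sys/fs/parker/kernel0/cpus
  echo 2G > /sys/fs/parker/kernel0/memory
  echo 1 > /sys/fs/parker/kernel0/online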
Signed-off-by: Thom Hughes <thom.hughes@...edance.com>
Signed-off-by: Fam Zheng <fam.zheng@...edance.com>
---
arch/x86/Kbuild | 3 +
arch/x86/Kconfig | 2 +
arch/x86/parker/Kconfig | 4 +
arch/x86/parker/Makefile | 2 +
arch/x86/parker/internal.h | 54 ++
arch/x86/parker/kernfs.c | 1266 ++++++++++++++++++++++++++++++++++++
include/linux/parker.h | 7 +
include/uapi/linux/magic.h | 1 +
8 files changed, 1339 insertions(+)
create mode 100644 arch/x86/parker/Kconfig
create mode 100644 arch/x86/parker/Makefile
create mode 100644 arch/x86/parker/internal.h
create mode 100644 arch/x86/parker/kernfs.c
create mode 100644 include/linux/parker.h
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index f7fb3d88c57b..e50fec2e8e5a 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -16,6 +16,9 @@ obj-$(CONFIG_XEN) += xen/
obj-$(CONFIG_PVH) += platform/pvh/
+# Multi-kernel support
+obj-$(CONFIG_PARKER) += parker/
+
# Hyper-V paravirtualization support
obj-$(subst m,y,$(CONFIG_HYPERV)) += hyperv/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f86e7072a5ba..490ea18cf783 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -3218,3 +3218,5 @@ config HAVE_ATOMIC_IOMAP
source "arch/x86/kvm/Kconfig"
source "arch/x86/Kconfig.assembler"
+
+source "arch/x86/parker/Kconfig"
diff --git a/arch/x86/parker/Kconfig b/arch/x86/parker/Kconfig
new file mode 100644
index 000000000000..716a2537f12c
--- /dev/null
+++ b/arch/x86/parker/Kconfig
@@ -0,0 +1,4 @@
+config PARKER
+ bool "Enable multi-kernel host support"
+ depends on X86_64 && SMP
+ select CMA
diff --git a/arch/x86/parker/Makefile b/arch/x86/parker/Makefile
new file mode 100644
index 000000000000..41c40fc64267
--- /dev/null
+++ b/arch/x86/parker/Makefile
@@ -0,0 +1,2 @@
+obj-y += kernfs.o
+$(obj)/kernfs.o: $(obj)/internal.h
diff --git a/arch/x86/parker/internal.h b/arch/x86/parker/internal.h
new file mode 100644
index 000000000000..a6150f1beb77
--- /dev/null
+++ b/arch/x86/parker/internal.h
@@ -0,0 +1,54 @@
+#ifndef _PARKER_INTERNAL_H
+#define _PARKER_INTERNAL_H
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/list.h>
+#include <linux/kernfs.h>
+
+/* Current limits on the number of supported PCI devices and CPUs */
+#define PARKER_MAX_PCI_DEVICES 256
+#define PARKER_MAX_CPUS 512
+
+/*
+ * For now, limited to one page, but could have chained pages for PCI
+ * devices and APIC IDs.
+ */
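+/*
+ * Handshake: the boot kernel publishes the CPU and PCI resources here;
+ * parked APs spin until start_address is filled in and jump to it, and
+ * the secondary kernel sets online once it is up.
+ */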
+struct parker_control_structure {
+ phys_addr_t start_address;
+ bool online;
+ unsigned int parker_id;
+ u32 pci_dev_ids[PARKER_MAX_PCI_DEVICES];
+ unsigned int num_pci_devs;
+ u32 apic_ids[PARKER_MAX_CPUS];
+ unsigned int num_cpus;
+};
+
+struct parker_kernel_device_entry {
+ struct list_head list_entry;
+ struct kernfs_node *kn;
+ struct device *dev;
+};
+
+struct parker_kernel_entry {
+ struct kernfs_node *kn;
+ struct mutex mutex;
+ unsigned int id;
+ bool online;
+ struct cpumask cpu_mask;
+ /* Contiguous pages from CMA for parker physical memory */
+ struct page *physical_memory_pages;
+ unsigned long physical_memory_page_count;
+ /* Control structure PAGE for now */
+ struct page *control_structure_pages;
+ /* Currently always 1 but future proofing */
+ unsigned long control_structure_page_count;
+ struct kernfs_node *kn_devices;
+ /* List of each kernfs node, get kobj from kernfs_node */
+ struct list_head list_devices;
+};
+
+/*
+ * Ensure we don't exceed one page; if we do, we need to rethink the
+ * control structure and chain pages together.
+ */
+static_assert(sizeof(struct parker_control_structure) < PAGE_SIZE,
+ "struct parker_control_structure too large!");
+
+#endif
diff --git a/arch/x86/parker/kernfs.c b/arch/x86/parker/kernfs.c
new file mode 100644
index 000000000000..68f4b7f779b5
--- /dev/null
+++ b/arch/x86/parker/kernfs.c
@@ -0,0 +1,1266 @@
+#define pr_fmt(fmt) "parker: " fmt
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/syscalls.h>
+#include <linux/kernel.h>
+#include <linux/cpumask.h>
+#include <linux/kexec.h>
+#include <linux/mm.h>
+#include <linux/cma.h>
+#include <linux/parker.h>
+#include <linux/magic.h>
+#include <linux/math.h>
+#include <linux/interrupt.h>
+#include <linux/irqreturn.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/device.h>
+#include <linux/reboot.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/fs_parser.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/nospec.h>
+
+#include <asm/mtrr.h>
+#include <asm/realmode.h>
+#include <asm/microcode.h>
+#include <asm/apic.h>
+#include <asm/espfix.h>
+#include <asm/irqdomain.h>
+#include <asm/init.h>
+#include <asm/hw_irq.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <asm/set_memory.h>
+
+#include "internal.h"
+
+static struct cma *parker_cma[MAX_NUMNODES];
+static unsigned long long parker_cma_size;
+static unsigned long long parker_cma_size_in_node[MAX_NUMNODES];
+static struct page *parker_active_control_structure_page;
+
+static struct transition_pagetable_data {
+ struct x86_mapping_info info;
+ pgd_t *pgd;
+ void *stack;
+} transition_pagetable;
+
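+/*
+ * parker_cma=<size>
+ * parker_cma=<nid>:<size>[,<nid>:<size>...]
+ *
+ * Either a single global CMA size, or a list of per-NUMA-node sizes.
+ */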
+static int __init parker_early_cma(char *p)
+{
+ int nid, count = 0;
+ unsigned long long tmp;
+ char *s = p;
+
+ while (*s) {
+ if (sscanf(s, "%llu%n", &tmp, &count) != 1)
+ break;
+
+ if (s[count] == ':') {
+ if (tmp >= MAX_NUMNODES)
+ break;
+ nid = array_index_nospec(tmp, MAX_NUMNODES);
+
+ s += count + 1;
+ tmp = memparse(s, &s);
+ parker_cma_size_in_node[nid] = tmp;
+ parker_cma_size += tmp;
+
+ /*
+ * Skip the separator if there is one; otherwise
+ * stop parsing.
+ */
+ if (*s == ',')
+ s++;
+ else
+ break;
+ } else {
+ parker_cma_size = memparse(p, &p);
+ break;
+ }
+ }
+
+ return 0;
+}
+early_param("parker_cma", parker_early_cma);
+
+#define ORDER_1G 30
+void __init parker_cma_reserve(void)
+{
+ bool node_specific_cma_alloc = false;
+ unsigned long long size, reserved, per_node;
+ int nid;
+
+ if (!parker_cma_size)
+ return;
+
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ if (parker_cma_size_in_node[nid] == 0)
+ continue;
+
+ if (!node_online(nid)) {
+ pr_warn("invalid node %d specified for CMA allocation\n", nid);
+ parker_cma_size -= parker_cma_size_in_node[nid];
+ parker_cma_size_in_node[nid] = 0;
+ continue;
+ }
+
+ if (parker_cma_size_in_node[nid] < SZ_1G) {
+ pr_warn("cma area of node %d should be at least 1GiB\n", nid);
+ parker_cma_size -= parker_cma_size_in_node[nid];
+ parker_cma_size_in_node[nid] = 0;
+ } else {
+ node_specific_cma_alloc = true;
+ }
+ }
+ /* Validate the CMA size again in case some invalid nodes specified. */
+ if (!parker_cma_size)
+ return;
+
+ if (parker_cma_size < SZ_1G) {
+ pr_warn("cma area should be at least 1 GiB\n");
+ parker_cma_size = 0;
+ return;
+ }
+
+ if (!node_specific_cma_alloc) {
+ /*
+ * If 3 GB area is requested on a machine with 4 numa nodes,
+ * let's allocate 1 GB on first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(parker_cma_size, nr_online_nodes);
+ pr_info("reserve CMA %llu MiB, up to %llu MiB per node\n",
+ parker_cma_size / SZ_1M, per_node / SZ_1M);
+ }
+
+ reserved = 0;
+ for_each_online_node(nid) {
+ int res;
+ char name[CMA_MAX_NAME];
+
+ if (node_specific_cma_alloc) {
+ if (parker_cma_size_in_node[nid] == 0)
+ continue;
+
+ size = parker_cma_size_in_node[nid];
+ } else {
+ size = min(per_node, parker_cma_size - reserved);
+ }
+
+ size = round_up(size, SZ_1G);
+
+ snprintf(name, sizeof(name), "parker%d", nid);
+ /*
+ * Note that 'order per bit' is based on smallest size that
+ * may be returned to CMA allocator in the case of
+ * huge page demotion.
+ */
+ res = cma_declare_contiguous_nid(0, size, 0,
+ SZ_1G,
+ ORDER_1G - PAGE_SHIFT, false, name,
+ &parker_cma[nid], nid);
+ if (res) {
+ pr_warn("reservation failed - err %d, node %d",
+ res, nid);
+ continue;
+ }
+
+ reserved += size;
+ pr_info("reserved %llu MiB on node %d\n",
+ size / SZ_1M, nid);
+
+ if (reserved >= parker_cma_size)
+ break;
+ }
+
+ if (!reserved)
+ /*
+ * parker_cma_size is used to determine if allocations from
+ * cma are possible. Set to zero if no cma regions are set up.
+ */
+ parker_cma_size = 0;
+}
+
+/* Make sure we don't overwrite initial_code too early */
+struct semaphore cpu_kick_semaphore;
+
+static void __noreturn parker_bsp_start(void)
+{
+ /* Let parker_start_kernel know we're here */
+ up(&cpu_kick_semaphore);
+
+ if (kexec_image)
+ machine_kexec(kexec_image);
+
+ /* Should never get here */
+ for (;;)
+ cpu_relax();
+}
+
+static void __noreturn parker_ap_wait(void)
+{
+ /* Let parker_start_kernel know we're here */
+ up(&cpu_kick_semaphore);
+
+ unsigned int cpu = smp_processor_id();
+ unsigned int apic_id = apic->cpu_present_to_apicid(cpu);
+
+ volatile struct parker_control_structure *pcs;
+ /*
+ * For now, use the global active control page. Eventually we can
+ * add a lookup from CPU -> control page.
+ */
+ pcs = page_address(parker_active_control_structure_page);
+
+ while (!READ_ONCE(pcs->start_address))
+ cpu_relax();
+ pr_debug("parker trampoline physical address %llx\n", pcs->start_address);
+ smp_mb();
+ u64 call_addr = 0;
+ /* There's no race condition on stack as we don't read the stack pointer again */
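+ /*
+ * %cr3 <- identity-mapped transition page table
+ * %rsp <- top of the dedicated transition stack
+ * %esi <- this CPU's APIC ID, handed to the startup code
+ * Finally, call the startup address read from the control page.
+ */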
+ asm volatile (
+ "mov (%1), %0\n\t"
+ "mov %3, %%rsp\n\t"
+ "mov %4, %%esi\n\t"
+ "mov %2, %%cr3\n\t"
+ ANNOTATE_RETPOLINE_SAFE
+ "call *%0\n\t"
+ : "+r" (call_addr)
+ : "r" (&pcs->start_address),
+ "r" (__sme_pa(transition_pagetable.pgd)),
+ "r" (__sme_pa(transition_pagetable.stack + PAGE_SIZE)),
+ "r" (apic_id)
+ : "esi", "rsp"
+ );
+
+ /* The startup code never returns here */
+ for (;;)
+ cpu_relax();
+}
+
+static void parker_host_ipicb(void)
+{
+ pr_info("OKK\n");
+}
+
+static void __init *alloc_pgt_page(void *dummy)
+{
+ return (void *)get_zeroed_page(GFP_ATOMIC);
+}
+
+static int __init init_transition_pgtable(pgd_t *pgd)
+{
+ pgprot_t prot = PAGE_KERNEL_EXEC_NOENC;
+ unsigned long vaddr, paddr;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ vaddr = (unsigned long)parker_ap_wait;
+ pgd += pgd_index(vaddr);
+ if (!pgd_present(*pgd)) {
+ p4d = (p4d_t *)alloc_pgt_page(NULL);
+ if (!p4d)
+ return -ENOMEM;
+ set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE));
+ }
+ p4d = p4d_offset(pgd, vaddr);
+ if (!p4d_present(*p4d)) {
+ pud = (pud_t *)alloc_pgt_page(NULL);
+ if (!pud)
+ return -ENOMEM;
+ set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE));
+ }
+ pud = pud_offset(p4d, vaddr);
+ if (!pud_present(*pud)) {
+ pmd = (pmd_t *)alloc_pgt_page(NULL);
+ if (!pmd)
+ return -ENOMEM;
+ set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
+ }
+ pmd = pmd_offset(pud, vaddr);
+ if (!pmd_present(*pmd)) {
+ pte = (pte_t *)alloc_pgt_page(NULL);
+ if (!pte)
+ return -ENOMEM;
+ set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
+ }
+ pte = pte_offset_kernel(pmd, vaddr);
+
+ paddr = __pa(vaddr);
+ set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, prot));
+
+ return 0;
+}
+
+/* Allocate intermediate trampoline pagetable, that has all physical memory
+ * mapped allowing us to reuse this for all parker kernel instantiations. */
+static int __init parker_host_transition_pagetable_init(void)
+{
+ struct x86_mapping_info info = {
+ .alloc_pgt_page = alloc_pgt_page,
+// .free_pgt_page = free_pgt_page,
+ .page_flag = __PAGE_KERNEL_LARGE_EXEC,
+ .kernpg_flag = _KERNPG_TABLE_NOENC,
+ };
+
+ pgd_t *pgd = alloc_pgt_page(NULL);
+ void *stack = alloc_pgt_page(NULL);
+
+ if (!pgd || !stack) {
+ free_page((unsigned long)pgd);
+ free_page((unsigned long)stack);
+ return -ENOMEM;
+ }
+
+ for (int i = 0; i < nr_pfn_mapped; i++) {
+ unsigned long mstart, mend;
+
+ mstart = pfn_mapped[i].start << PAGE_SHIFT;
+ mend = pfn_mapped[i].end << PAGE_SHIFT;
+ if (kernel_ident_mapping_init(&info, pgd, mstart, mend)) {
+ //kernel_ident_mapping_free(&info, pgd);
+ return -ENOMEM;
+ }
+ }
+
+ transition_pagetable.info = info;
+ transition_pagetable.pgd = pgd;
+ transition_pagetable.stack = stack;
+
+ return init_transition_pgtable(pgd);
+}
+static int __init parker_kernfs_init(void);
+
+/* Multi-kernel module code for Primary <-> secondary communication */
+static int __init parker_module_init(void)
+{
+ if (is_parker_instance())
+ return -ENODEV;
+ int ret = parker_kernfs_init();
+
+ if (ret)
+ return ret;
+ sema_init(&cpu_kick_semaphore, 0);
+ // TODO: Device registration for sysfs interface
+ // copying resctrl interface style with folder creation
+ // and deletion to create kernels.
+ pr_info("Multikernel module loading...\n");
+ // TODO: Custom
+ if (x86_platform_ipi_callback) {
+ pr_err("Platform callback exists\n");
+ return -ENODEV;
+ }
+ x86_platform_ipi_callback = parker_host_ipicb;
+ if (parker_host_transition_pagetable_init()) {
+ pr_err("transition pagetable init failed\n");
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+static void __exit parker_module_exit(void)
+{
+ pr_info("Multikernel exiting.\n");
+ //__free_pages(parker_control_page, 0);
+}
+
+/* Ensure global parker lock is held */
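+/*
+ * Boot sequence: record the PCI IDs and APIC IDs in the control page,
+ * kick each AP into parker_ap_wait() via INIT/SIPI, then start the BSP,
+ * which jumps into the kexec'd image; finally wait for the secondary
+ * kernel to flip the online flag.
+ */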
+static int parker_start_kernel(struct parker_kernel_entry *pke)
+{
+ struct parker_control_structure *pcs;
+ struct list_head *dev_elem;
+ int ret;
+
+ WRITE_ONCE(parker_active_control_structure_page, pke->control_structure_pages);
+ pcs = page_address(parker_active_control_structure_page);
+
+ if (!pcs)
+ return -EINVAL;
+
+ /* Add PCI device IDs to control structure */
+ list_for_each(dev_elem, &pke->list_devices) {
+ struct parker_kernel_device_entry *pkde;
+ struct pci_dev *pdev;
+ int pci_dev_index = pcs->num_pci_devs++;
+ pkde = container_of(dev_elem,
+ struct parker_kernel_device_entry,
+ list_entry);
+ pdev = to_pci_dev(pkde->dev);
+ pcs->pci_dev_ids[pci_dev_index] = pci_dev_id(pdev);
+ }
+
+ int bsp_cpu = -1, cpu, i = 0;
+
+ if (cpumask_empty(&pke->cpu_mask))
+ return -EINVAL;
+
+ /* The partitioned kernel's APs wait for the BSP to jump to the kernel's startup code */
+ for_each_cpu(cpu, &pke->cpu_mask) {
+ u32 apicid = apic->cpu_present_to_apicid(cpu);
+ pcs->apic_ids[i] = apicid;
+ ++pcs->num_cpus;
+ if (i++ == 0) {
+ bsp_cpu = cpu;
+ continue;
+ }
+
+ smpboot_control = cpu;
+ initial_code = (unsigned long)parker_ap_wait;
+ init_espfix_ap(cpu);
+ smp_mb();
+
+ pr_debug("parker AP %d %d\n", apicid, ret);
+ unsigned long start_ip = real_mode_header->trampoline_start;
+ ret = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ /* Continue on with errors for now */
+ if (ret) {
+ pr_err("Failed to start cpu %d\n", cpu);
+ --i;
+ --pcs->num_cpus;
+ continue;
+ }
+ /* Wait for CPU to wakeup and start executing AP wait function */
+ down(&cpu_kick_semaphore);
+ }
+
+ /* Start the partitioned kernel's BSP */
+ //mtrr_save_state();
+ u32 apicid = apic->cpu_present_to_apicid(bsp_cpu);
+ smpboot_control = bsp_cpu;
+ initial_code = (unsigned long)parker_bsp_start;
+ init_espfix_ap(bsp_cpu);
+ smp_mb();
+ unsigned long start_ip = real_mode_header->trampoline_start;
+ ret = wakeup_secondary_cpu_via_init(apicid, start_ip);
+ if (ret)
+ return ret;
+ down(&cpu_kick_semaphore);
+
+ /* Wait for partitioned kernel to start */
+ while (!READ_ONCE(pcs->online))
+ cpu_relax();
+
+ return 0;
+}
+
+static bool parker_kernel_is_online(struct parker_kernel_entry *pke)
+{
+ struct parker_control_structure *pcs;
+ pcs = page_address(pke->control_structure_pages);
+ return READ_ONCE(pcs->online);
+}
+
+/*
+ * The parker kernfs interface proper: a new filesystem mounted at
+ * /sys/fs/parker.
+ */
+/* The filesystem can only be mounted once. */
+/* TODO: Deal with recovery of structures if unmounted */
+/* Forward declarations */
+static int parker_get_tree(struct fs_context *fc);
+static int parker_init_fs_context(struct fs_context *fc);
+static void parker_fs_context_free(struct fs_context *fc);
+static void parker_kill_sb(struct super_block *sb);
+static int parker_kn_set_ugid(struct kernfs_node *kn);
+
+/* Mutex to protect parker access. */
+DEFINE_MUTEX(parker_mutex);
+atomic_t parker_kernels = ATOMIC_INIT(0);
+static bool parker_mounted;
+/* All CPUs belonging to secondary kernels */
+static struct cpumask parker_cpus;
+
+struct parker {
+ struct kernfs_node *kn;
+ /* TODO: control structures etc... */
+};
+
+struct parker_file_type {
+ char *name;
+ umode_t mode;
+ const struct kernfs_ops *kf_ops;
+
+ int (*seq_show)(struct kernfs_open_file *of, struct seq_file *sf, void *v);
+
+ ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off);
+};
+
+static int parker_add_files(struct kernfs_node *kn, struct parker_file_type *pfts, int len);
+
+static int parker_seqfile_show(struct seq_file *m, void *arg)
+{
+ struct kernfs_open_file *of = m->private;
+ struct parker_file_type *pft = of->kn->priv;
+
+ if (pft->seq_show)
+ return pft->seq_show(of, m, arg);
+
+ return 0;
+}
+
+static ssize_t parker_file_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct parker_file_type *pft = of->kn->priv;
+
+ if (pft->write)
+ return pft->write(of, buf, nbytes, off);
+
+ return -EINVAL;
+}
+
+static const struct kernfs_ops parker_kf_ops = {
+ .atomic_write_len = PAGE_SIZE,
+ .write = parker_file_write,
+ .seq_show = parker_seqfile_show,
+};
+
+/* List of attributes in root - currently none */
+static struct parker_file_type root_attributes[] = {};
+
+static int parker_kernel_index_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ mutex_lock(&pke->mutex);
+ seq_printf(seq, "%u\n", pke->id);
+ mutex_unlock(&pke->mutex);
+ return 0;
+}
+
+static int parker_kernel_control_structure_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ mutex_lock(&parker_mutex);
+ seq_printf(seq, "0x%llx\n", page_to_phys(pke->control_structure_pages));
+ mutex_unlock(&parker_mutex);
+ return 0;
+}
+
+
+static int parker_kernel_online_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ bool online;
+
+ mutex_lock(&pke->mutex);
+ online = parker_kernel_is_online(pke);
+ seq_printf(seq, "%u\n", online);
+ mutex_unlock(&pke->mutex);
+ return 0;
+}
+
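+/* Any write attempts to start the kernel; the written value is currently ignored. */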
+static ssize_t parker_kernel_online_write(struct kernfs_open_file *of,
+ char *buf,
+ size_t nbytes, loff_t off)
+{
+ int ret;
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+
+ mutex_lock(&parker_mutex);
+ mutex_lock(&pke->mutex);
+
+ ret = parker_start_kernel(pke);
+ /* Only set online if the second kernel successfully started */
+ if (!ret)
+ pke->online = true;
+
+ mutex_unlock(&pke->mutex);
+ mutex_unlock(&parker_mutex);
+
+ return ret ?: nbytes;
+}
+
+static int parker_kernel_cpus_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ mutex_lock(&pke->mutex);
+ seq_printf(seq, "%*pbl\n", cpumask_pr_args(&pke->cpu_mask));
+ mutex_unlock(&pke->mutex);
+ return 0;
+}
+
+static ssize_t parker_kernel_cpus_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ cpumask_var_t tmpmask, newmask;
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ int cpu, ret;
+
+ if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) {
+ free_cpumask_var(tmpmask);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&parker_mutex);
+ mutex_lock(&pke->mutex);
+ ret = cpulist_parse(buf, newmask);
+ if (ret)
+ goto out;
+
+ /* Check if any CPUs belong to another parker kernel */
+ cpumask_and(tmpmask, newmask, &parker_cpus);
+ if (!cpumask_empty(tmpmask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If CPUs are currently online, offline them */
+ cpumask_and(tmpmask, newmask, cpu_online_mask);
+ if (!cpumask_empty(tmpmask)) {
+ for_each_cpu(cpu, tmpmask)
+ remove_cpu(cpu);
+ }
+
+ cpumask_or(&parker_cpus, &parker_cpus, newmask);
+ cpumask_copy(&pke->cpu_mask, newmask);
+out:
+ free_cpumask_var(tmpmask);
+ free_cpumask_var(newmask);
+ mutex_unlock(&pke->mutex);
+ mutex_unlock(&parker_mutex);
+ return ret ?: nbytes;
+}
+
+static int parker_kernel_memory_show(struct kernfs_open_file *of,
+ struct seq_file *seq, void *v)
+{
+ int ret;
+
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ mutex_lock(&pke->mutex);
+ if (!pke->physical_memory_pages) {
+ ret = -EINVAL;
+ goto out;
+ }
+ phys_addr_t base = page_to_phys(pke->physical_memory_pages);
+ unsigned long long size = pke->physical_memory_page_count * PAGE_SIZE;
+ seq_printf(seq, "%llu@...llx\n", size, base);
+ ret = 0;
+out:
+ mutex_unlock(&pke->mutex);
+ return ret;
+}
+
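+/* Accepts a size with an optional K/M/G suffix, parsed with memparse(). */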
+static ssize_t parker_kernel_memory_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ struct page *result;
+ int ret, memory_nid = NUMA_NO_NODE;
+ char *end;
+
+ unsigned long long size;
+ unsigned long page_count;
+
+ mutex_lock(&pke->mutex);
+ size = memparse(buf, &end);
+ if (!size) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Ensure write is fully parsed */
+ if (*end != '\0' && *end != '\n') {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* We need a CPU to determine which NUMA node to allocate memory on */
+ if (cpumask_empty(&pke->cpu_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* Get NUMA node for first cpu (BSP) */
+ memory_nid = cpu_to_node(cpumask_first(&pke->cpu_mask));
+
+ if (pke->physical_memory_pages) {
+ if (!cma_release(parker_cma[memory_nid],
+ pke->physical_memory_pages,
+ pke->physical_memory_page_count)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ }
+
+ /* Assume size is page aligned; if not, the secondary kernel loses the remainder */
+ page_count = size >> PAGE_SHIFT;
+ result = cma_alloc(parker_cma[memory_nid], page_count, 0, false);
+
+ if (!result) {
+ ret = -ENOMEM;
+ pke->physical_memory_pages = NULL;
+ pke->physical_memory_page_count = 0;
+ goto out;
+ }
+
+ if (!cma_pages_valid(parker_cma[memory_nid], result, page_count)) {
+ ret = -EINVAL;
+ if (!cma_release(parker_cma[memory_nid], result, page_count))
+ pr_err("Failed to release invalid allocation.");
+ goto out;
+
+ }
+
+ pke->physical_memory_pages = result;
+ pke->physical_memory_page_count = page_count;
+ ret = 0;
+out:
+ mutex_unlock(&pke->mutex);
+ return ret ?: nbytes;
+}
+
+/* TODO: Consider implementation where we bind to pci-stub instead - avoid rescanning problem? */
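+/* Takes a PCI device name as known to the bus, e.g. "0000:3a:00.0". */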
+static ssize_t parker_kernel_bind_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ struct kernfs_node *dev_kn;
+ struct device *dev;
+ int ret = -ENODEV;
+
+ mutex_lock(&pke->mutex);
+ dev = bus_find_device_by_name(&pci_bus_type, NULL, buf);
+ /* Remove from bus to prevent anyone from using it */
+ if (dev) {
+ struct parker_kernel_device_entry *pkde;
+ /*
+ * If the device is already disabled it may be owned by another
+ * kernel; only claim enabled devices.
+ */
+ if (!pci_is_enabled(to_pci_dev(dev))) {
+ put_device(dev);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ pkde = kzalloc(sizeof(*pkde), GFP_KERNEL);
+ if (!pkde) {
+ put_device(dev);
+ ret = -ENOMEM;
+ goto out;
+ }
+ pkde->dev = dev;
+ dev_kn = kernfs_create_dir(pke->kn_devices, dev_name(dev),
+ pke->kn_devices->mode, pkde);
+ if (IS_ERR(dev_kn)) {
+ kfree(pkde);
+ put_device(dev);
+ ret = PTR_ERR(dev_kn);
+ goto out;
+ }
+ /* Still used after kernfs_remove() in the unbind and rmdir paths */
+ kernfs_get(dev_kn);
+ pkde->kn = dev_kn;
+ list_add_tail(&pkde->list_entry, &pke->list_devices);
+ kernfs_activate(dev_kn);
+ pci_bus_type.remove(dev);
+ ret = 0;
+ }
+
+out:
+ mutex_unlock(&pke->mutex);
+ return ret ?: nbytes;
+}
+
+static ssize_t parker_kernel_unbind_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct parker_kernel_entry *pke = of->kn->parent->priv;
+ struct parker_kernel_device_entry *pkde;
+ struct kernfs_node *dev_kn;
+ struct device *dev;
+ int ret = -ENODEV;
+
+ mutex_lock(&pke->mutex);
+ dev = bus_find_device_by_name(&pci_bus_type, NULL, buf);
+ /* Give the device back to the bus driver */
+ if (dev) {
+ /* Check if device is claimed by kernel */
+ dev_kn = kernfs_find_and_get(pke->kn_devices, dev_name(dev));
+ if (!dev_kn) {
+ put_device(dev);
+ goto out;
+ }
+
+ /* Ensure PCI device isn't enabled */
+ if (pci_is_enabled(to_pci_dev(dev))) {
+ put_device(dev);
+ kernfs_put(dev_kn);
+ goto out;
+ }
+ pkde = dev_kn->priv;
+
+ ret = pci_bus_type.probe(dev);
+ put_device(dev);
+ kernfs_remove(dev_kn);
+ /* One reference from getting above, one from device subdir creation */
+ kernfs_put(dev_kn);
+ kernfs_put(dev_kn);
+ list_del(&pkde->list_entry);
+ kfree(pkde);
+ }
+
+out:
+ mutex_unlock(&pke->mutex);
+ return ret ?: nbytes;
+}
+
+/* Secondary kernel attributes */
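+/*
+ * Together these form a per-kernel directory, e.g.
+ * /sys/fs/parker/<name>/{id,control_structure,cpus,memory,bind,unbind,online},
+ * alongside the devices/ subdirectory populated via bind.
+ */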
+static struct parker_file_type per_kernel_attributes[] = {
+ /* Passed to secondary kernel to identify */
+ {
+ .name = "id",
+ .mode = 0644,
+ .kf_ops = &parker_kf_ops,
+ .seq_show = parker_kernel_index_show,
+ },
+ {
+ .name = "control_structure",
+ .mode = 0644,
+ .kf_ops = &parker_kf_ops,
+ .seq_show = parker_kernel_control_structure_show,
+ },
+ {
+ .name = "cpus",
+ .mode = 0644,
+ .kf_ops = &parker_kf_ops,
+ .seq_show = parker_kernel_cpus_show,
+ .write = parker_kernel_cpus_write,
+ },
+ /* Add per numa node memory? */
+ {
+ .name = "memory",
+ .mode = 0644,
+ .kf_ops = &parker_kf_ops,
+ .seq_show = parker_kernel_memory_show,
+ .write = parker_kernel_memory_write,
+ },
+ {
+ .name = "bind",
+ .mode = 0644,
+ .kf_ops = &parker_kf_ops,
+ .write = parker_kernel_bind_write,
+ },
+ {
+ .name = "unbind",
+ .mode = 0644,
+ .kf_ops = &parker_kf_ops,
+ .write = parker_kernel_unbind_write,
+ },
+ /* TODO: is status better? */
+ {
+ .name = "online",
+ .mode = 0644,
+ .kf_ops = &parker_kf_ops,
+ .seq_show = parker_kernel_online_show,
+ .write = parker_kernel_online_write,
+ },
+};
+
+struct parker_fs_context {
+ struct kernfs_fs_context kfc;
+};
+
+static int parker_setup_root(struct parker_fs_context *ctx);
+static void parker_destroy_root(void);
+
+static struct kernfs_root *parker_root;
+struct parker parker_default;
+
+static const struct fs_context_operations parker_fs_context_ops = {
+ .free = parker_fs_context_free,
+ .get_tree = parker_get_tree,
+};
+
+static struct file_system_type parker_fs_type = {
+ .name = "parker",
+ .init_fs_context = parker_init_fs_context,
+ .kill_sb = parker_kill_sb,
+};
+
+
+static int parker_kernel_entry_destroy(struct parker_kernel_entry *pke)
+{
+ int cpu, memory_nid = NUMA_NO_NODE, ret = 0;
+ struct list_head *dev_elem, *n;
+
+ /* Bring back parker CPUs */
+ for_each_cpu(cpu, &pke->cpu_mask) {
+ add_cpu(cpu);
+ if (memory_nid == NUMA_NO_NODE)
+ memory_nid = cpu_to_node(cpu);
+ }
+ cpumask_andnot(&parker_cpus, &parker_cpus, &pke->cpu_mask);
+
+ /* Free memory allocated */
+ if (pke->physical_memory_page_count > 0 &&
+ memory_nid != NUMA_NO_NODE &&
+ !cma_release(parker_cma[memory_nid],
+ pke->physical_memory_pages,
+ pke->physical_memory_page_count)) {
+ ret = -EBUSY;
+ }
+
+ for (int i = 0; i < pke->control_structure_page_count; ++i) {
+ __free_pages(pke->control_structure_pages + i, 0);
+ }
+
+ /* Unclaim PCI devices */
+ list_for_each_safe(dev_elem, n, &pke->list_devices) {
+ struct parker_kernel_device_entry *pkde;
+ pkde = container_of(dev_elem,
+ struct parker_kernel_device_entry,
+ list_entry);
+ ret = pci_bus_type.probe(pkde->dev);
+ if (ret)
+ continue;
+ put_device(pkde->dev);
+ kernfs_remove(pkde->kn);
+ kernfs_put(pkde->kn);
+ kfree(pkde);
+ }
+
+ atomic_dec(&parker_kernels);
+ mutex_destroy(&pke->mutex);
+ if (pke->kn)
+ kernfs_put(pke->kn);
+ kfree(pke);
+
+ return ret;
+}
+
+static int parker_kernel_control_structure_alloc(struct parker_kernel_entry *pke)
+{
+ pke->control_structure_pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
+ if (!pke->control_structure_pages)
+ return -ENOMEM;
+
+ pke->control_structure_page_count = 1;
+ return 0;
+}
+
+static int parker_kernel_entry_init(struct parker_kernel_entry *pke)
+{
+ struct kernfs_node *kn;
+ int ret;
+ // Also allocate any secondary structures?
+
+ ret = parker_kernel_control_structure_alloc(pke);
+ if (ret)
+ return ret;
+
+ pke->id = atomic_inc_return(&parker_kernels);
+ pke->online = false;
+ mutex_init(&pke->mutex);
+ INIT_LIST_HEAD(&pke->list_devices);
+
+ kn = kernfs_create_dir(pke->kn, "devices", pke->kn->mode, pke);
+ if (IS_ERR(kn)) {
+ __free_pages(pke->control_structure_pages, 0);
+ atomic_dec(&parker_kernels);
+ mutex_destroy(&pke->mutex);
+ return PTR_ERR(kn);
+ }
+ pke->kn_devices = kn;
+
+ return 0;
+}
+
+static int parker_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode)
+{
+ int ret = 0;
+ struct parker_kernel_entry *pke;
+ struct kernfs_node *kn;
+
+ /* Only allow creation from within root directory */
+ if (parent_kn != parker_default.kn)
+ return -EINVAL;
+
+ if (strchr(name, '\n'))
+ return -EINVAL;
+
+ mutex_lock(&parker_mutex);
+ pke = kzalloc(sizeof(*pke), GFP_KERNEL);
+ if (!pke) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ kn = kernfs_create_dir(parent_kn, name, mode, pke);
+ if (IS_ERR(kn)) {
+ ret = PTR_ERR(kn);
+ goto out_free_pke;
+ }
+ pke->kn = kn;
+
+ ret = parker_kernel_entry_init(pke);
+ if (ret) {
+ kernfs_remove(pke->kn);
+ goto out_free_pke;
+ }
+
+ /* As we will use pke after kernfs_remove */
+ kernfs_get(pke->kn);
+
+ ret = parker_kn_set_ugid(kn);
+ if (ret) {
+ goto out_destroy;
+ }
+
+ ret = parker_add_files(kn, per_kernel_attributes, ARRAY_SIZE(per_kernel_attributes));
+ if (ret) {
+ goto out_destroy;
+ }
+
+ kernfs_activate(kn);
+ goto out_unlock;
+
+out_destroy:
+ kernfs_remove(pke->kn);
+ kernfs_put(pke->kn);
+out_free_pke:
+ kfree(pke);
+out_unlock:
+ mutex_unlock(&parker_mutex);
+ return ret;
+}
+
+static int parker_rmdir(struct kernfs_node *kn)
+{
+ struct parker_kernel_entry *pke = kn->priv;
+ int ret = 0;
+
+ /* Only handle rmdir of kernel */
+ if (pke->kn != kn) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (parker_kernel_is_online(pke)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* First remove, ensuring no new operations */
+ mutex_lock(&pke->mutex);
+ kernfs_remove_self(kn);
+ mutex_unlock(&pke->mutex);
+
+ ret = parker_kernel_entry_destroy(pke);
+out:
+ return ret;
+}
+
+static struct kernfs_syscall_ops parker_kf_syscall_ops = {
+ .mkdir = parker_mkdir,
+ .rmdir = parker_rmdir,
+};
+
+static inline struct parker_fs_context *parker_fc2context(struct fs_context *fc)
+{
+ struct kernfs_fs_context *kfc = fc->fs_private;
+
+ return container_of(kfc, struct parker_fs_context, kfc);
+}
+
+static int parker_kn_set_ugid(struct kernfs_node *kn)
+{
+ struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+ .ia_uid = current_fsuid(),
+ .ia_gid = current_fsgid(), };
+
+ if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+ gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+ return 0;
+
+ return kernfs_setattr(kn, &iattr);
+}
+
+static int parker_add_file(struct kernfs_node *parent_kn,
+ struct parker_file_type *pft)
+{
+ struct kernfs_node *kn;
+ int ret;
+
+ kn = __kernfs_create_file(parent_kn, pft->name, pft->mode,
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+ 0, pft->kf_ops, pft, NULL, NULL);
+ if (IS_ERR(kn))
+ return PTR_ERR(kn);
+
+ ret = parker_kn_set_ugid(kn);
+ if (ret) {
+ kernfs_remove(kn);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int parker_add_files(struct kernfs_node *kn, struct parker_file_type *pfts, int len)
+{
+ struct parker_file_type *pft;
+ int ret;
+
+ lockdep_assert_held(&parker_mutex);
+
+ for (pft = pfts; pft < pfts + len; pft++) {
+ ret = parker_add_file(kn, pft);
+ if (ret)
+ goto error;
+ }
+
+ return 0;
+error:
+ pr_warn("Failed to add %s, err=%d\n", pft->name, ret);
+ while (--pft >= pfts) {
+ kernfs_remove_by_name(kn, pft->name);
+ }
+ return ret;
+}
+
+
+static int parker_init_fs_context(struct fs_context *fc)
+{
+ struct parker_fs_context *ctx;
+ ctx = kzalloc(sizeof(struct parker_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->kfc.magic = PARKER_SUPER_MAGIC;
+ fc->fs_private = &ctx->kfc;
+ fc->ops = &parker_fs_context_ops;
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(&init_user_ns);
+ fc->global = true;
+ return 0;
+}
+
+static int parker_get_tree(struct fs_context *fc)
+{
+ struct parker_fs_context *ctx = parker_fc2context(fc);
+ int ret = 0;
+
+ mutex_lock(&parker_mutex);
+ if (parker_mounted) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ /* The filesystem was unmounted but kernels weren't cleaned up; reactivate the last root */
+ if (parker_default.kn) {
+ ctx->kfc.root = parker_root;
+ goto activate_root;
+ }
+
+ ret = parker_setup_root(ctx);
+ if (ret)
+ goto out;
+
+ ret = parker_add_files(parker_default.kn, root_attributes, ARRAY_SIZE(root_attributes));
+ if (ret < 0)
+ goto destroy_root;
+
+activate_root:
+ kernfs_activate(parker_default.kn);
+ ret = kernfs_get_tree(fc);
+ if (ret < 0)
+ goto destroy_root;
+ parker_mounted = true;
+out:
+ mutex_unlock(&parker_mutex);
+ return ret;
+
+destroy_root:
+ parker_destroy_root();
+ goto out;
+}
+
+static void parker_fs_context_free(struct fs_context *fc)
+{
+ struct parker_fs_context *ctx = parker_fc2context(fc);
+
+ kernfs_free_fs_context(fc);
+ kfree(ctx);
+}
+
+static void parker_kill_sb(struct super_block *sb)
+{
+ mutex_lock(&parker_mutex);
+ parker_mounted = false;
+
+ /* Only destroy root if no kernels are still declared */
+ if (atomic_read(&parker_kernels) == 0) {
+ parker_destroy_root();
+ }
+
+ kernfs_kill_sb(sb);
+ mutex_unlock(&parker_mutex);
+}
+
+static void parker_destroy_root(void)
+{
+ kernfs_destroy_root(parker_root);
+ parker_default.kn = NULL;
+}
+
+static int parker_setup_root(struct parker_fs_context *ctx)
+{
+ parker_root = kernfs_create_root(
+ &parker_kf_syscall_ops,
+ KERNFS_ROOT_CREATE_DEACTIVATED | KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
+ &parker_default);
+
+ if (IS_ERR(parker_root))
+ return PTR_ERR(parker_root);
+
+ ctx->kfc.root = parker_root;
+ parker_default.kn = kernfs_root_to_node(parker_root);
+
+ return 0;
+}
+
+/* Prevent us from onlining CPUs provisioned to parker instance */
+static int parker_cpu_offline_startup(unsigned int cpu)
+{
+ int ret;
+
+ mutex_lock(&parker_mutex);
+ ret = cpumask_test_cpu(cpu, &parker_cpus) ? -EINVAL : 0;
+ mutex_unlock(&parker_mutex);
+
+ return ret;
+}
+
+static int __init parker_kernfs_init(void)
+{
+ int ret = 0;
+
+ if (!parker_cma_size) {
+ pr_err("No parker CMA regions allocated, disabling parker.");
+ return -ENOENT;
+ }
+
+ ret = sysfs_create_mount_point(fs_kobj, "parker");
+ if (ret)
+ return ret;
+
+ ret = register_filesystem(&parker_fs_type);
+ if (ret)
+ goto cleanup_mountpoint;
+
+ ret = cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "parker", parker_cpu_offline_startup, NULL);
+ if (ret < 0)
+ goto cleanup_filesystem;
+
+ return ret;
+cleanup_filesystem:
+ unregister_filesystem(&parker_fs_type);
+cleanup_mountpoint:
+ sysfs_remove_mount_point(fs_kobj, "parker");
+ return ret;
+}
+
+module_init(parker_module_init);
+module_exit(parker_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Thom Hughes");
+MODULE_DESCRIPTION("Parker linux host module.");
diff --git a/include/linux/parker.h b/include/linux/parker.h
new file mode 100644
index 000000000000..4984aefcee0f
--- /dev/null
+++ b/include/linux/parker.h
@@ -0,0 +1,7 @@
+#ifndef _LINUX_PARKER_H
+#define _LINUX_PARKER_H
+#ifdef CONFIG_PARKER
+
+#endif /* CONFIG_PARKER */
+#endif /* _LINUX_PARKER_H */
diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h
index bb575f3ab45e..25658054e3a7 100644
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -38,6 +38,7 @@
#define OVERLAYFS_SUPER_MAGIC 0x794c7630
#define FUSE_SUPER_MAGIC 0x65735546
#define BCACHEFS_SUPER_MAGIC 0xca451a4e
+#define PARKER_SUPER_MAGIC 0x5041524b /* "PARK" */
#define MINIX_SUPER_MAGIC 0x137F /* minix v1 fs, 14 char names */
#define MINIX_SUPER_MAGIC2 0x138F /* minix v1 fs, 30 char names */
--
2.39.5