[<prev] [next>] [day] [month] [year] [list]
Message-ID: <20250417094432.3690181-1-mclapinski@google.com>
Date: Thu, 17 Apr 2025 11:44:32 +0200
From: Michal Clapinski <mclapinski@...gle.com>
To: Pasha Tatashin <pasha.tatashin@...een.com>, Dan Williams <dan.j.williams@...el.com>,
Vishal Verma <vishal.l.verma@...el.com>, Dave Jiang <dave.jiang@...el.com>,
Ira Weiny <ira.weiny@...el.com>, Jonathan Corbet <corbet@....net>
Cc: nvdimm@...ts.linux.dev, linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org, Michal Clapinski <mclapinski@...gle.com>
Subject: [PATCH 1/1] libnvdimm/e820: Add a new parameter to configure many
regions per e820 entry
Currently, the user has to specify each memory region to be used with
nvdimm via the memmap parameter. Due to the character limit of the
command line, this makes it impossible to have a lot of pmem devices.
This new parameter solves this issue by allowing users to divide
one e820 entry into many nvdimm regions.
This change is needed for hypervisor live update: VMs' memory will
be backed by these emulated pmem devices. To support various VM shapes,
I want to create devdax devices at 1GB granularity, similar to hugetlb.
It's also possible to expand this parameter in the future,
e.g. to specify the type of the device (fsdax/devdax).
Signed-off-by: Michal Clapinski <mclapinski@...gle.com>
---
.../admin-guide/kernel-parameters.txt | 7 +
drivers/nvdimm/e820.c | 149 +++++++++++++++++-
2 files changed, 153 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb8752b42ec85..63af03eb850ed 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3849,6 +3849,13 @@
n2= [NET] SDL Inc. RISCom/N2 synchronous serial card
+ nd_e820.pmem=ss[KMG],nn[KMG]
+ Divide the e820 entry specified by memmap=x!ss
+ (that is, the entry starting at ss) into pmem devices
+ of size nn each. Only one pmem parameter may be given
+ per e820 entry. The size of the e820 entry has to be
+ divisible by the device size nn.
+
netdev= [NET] Network devices parameters
Format: <irq>,<io>,<mem_start>,<mem_end>,<name>
Note that mem_start is often overloaded to mean
diff --git a/drivers/nvdimm/e820.c b/drivers/nvdimm/e820.c
index 41c67dfa80158..581fe01553a22 100644
--- a/drivers/nvdimm/e820.c
+++ b/drivers/nvdimm/e820.c
@@ -8,6 +8,87 @@
#include <linux/libnvdimm.h>
#include <linux/module.h>
#include <linux/numa.h>
+#include <linux/moduleparam.h>
+#include <linux/xarray.h>
+
+/* Capacity of the pmem[] array of stored "pmem" parameter strings. */
+#define MAX_PMEM_ARGUMENTS 32
+
+/* One slot per "pmem" argument, filled in order by pmem_param_set(). */
+static char *pmem[MAX_PMEM_ARGUMENTS];
+/* Number of pmem[] slots currently in use. */
+static int pmem_count;
+
+/*
+ * pmem_param_set - .set handler for the repeatable "pmem" module parameter
+ * @arg: raw argument text of this occurrence of the parameter
+ * @kp: descriptor of the registered parameter; only its name is used
+ *
+ * Each occurrence is stored through param_set_charp() into the next free
+ * slot of pmem[]. A temporary kernel_param pointing at that slot is used
+ * because the parameter is registered without a backing variable.
+ *
+ * Return: 0 on success, -ENOSPC if all MAX_PMEM_ARGUMENTS slots are in use,
+ * otherwise the error returned by param_set_charp().
+ */
+static int pmem_param_set(const char *arg, const struct kernel_param *kp)
+{
+	/* Zero every field we do not set explicitly. */
+	struct kernel_param kp_new = { .name = kp->name };
+	int rc;
+
+	/* Refuse to write past the end of pmem[]. */
+	if (pmem_count >= MAX_PMEM_ARGUMENTS) {
+		pr_err("Too many pmem arguments (max %d)\n", MAX_PMEM_ARGUMENTS);
+		return -ENOSPC;
+	}
+
+	kp_new.arg = &pmem[pmem_count];
+	rc = param_set_charp(arg, &kp_new);
+	if (rc)
+		return rc;
+
+	++pmem_count;
+	return 0;
+}
+
+/*
+ * pmem_param_free - .free handler for the "pmem" module parameter
+ * @arg: unused
+ *
+ * Hands every used pmem[] slot to param_free_charp() and resets the
+ * slot counter to zero.
+ */
+static void pmem_param_free(void *arg)
+{
+	char **slot = pmem;
+	char **const end = pmem + pmem_count;
+
+	while (slot < end) {
+		param_free_charp(slot);
+		++slot;
+	}
+
+	pmem_count = 0;
+}
+
+/* Callbacks for the "pmem" parameter: custom set and free handlers. */
+static const struct kernel_param_ops pmem_param_ops = {
+ .set = pmem_param_set,
+ .free = pmem_param_free,
+};
+/* Registered with a NULL arg and perm 0; values are kept in pmem[] instead. */
+module_param_cb(pmem, &pmem_param_ops, NULL, 0);
+
+/* Value stored in the pmem xarray, indexed by the start given in a pmem argument. */
+struct pmem_entry {
+ unsigned long region_size; /* size of each region the e820 entry is split into */
+};
+
+/*
+ * parse_one_pmem_arg - parse one "ss[KMG],nn[KMG]" pmem argument
+ * @xarray: map from e820 entry start address to struct pmem_entry
+ * @p: argument string to parse
+ *
+ * On success a newly allocated pmem_entry holding the region size nn is
+ * inserted into @xarray at index ss; the xarray then owns the entry.
+ * Anything following the region size is reported with a warning and ignored.
+ *
+ * Return: 0 on success, -ENOMEM if the entry cannot be allocated, -EINVAL
+ * for a malformed argument, a zero region size or a start address that
+ * already has an entry, otherwise the error from the xarray insertion.
+ */
+static int parse_one_pmem_arg(struct xarray *xarray, char *p)
+{
+	int rc = -EINVAL;
+	char *oldp;
+	unsigned long start;
+	struct pmem_entry *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+	if (!entry)
+		return -ENOMEM;
+
+	oldp = p;
+	start = memparse(p, &p);
+	if (p == oldp || *p != ',') {
+		pr_err("Can't parse pmem start: %s\n", oldp);
+		goto err;
+	}
+	++p;
+
+	oldp = p;
+	entry->region_size = memparse(p, &p);
+	if (p == oldp || (*p != ',' && *p != '\0')) {
+		pr_err("Can't parse pmem region size: %s\n", oldp);
+		goto err;
+	}
+
+	/* A zero size would later be used as a divisor when splitting. */
+	if (!entry->region_size) {
+		pr_err("pmem region size must not be zero: %s\n", oldp);
+		goto err;
+	}
+
+	if (*p != '\0')
+		pr_warn("Unexpected parameters in pmem arg: %s\n", p);
+
+	/*
+	 * xa_insert() refuses to replace an existing entry; xa_store() would
+	 * overwrite it and leak the allocation made for the earlier argument.
+	 */
+	rc = xa_insert(xarray, start, entry, GFP_KERNEL);
+	if (rc == -EBUSY) {
+		pr_err("Duplicate pmem start 0x%lx\n", start);
+		/* Report as a bad argument, like the other parse failures. */
+		rc = -EINVAL;
+		goto err;
+	}
+	if (rc) {
+		pr_err("Failed to store 0x%lx in xarray, error %d\n", start, rc);
+		goto err;
+	}
+	return 0;
+
+err:
+	kfree(entry);
+	return rc;
+}
static void e820_pmem_remove(struct platform_device *pdev)
{
@@ -16,10 +97,9 @@ static void e820_pmem_remove(struct platform_device *pdev)
nvdimm_bus_unregister(nvdimm_bus);
}
-static int e820_register_one(struct resource *res, void *data)
+static int register_one_pmem(struct resource *res, struct nvdimm_bus *nvdimm_bus)
{
struct nd_region_desc ndr_desc;
- struct nvdimm_bus *nvdimm_bus = data;
int nid = phys_to_target_node(res->start);
memset(&ndr_desc, 0, sizeof(ndr_desc));
@@ -32,12 +112,64 @@ static int e820_register_one(struct resource *res, void *data)
return 0;
}
+/* Context handed to e820_handle_one_entry() through the walk's void *data. */
+struct walk_data {
+ struct xarray *pmem_xarray; /* looked up by resource start address */
+ struct nvdimm_bus *nvdimm_bus; /* bus passed on to register_one_pmem() */
+};
+
+/*
+ * e820_handle_one_entry - resource-walk callback that registers pmem for @res
+ * @res: resource currently being visited
+ * @data: struct walk_data carrying the pmem xarray and the nvdimm bus
+ *
+ * If the xarray has no entry for @res->start, the whole resource is
+ * registered as a single region. Otherwise the resource is split into
+ * consecutive regions of the configured size, each registered on its own.
+ *
+ * Return: 0 on success, -EINVAL if the configured region size is zero or
+ * does not divide the resource size, otherwise the first error returned by
+ * register_one_pmem().
+ */
+static int e820_handle_one_entry(struct resource *res, void *data)
+{
+	struct walk_data *walk_data = data;
+	/* Zero the fields we do not set so no stack garbage is passed on. */
+	struct resource res_local = { .start = res->start };
+	struct pmem_entry *entry;
+	unsigned long entry_size = resource_size(res);
+	unsigned long nr_regions;
+	unsigned long i;
+	int rc;
+
+	entry = xa_load(walk_data->pmem_xarray, res->start);
+
+	/* @data is the walk_data wrapper, so pass the bus it carries. */
+	if (!entry)
+		return register_one_pmem(res, walk_data->nvdimm_bus);
+
+	/* Guard the modulo and division below. */
+	if (!entry->region_size) {
+		pr_err("Region size must not be zero\n");
+		return -EINVAL;
+	}
+
+	if (entry_size % entry->region_size != 0) {
+		pr_err("Entry size %lu is not divisible by region size %lu\n",
+		       entry_size, entry->region_size);
+		return -EINVAL;
+	}
+
+	/* Count regions up front so the loop cannot run past a wrapped end. */
+	nr_regions = entry_size / entry->region_size;
+	for (i = 0; i < nr_regions; i++) {
+		res_local.end = res_local.start + entry->region_size - 1;
+
+		rc = register_one_pmem(&res_local, walk_data->nvdimm_bus);
+		if (rc)
+			return rc;
+
+		res_local.start += entry->region_size;
+	}
+
+	return 0;
+}
+
+/*
+ * free_pmem_xarray - release everything held by the pmem xarray
+ * @pmem_xarray: xarray whose stored pmem_entry objects are freed
+ *
+ * Frees each stored entry with kfree(), then destroys the xarray itself.
+ */
+static void free_pmem_xarray(struct xarray *pmem_xarray)
+{
+	struct pmem_entry *cur;
+	unsigned long idx;
+
+	xa_for_each(pmem_xarray, idx, cur)
+		kfree(cur);
+
+	xa_destroy(pmem_xarray);
+}
+
static int e820_pmem_probe(struct platform_device *pdev)
{
static struct nvdimm_bus_descriptor nd_desc;
struct device *dev = &pdev->dev;
struct nvdimm_bus *nvdimm_bus;
+ struct xarray pmem_xarray;
+ struct walk_data walk_data = {.pmem_xarray = &pmem_xarray};
int rc = -ENXIO;
+ int i;
nd_desc.provider_name = "e820";
nd_desc.module = THIS_MODULE;
@@ -46,8 +178,19 @@ static int e820_pmem_probe(struct platform_device *pdev)
goto err;
platform_set_drvdata(pdev, nvdimm_bus);
+ xa_init(&pmem_xarray);
+ for (i = 0; i < pmem_count; i++) {
+ rc = parse_one_pmem_arg(&pmem_xarray, pmem[i]);
+ if (rc != 0 && rc != -EINVAL) {
+ free_pmem_xarray(&pmem_xarray);
+ goto err;
+ }
+ }
+
+ walk_data.nvdimm_bus = nvdimm_bus;
rc = walk_iomem_res_desc(IORES_DESC_PERSISTENT_MEMORY_LEGACY,
- IORESOURCE_MEM, 0, -1, nvdimm_bus, e820_register_one);
+ IORESOURCE_MEM, 0, -1, &walk_data, e820_handle_one_entry);
+ free_pmem_xarray(&pmem_xarray);
if (rc)
goto err;
return 0;
--
2.49.0.777.g153de2bbd5-goog
Powered by blists - more mailing lists