[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <3d5ccbb3-a083-4a5c-8c97-2db2adbc5446@kernel.org>
Date: Mon, 12 Jan 2026 21:00:54 +0100
From: "David Hildenbrand (Red Hat)" <david@...nel.org>
To: Gregory Price <gourry@...rry.net>, linux-cxl@...r.kernel.org
Cc: linux-kernel@...r.kernel.org, kernel-team@...a.com, dave@...olabs.net,
jonathan.cameron@...wei.com, dave.jiang@...el.com,
alison.schofield@...el.com, vishal.l.verma@...el.com, ira.weiny@...el.com,
dan.j.williams@...el.com
Subject: Re: [PATCH 2/6] cxl: add sysram_region memory controller
On 1/12/26 17:35, Gregory Price wrote:
> Add a sysram memctrl that directly hotplugs memory without needing to
> route through DAX. This simplifies the sysram usecase considerably.
>
> The sysram memctl adds new sysfs controls when registered:
> region/memctrl/[hotplug, hotunplug, state]
>
> hotplug: controller attempts to hotplug the memory region
Why disconnect the hotplug from the online state?
echo online_movable > hotplug ?
Then we can just have something like add_and_online_memory() in the core.
> hotunplug: controller attempts to offline and hotunplug the memory region
> state: [online,online_normal,offline]
> online : controller onlines blocks in ZONE_MOVABLE
I don't like this incosistency regarding the remainder of common hotplug
toggles.
We should use exactly the same values with exactly the same semantics.
Yes, user-space tooling should be thaught to pass in online_movable :)
> online_normal: controller onlines blocks in ZONE_NORMAL
> offline : controller attempts to offline the memory blocks
Why is that required? ideally we'd start with hotplug vs. hotunplug and
leave manual onlining/offlining out of this interface for now.
>
> Hotplug note - by default the controller will hotplug the blocks, but
> leave them offline (unless MHP auto-online in Kconfig is enabled).
>
> Setting state to "online_normal" may prevent future hot-unplug of sysram
> regions, and unbinding a memory region with memory online in ZONE_NORMAL
> may result in the device being removed but the memory remaining online.
>
> This can result in future management functions failing (such as adding a
> new region). This is why "online_normal" is explicit, and the default
> online zone is ZONE_MOVABLE.
>
> Cc: David Hildenbrand <david@...nel.org>
> Signed-off-by: Gregory Price <gourry@...rry.net>
> ---
> drivers/cxl/core/core.h | 2 +
> drivers/cxl/core/memctrl/Makefile | 1 +
> drivers/cxl/core/memctrl/memctrl.c | 2 +
> drivers/cxl/core/memctrl/sysram_region.c | 358 +++++++++++++++++++++++
> drivers/cxl/core/region.c | 5 +
> drivers/cxl/cxl.h | 6 +-
> 6 files changed, 372 insertions(+), 2 deletions(-)
> create mode 100644 drivers/cxl/core/memctrl/sysram_region.c
>
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 1156a4bd0080..18cb84950500 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -31,6 +31,8 @@ int cxl_decoder_detach(struct cxl_region *cxlr,
> struct cxl_endpoint_decoder *cxled, int pos,
> enum cxl_detach_mode mode);
>
> +int devm_cxl_add_sysram_region(struct cxl_region *cxlr);
> +
> #define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
> #define CXL_REGION_TYPE(x) (&cxl_region_type)
> #define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
> diff --git a/drivers/cxl/core/memctrl/Makefile b/drivers/cxl/core/memctrl/Makefile
> index 8165aad5a52a..1c52c7d75570 100644
> --- a/drivers/cxl/core/memctrl/Makefile
> +++ b/drivers/cxl/core/memctrl/Makefile
> @@ -2,3 +2,4 @@
>
> cxl_core-$(CONFIG_CXL_REGION) += memctrl/memctrl.o
> cxl_core-$(CONFIG_CXL_REGION) += memctrl/dax_region.o
> +cxl_core-$(CONFIG_CXL_REGION) += memctrl/sysram_region.o
> diff --git a/drivers/cxl/core/memctrl/memctrl.c b/drivers/cxl/core/memctrl/memctrl.c
> index 24e0e14b39c7..40ffb59353bb 100644
> --- a/drivers/cxl/core/memctrl/memctrl.c
> +++ b/drivers/cxl/core/memctrl/memctrl.c
> @@ -34,6 +34,8 @@ int cxl_enable_memctrl(struct cxl_region *cxlr)
> return devm_cxl_add_dax_region(cxlr);
> case CXL_MEMCTRL_DAX:
> return devm_cxl_add_dax_region(cxlr);
> + case CXL_MEMCTRL_SYSRAM:
> + return devm_cxl_add_sysram_region(cxlr);
> default:
> return -EINVAL;
> }
> diff --git a/drivers/cxl/core/memctrl/sysram_region.c b/drivers/cxl/core/memctrl/sysram_region.c
> new file mode 100644
> index 000000000000..a7570c8a54e1
> --- /dev/null
> +++ b/drivers/cxl/core/memctrl/sysram_region.c
> @@ -0,0 +1,358 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright(c) 2026 Meta Inc. All rights reserved. */
> +#include <linux/memremap.h>
> +#include <linux/memory.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/mm.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/memory_hotplug.h>
> +#include <linux/string_helpers.h>
> +#include <linux/sched/signal.h>
> +#include <cxlmem.h>
> +#include <cxl.h>
> +#include "../core.h"
> +
> +/* If HMAT was unavailable, assign a default distance. */
> +#define MEMTIER_DEFAULT_CXL_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5)
> +
> +static const char *sysram_name = "System RAM (CXL)";
> +
> +struct cxl_sysram_data {
> + const char *res_name;
> + int mgid;
> + struct resource *res;
> +};
> +
> +static DEFINE_MUTEX(cxl_memory_type_lock);
> +static LIST_HEAD(cxl_memory_types);
> +
> +static struct cxl_region *to_cxl_region(struct device *dev)
> +{
> + if (dev->type != &cxl_region_type)
> + return NULL;
> + return container_of(dev, struct cxl_region, dev);
> +}
> +
> +static struct memory_dev_type *cxl_find_alloc_memory_type(int adist)
> +{
> + guard(mutex)(&cxl_memory_type_lock);
> + return mt_find_alloc_memory_type(adist, &cxl_memory_types);
> +}
> +
> +static void __maybe_unused cxl_put_memory_types(void)
> +{
> + guard(mutex)(&cxl_memory_type_lock);
> + mt_put_memory_types(&cxl_memory_types);
> +}
> +
> +static int cxl_sysram_range(struct cxl_region *cxlr, struct range *r)
> +{
> + struct cxl_region_params *p = &cxlr->params;
> +
> + if (!p->res)
> + return -ENODEV;
> +
> + /* memory-block align the hotplug range */
> + r->start = ALIGN(p->res->start, memory_block_size_bytes());
> + r->end = ALIGN_DOWN(p->res->end + 1, memory_block_size_bytes()) - 1;
> + if (r->start >= r->end) {
> + r->start = p->res->start;
> + r->end = p->res->end;
> + return -ENOSPC;
> + }
> + return 0;
> +}
> +
> +static ssize_t hotunplug_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = offline_and_remove_memory(range.start, range_len(&range));
> +
> + if (rc)
> + return rc;
> +
> + return len;
> +}
> +static DEVICE_ATTR_WO(hotunplug);
> +
> +struct online_memory_cb_arg {
> + int online_type;
> + int rc;
> +};
> +
> +static int online_memory_block_cb(struct memory_block *mem, void *arg)
> +{
> + struct online_memory_cb_arg *cb_arg = arg;
> +
> + if (signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + if (mem->state == MEM_ONLINE)
> + return 0;
> +
> + mem->online_type = cb_arg->online_type;
> + cb_arg->rc = device_online(&mem->dev);
> +
> + return cb_arg->rc;
> +}
> +
> +static int offline_memory_block_cb(struct memory_block *mem, void *arg)
> +{
> + int *rc = arg;
> +
> + if (signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + if (mem->state == MEM_OFFLINE)
> + return 0;
> +
> + *rc = device_offline(&mem->dev);
> +
> + return *rc;
> +}
> +
> +static ssize_t state_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct online_memory_cb_arg cb_arg;
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = lock_device_hotplug_sysfs();
> + if (rc)
> + return rc;
> +
> + if (sysfs_streq(buf, "online")) {
> + cb_arg.online_type = MMOP_ONLINE_MOVABLE;
> + cb_arg.rc = 0;
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &cb_arg, online_memory_block_cb);
> + if (!rc)
> + rc = cb_arg.rc;
> + } else if (sysfs_streq(buf, "online_normal")) {
> + cb_arg.online_type = MMOP_ONLINE;
> + cb_arg.rc = 0;
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &cb_arg, online_memory_block_cb);
> + if (!rc)
> + rc = cb_arg.rc;
> + } else if (sysfs_streq(buf, "offline")) {
> + int offline_rc = 0;
> +
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &offline_rc, offline_memory_block_cb);
> + if (!rc)
> + rc = offline_rc;
Let's expose this functionality through some common-code helpers. I
really don't want more code doing this non-obvious device_offline() etc
dance.
walk_memory_blocks() should become a core-mm helper. Maybe we can also
cleanup drivers/acpi/acpi_memhotplug.c in that regard.
Hopefully we can then also reuse these helpers in ppc code (see
dlpar_add_lmb() and dlpar_remove_lmb() that do something similar, but
grab the device hotplug lock themselves as they want to perform some
additional operations).
--
Cheers
David
Powered by blists - more mailing lists