[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <0233bdab-9b59-4394-9ce4-c3a5df2be06d@amd.com>
Date: Mon, 12 Jan 2026 15:10:41 -0600
From: "Cheatham, Benjamin" <benjamin.cheatham@....com>
To: Gregory Price <gourry@...rry.net>, <linux-cxl@...r.kernel.org>
CC: <linux-kernel@...r.kernel.org>, <kernel-team@...a.com>,
<dave@...olabs.net>, <jonathan.cameron@...wei.com>, <dave.jiang@...el.com>,
<alison.schofield@...el.com>, <vishal.l.verma@...el.com>,
<ira.weiny@...el.com>, <dan.j.williams@...el.com>, David Hildenbrand
<david@...nel.org>
Subject: Re: [PATCH 2/6] cxl: add sysram_region memory controller
On 1/12/2026 10:35 AM, Gregory Price wrote:
> Add a sysram memctrl that directly hotplugs memory without needing to
> route through DAX. This simplifies the sysram usecase considerably.
>
> The sysram memctl adds new sysfs controls when registered:
> region/memctrl/[hotplug, hotunplug, state]
>
> hotplug: controller attempts to hotplug the memory region
> hotunplug: controller attempts to offline and hotunplug the memory region
Nit: Would it be better to use hotadd/hotremove here instead of hotplug/hotunplug? The terms
are basically synonymous, but I think hotadd and hotremove are more descriptive.
> state: [online,online_normal,offline]
> online : controller onlines blocks in ZONE_MOVABLE
> online_normal: controller onlines blocks in ZONE_NORMAL
The naming for online states could be improved imo. I understand and agree with the motivation
behind the names, but I could see the use of the word "normal" being confusing to less savvy users.
You could change it to include the zone for both (online_movable/online_normal), but I think it may
be easier to mark which one has drawbacks, i.e. change "online_normal" to something like "online_nonremovable".
That way, anyone who doesn't want to go find the documentation for these can understand the user-visible
impact.
In any case, all of these attributes need ABI documentation as well.
> offline : controller attempts to offline the memory blocks
>
> Hotplug note - by default the controller will hotplug the blocks, but
> leave them offline (unless MHP auto-online in Kconfig is enabled).
>
> Setting state to "online_normal" may prevent future hot-unplug of sysram
> regions, and unbinding a memory region with memory online in ZONE_NORMAL
> may result in the device being removed but the memory remaining online.
>
> This can result in future management functions failing (such as adding a
> new region). This is why "online_normal" is explicit, and the default
> online zone is ZONE_MOVABLE.
>
> Cc: David Hildenbrand <david@...nel.org>
> Signed-off-by: Gregory Price <gourry@...rry.net>
> ---
> drivers/cxl/core/core.h | 2 +
> drivers/cxl/core/memctrl/Makefile | 1 +
> drivers/cxl/core/memctrl/memctrl.c | 2 +
> drivers/cxl/core/memctrl/sysram_region.c | 358 +++++++++++++++++++++++
> drivers/cxl/core/region.c | 5 +
> drivers/cxl/cxl.h | 6 +-
> 6 files changed, 372 insertions(+), 2 deletions(-)
> create mode 100644 drivers/cxl/core/memctrl/sysram_region.c
>
> diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h
> index 1156a4bd0080..18cb84950500 100644
> --- a/drivers/cxl/core/core.h
> +++ b/drivers/cxl/core/core.h
> @@ -31,6 +31,8 @@ int cxl_decoder_detach(struct cxl_region *cxlr,
> struct cxl_endpoint_decoder *cxled, int pos,
> enum cxl_detach_mode mode);
>
> +int devm_cxl_add_sysram_region(struct cxl_region *cxlr);
> +
> #define CXL_REGION_ATTR(x) (&dev_attr_##x.attr)
> #define CXL_REGION_TYPE(x) (&cxl_region_type)
> #define SET_CXL_REGION_ATTR(x) (&dev_attr_##x.attr),
> diff --git a/drivers/cxl/core/memctrl/Makefile b/drivers/cxl/core/memctrl/Makefile
> index 8165aad5a52a..1c52c7d75570 100644
> --- a/drivers/cxl/core/memctrl/Makefile
> +++ b/drivers/cxl/core/memctrl/Makefile
> @@ -2,3 +2,4 @@
>
> cxl_core-$(CONFIG_CXL_REGION) += memctrl/memctrl.o
> cxl_core-$(CONFIG_CXL_REGION) += memctrl/dax_region.o
> +cxl_core-$(CONFIG_CXL_REGION) += memctrl/sysram_region.o
> diff --git a/drivers/cxl/core/memctrl/memctrl.c b/drivers/cxl/core/memctrl/memctrl.c
> index 24e0e14b39c7..40ffb59353bb 100644
> --- a/drivers/cxl/core/memctrl/memctrl.c
> +++ b/drivers/cxl/core/memctrl/memctrl.c
> @@ -34,6 +34,8 @@ int cxl_enable_memctrl(struct cxl_region *cxlr)
> return devm_cxl_add_dax_region(cxlr);
> case CXL_MEMCTRL_DAX:
> return devm_cxl_add_dax_region(cxlr);
> + case CXL_MEMCTRL_SYSRAM:
> + return devm_cxl_add_sysram_region(cxlr);
> default:
> return -EINVAL;
> }
> diff --git a/drivers/cxl/core/memctrl/sysram_region.c b/drivers/cxl/core/memctrl/sysram_region.c
> new file mode 100644
> index 000000000000..a7570c8a54e1
> --- /dev/null
> +++ b/drivers/cxl/core/memctrl/sysram_region.c
> @@ -0,0 +1,358 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/* Copyright(c) 2026 Meta Inc. All rights reserved. */
> +#include <linux/memremap.h>
> +#include <linux/memory.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/mm.h>
> +#include <linux/memory-tiers.h>
> +#include <linux/memory_hotplug.h>
> +#include <linux/string_helpers.h>
> +#include <linux/sched/signal.h>
> +#include <cxlmem.h>
> +#include <cxl.h>
> +#include "../core.h"
> +
> +/* If HMAT was unavailable, assign a default distance. */
> +#define MEMTIER_DEFAULT_CXL_ADISTANCE (MEMTIER_ADISTANCE_DRAM * 5)
> +
> +static const char *sysram_name = "System RAM (CXL)";
> +
> +struct cxl_sysram_data {
> + const char *res_name;
> + int mgid;
> + struct resource *res;
> +};
> +
> +static DEFINE_MUTEX(cxl_memory_type_lock);
> +static LIST_HEAD(cxl_memory_types);
> +
> +static struct cxl_region *to_cxl_region(struct device *dev)
> +{
> + if (dev->type != &cxl_region_type)
> + return NULL;
> + return container_of(dev, struct cxl_region, dev);
> +}
What's the reasoning behind redefining this in this file? It's still defined in cxl/core/region.c,
so I would probably just drop the static there and include it through core.h.
> +
> +static struct memory_dev_type *cxl_find_alloc_memory_type(int adist)
> +{
> + guard(mutex)(&cxl_memory_type_lock);
> + return mt_find_alloc_memory_type(adist, &cxl_memory_types);
> +}
> +
> +static void __maybe_unused cxl_put_memory_types(void)
> +{
> + guard(mutex)(&cxl_memory_type_lock);
> + mt_put_memory_types(&cxl_memory_types);
> +}
> +
> +static int cxl_sysram_range(struct cxl_region *cxlr, struct range *r)
> +{
> + struct cxl_region_params *p = &cxlr->params;
> +
> + if (!p->res)
> + return -ENODEV;
> +
> + /* memory-block align the hotplug range */
> + r->start = ALIGN(p->res->start, memory_block_size_bytes());
> + r->end = ALIGN_DOWN(p->res->end + 1, memory_block_size_bytes()) - 1;
> + if (r->start >= r->end) {
> + r->start = p->res->start;
> + r->end = p->res->end;
> + return -ENOSPC;
> + }
> + return 0;
> +}
> +
> +static ssize_t hotunplug_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = offline_and_remove_memory(range.start, range_len(&range));
> +
> + if (rc)
Extra blank line above.
> + return rc;
> +
> + return len;
> +}
> +static DEVICE_ATTR_WO(hotunplug);
> +
> +struct online_memory_cb_arg {
> + int online_type;
> + int rc;
> +};
> +
> +static int online_memory_block_cb(struct memory_block *mem, void *arg)
> +{
> + struct online_memory_cb_arg *cb_arg = arg;
> +
> + if (signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + if (mem->state == MEM_ONLINE)
> + return 0;
> +
> + mem->online_type = cb_arg->online_type;
> + cb_arg->rc = device_online(&mem->dev);
> +
> + return cb_arg->rc;
> +}
> +
> +static int offline_memory_block_cb(struct memory_block *mem, void *arg)
> +{
> + int *rc = arg;
> +
> + if (signal_pending(current))
> + return -EINTR;
> +
> + cond_resched();
> +
> + if (mem->state == MEM_OFFLINE)
> + return 0;
> +
> + *rc = device_offline(&mem->dev);
> +
> + return *rc;
> +}
> +
> +static ssize_t state_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct online_memory_cb_arg cb_arg;
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = lock_device_hotplug_sysfs();
> + if (rc)
> + return rc;
> +
> + if (sysfs_streq(buf, "online")) {
> + cb_arg.online_type = MMOP_ONLINE_MOVABLE;
> + cb_arg.rc = 0;
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &cb_arg, online_memory_block_cb);
> + if (!rc)
> + rc = cb_arg.rc;
> + } else if (sysfs_streq(buf, "online_normal")) {
> + cb_arg.online_type = MMOP_ONLINE;
> + cb_arg.rc = 0;
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &cb_arg, online_memory_block_cb);
> + if (!rc)
> + rc = cb_arg.rc;
> + } else if (sysfs_streq(buf, "offline")) {
> + int offline_rc = 0;
> +
> + rc = walk_memory_blocks(range.start, range_len(&range),
> + &offline_rc, offline_memory_block_cb);
> + if (!rc)
> + rc = offline_rc;
> + } else {
> + rc = -EINVAL;
> + }
Nit: You can just set rc = -EINVAL before the if statement instead of doing this else clause.
> +
> + unlock_device_hotplug();
> +
> + if (rc)
> + return rc;
> +
> + return len;
> +}
> +static DEVICE_ATTR_WO(state);
> +
> +static ssize_t hotplug_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t len)
> +{
> + struct cxl_region *cxlr = to_cxl_region(dev);
> + struct cxl_sysram_data *data;
> + struct range range;
> + int rc;
> +
> + if (!cxlr)
> + return -ENODEV;
> +
> + data = dev_get_drvdata(dev);
> + if (!data)
> + return -ENODEV;
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc)
> + return rc;
> +
> + rc = add_memory_driver_managed(data->mgid, range.start,
> + range_len(&range), sysram_name,
> + MHP_NID_IS_MGID);
> + if (rc)
> + return rc;
> +
> + return len;
> +}
> +static DEVICE_ATTR_WO(hotplug);
> +
> +static struct attribute *cxl_sysram_region_attrs[] = {
> + &dev_attr_hotunplug.attr,
> + &dev_attr_state.attr,
> + &dev_attr_hotplug.attr,
> + NULL,
> +};
> +
> +static const struct attribute_group cxl_sysram_region_group = {
> + .name = "memctl",
> + .attrs = cxl_sysram_region_attrs,
> +};
> +
> +static void cxl_sysram_unregister(void *_data)
> +{
> + struct cxl_sysram_data *data = _data;
> + struct range range = {
> + .start = data->res->start,
> + .end = data->res->end
> + };
> +
> + /* We have one shot for removal, otherwise it's stuck til reboot */
> + if (!offline_and_remove_memory(range.start, range_len(&range))) {
> + remove_resource(data->res);
> + kfree(data->res);
> + memory_group_unregister(data->mgid);
> + kfree(data->res_name);
> + kfree(data);
> + return;
> + }
> + pr_err("CXL: %#llx-%#llx cannot be hotremoved until next reboot\n",
> + range.start, range.end);
> +}
> +
> +int devm_cxl_add_sysram_region(struct cxl_region *cxlr)
> +{
> + struct cxl_region_params *p = &cxlr->params;
> + struct device *dev = &cxlr->dev;
> + struct cxl_sysram_data *data;
> + struct memory_dev_type *mtype;
> + unsigned long total_len = 0;
> + struct resource *res;
> + struct range range;
> + mhp_t mhp_flags;
> + int numa_node;
> + int adist = MEMTIER_DEFAULT_CXL_ADISTANCE;
> + int rc;
> +
> + numa_node = phys_to_target_node(p->res->start);
> + if (numa_node < 0) {
> + dev_warn(dev, "rejecting CXL region with invalid node: %d\n",
> + numa_node);
> + return -EINVAL;
> + }
> +
> + rc = cxl_sysram_range(cxlr, &range);
> + if (rc) {
> + dev_info(dev, "range %#llx-%#llx too small after alignment\n",
> + range.start, range.end);
This should probably be a warning instead. You use dev_warn() for the next check, which is essentially
the same case, so you may as well do it here too.
> + return rc;
> + }
> + total_len = range_len(&range);
> +
> + if (!total_len) {
> + dev_warn(dev, "rejecting CXL region without any memory after alignment\n");
> + return -EINVAL;
> + }
I don't think this check is needed. cxl_sysram_range() already errors out when range->start >= range->end
(i.e. nothing usable is left after alignment), so the check above should bail out before this one is reached.
> +
> + mt_calc_adistance(numa_node, &adist);
> + mtype = cxl_find_alloc_memory_type(adist);
> + if (IS_ERR(mtype))
> + return PTR_ERR(mtype);
> +
> + init_node_memory_type(numa_node, mtype);
> +
> + data = kzalloc(sizeof(*data), GFP_KERNEL);
> + if (!data) {
> + rc = -ENOMEM;
> + goto err_data;
> + }
> +
> + data->res_name = kstrdup(dev_name(dev), GFP_KERNEL);
> + if (!data->res_name) {
> + rc = -ENOMEM;
> + goto err_res_name;
> + }
> +
> + rc = memory_group_register_static(numa_node, PFN_UP(total_len));
> + if (rc < 0)
> + goto err_reg_mgid;
> + data->mgid = rc;
> +
> + /* Region is permanently reserved if hotremove fails when unbinding. */
> + res = request_mem_region(range.start, range_len(&range),
> + data->res_name);
> + if (!res) {
> + dev_warn(dev, "range %#llx-%#llx could not reserve region\n",
> + range.start, range.end);
> + rc = -EBUSY;
> + goto err_request_mem;
> + }
> + data->res = res;
> +
> + /*
> + * Setup flags for System RAM. Leave _BUSY clear so add_memory() can add
> + * a child resource. Do not inherit flags from parent since it may set
> + * flags unknown to us that will the break add_memory() below.
> + */
> + res->flags = IORESOURCE_SYSTEM_RAM;
> + mhp_flags = MHP_NID_IS_MGID;
> + rc = add_memory_driver_managed(data->mgid, range.start,
> + range_len(&range), sysram_name, mhp_flags);
Looks like mhp_flags is only used once; I'd get rid of it and just pass MHP_NID_IS_MGID directly instead.
> + if (rc) {
> + dev_warn(dev, "range %#llx-%#llx memory add failed\n",
> + range.start, range.end);
> + goto err_add_memory;
> + }
> + dev_dbg(dev, "%s: added %llu bytes as System RAM\n", dev_name(dev),
> + (unsigned long long)total_len);
> +
> + dev_set_drvdata(dev, data);
> + rc = devm_device_add_group(dev, &cxl_sysram_region_group);
> + if (rc)
> + goto err_add_group;
> +
> + return devm_add_action_or_reset(dev, cxl_sysram_unregister, data);
> +
> +err_add_group:
> + dev_set_drvdata(dev, NULL);
> + /* if this fails, memory cannot be removed from the system until reboot */
> + remove_memory(range.start, range_len(&range));
> +err_add_memory:
> + remove_resource(res);
> + kfree(res);
> +err_request_mem:
> + memory_group_unregister(data->mgid);
> +err_reg_mgid:
> + kfree(data->res_name);
> +err_res_name:
> + kfree(data);
> +err_data:
> + clear_node_memory_type(numa_node, mtype);
> + return rc;
> +}
> diff --git a/drivers/cxl/core/region.c b/drivers/cxl/core/region.c
> index 02d7d9ae0252..eeab091f043a 100644
> --- a/drivers/cxl/core/region.c
> +++ b/drivers/cxl/core/region.c
> @@ -639,6 +639,9 @@ static ssize_t ctrl_show(struct device *dev, struct device_attribute *attr,
> case CXL_MEMCTRL_DAX:
> desc = "dax";
> break;
> + case CXL_MEMCTRL_SYSRAM:
> + desc = "sysram";
> + break;
> default:
> desc = "";
> break;
> @@ -663,6 +666,8 @@ static ssize_t ctrl_store(struct device *dev, struct device_attribute *attr,
>
> if (sysfs_streq(buf, "dax"))
> cxlr->memctrl = CXL_MEMCTRL_DAX;
> + else if (sysfs_streq(buf, "sysram"))
> + cxlr->memctrl = CXL_MEMCTRL_SYSRAM;
> else
> return -EINVAL;
>
> diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h
> index b8fabaa77262..bb4f877b4e8f 100644
> --- a/drivers/cxl/cxl.h
> +++ b/drivers/cxl/cxl.h
> @@ -506,13 +506,15 @@ enum cxl_partition_mode {
> /*
> * Memory Controller modes:
> * None - No controller selected
> - * Auto - either BIOS-configured as SysRAM, or default to DAX
> - * DAX - creates a dax_region controller for the cxl_region
> + * Auto - either BIOS-configured as SysRAM, or default to DAX
> + * DAX - creates a dax_region controller for the cxl_region
> + * SYSRAM - hotplugs the region directly as System RAM
> */
> enum cxl_memctrl_mode {
> CXL_MEMCTRL_NONE,
> CXL_MEMCTRL_AUTO,
> CXL_MEMCTRL_DAX,
> + CXL_MEMCTRL_SYSRAM,
> };
>
> /*
Powered by blists - more mailing lists