Message-ID: <4e832570-32f8-46a5-80d0-40570db862b2@redhat.com>
Date: Tue, 7 Oct 2025 22:07:43 +0200
From: David Hildenbrand <david@...hat.com>
To: Sumanth Korikkar <sumanthk@...ux.ibm.com>,
Andrew Morton <akpm@...ux-foundation.org>, linux-mm <linux-mm@...ck.org>
Cc: LKML <linux-kernel@...r.kernel.org>,
linux-s390 <linux-s390@...r.kernel.org>,
Gerald Schaefer <gerald.schaefer@...ux.ibm.com>,
Heiko Carstens <hca@...ux.ibm.com>, Vasily Gorbik <gor@...ux.ibm.com>,
Alexander Gordeev <agordeev@...ux.ibm.com>
Subject: Re: [PATCH 2/4] s390/sclp: Add support for dynamic (de)configuration
of memory
[...]
> ---
> drivers/s390/char/sclp_mem.c | 291 +++++++++++++++++++++++++++++------
> 1 file changed, 241 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c
> index 27f49f5fd358..802439230294 100644
> --- a/drivers/s390/char/sclp_mem.c
> +++ b/drivers/s390/char/sclp_mem.c
> @@ -9,9 +9,12 @@
> #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
>
> #include <linux/cpufeature.h>
> +#include <linux/container_of.h>
> #include <linux/err.h>
> #include <linux/errno.h>
> #include <linux/init.h>
> +#include <linux/kobject.h>
> +#include <linux/kstrtox.h>
> #include <linux/memory.h>
> #include <linux/memory_hotplug.h>
> #include <linux/mm.h>
> @@ -27,7 +30,6 @@
> #define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001
> #define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001
>
> -static DEFINE_MUTEX(sclp_mem_mutex);
> static LIST_HEAD(sclp_mem_list);
> static u8 sclp_max_storage_id;
> static DECLARE_BITMAP(sclp_storage_ids, 256);
> @@ -38,6 +40,18 @@ struct memory_increment {
> int standby;
> };
>
> +struct mblock {
> + struct kobject kobj;
> + unsigned int id;
> + unsigned int memmap_on_memory;
> + unsigned int config;
> +};
> +
> +struct memory_block_arg {
> + struct mblock *mblocks;
> + struct kset *kset;
> +};
I would avoid the name "memory_block_arg", as it is too reminiscent of the core mm "struct memory_block".
Similarly, I'd not call this one "mblock".
What about incorporating the "sclp" side of things?
"struct sclp_mem" / "struct sclp_mem_arg"
Nicely fits "sclp_mem.c" ;)
Something like that might be better.
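Roughly, keeping the members from your patch and only changing the naming:

struct sclp_mem {
	struct kobject kobj;
	unsigned int id;
	unsigned int memmap_on_memory;
	unsigned int config;
};

struct sclp_mem_arg {
	struct sclp_mem *sclp_mem;
	struct kset *kset;
};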
> +
> struct assign_storage_sccb {
> struct sccb_header header;
> u16 rn;
> @@ -185,15 +199,11 @@ static int sclp_mem_notifier(struct notifier_block *nb,
> {
> unsigned long start, size;
> struct memory_notify *arg;
> - unsigned char id;
> int rc = 0;
>
> arg = data;
> start = arg->start_pfn << PAGE_SHIFT;
> size = arg->nr_pages << PAGE_SHIFT;
> - mutex_lock(&sclp_mem_mutex);
> - for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1)
> - sclp_attach_storage(id);
> switch (action) {
> case MEM_GOING_OFFLINE:
> /*
> @@ -204,45 +214,201 @@ static int sclp_mem_notifier(struct notifier_block *nb,
> if (contains_standby_increment(start, start + size))
> rc = -EPERM;
> break;
Is there any reason this notifier is still needed? I'd assume we can just allow
for offlining + re-onlining as we please now.
In fact, I'd assume we can get rid of the notifier entirely now?
> - case MEM_PREPARE_ONLINE:
> - /*
> - * Access the altmap_start_pfn and altmap_nr_pages fields
> - * within the struct memory_notify specifically when dealing
> - * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
> - *
> - * When altmap is in use, take the specified memory range
> - * online, which includes the altmap.
> - */
> - if (arg->altmap_nr_pages) {
> - start = PFN_PHYS(arg->altmap_start_pfn);
> - size += PFN_PHYS(arg->altmap_nr_pages);
> - }
> - rc = sclp_mem_change_state(start, size, 1);
> - if (rc || !arg->altmap_nr_pages)
> - break;
> - /*
> - * Set CMMA state to nodat here, since the struct page memory
> - * at the beginning of the memory block will not go through the
> - * buddy allocator later.
> - */
> - __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages);
> + default:
> break;
> - case MEM_FINISH_OFFLINE:
> + }
> + return rc ? NOTIFY_BAD : NOTIFY_OK;
> +}
> +
> +static ssize_t config_mblock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> + struct mblock *mblock = container_of(kobj, struct mblock, kobj);
> +
> + return sysfs_emit(buf, "%u\n", READ_ONCE(mblock->config));
> +}
> +
> +static ssize_t config_mblock_store(struct kobject *kobj, struct kobj_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned long long addr, block_size;
"unsigned long" should be sufficient I'm sure :)
> + struct memory_block *mem;
> + struct mblock *mblock;
> + unsigned char id;
> + bool value;
> + int rc;
> +
> + rc = kstrtobool(buf, &value);
> + if (rc)
> + return rc;
> + mblock = container_of(kobj, struct mblock, kobj);
> + block_size = memory_block_size_bytes();
> + addr = mblock->id * block_size;
> + /*
> + * Hold device_hotplug_lock when adding/removing memory blocks.
> + * Additionally, also protect calls to find_memory_block() and
> + * sclp_attach_storage().
> + */
> + rc = lock_device_hotplug_sysfs();
> + if (rc)
> + goto out;
> + for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1)
> + sclp_attach_storage(id);
> + if (value) {
> + if (mblock->config)
> + goto out_unlock;
> + rc = sclp_mem_change_state(addr, block_size, 1);
> + if (rc)
> + goto out_unlock;
> /*
> - * When altmap is in use, take the specified memory range
> - * offline, which includes the altmap.
> + * Set entire memory block CMMA state to nodat. Later, when
> + * page tables pages are allocated via __add_memory(), those
> + * regions are marked __arch_set_page_dat().
> */
> - if (arg->altmap_nr_pages) {
> - start = PFN_PHYS(arg->altmap_start_pfn);
> - size += PFN_PHYS(arg->altmap_nr_pages);
> + __arch_set_page_nodat((void *)__va(addr), block_size >> PAGE_SHIFT);
> + rc = __add_memory(0, addr, block_size,
> + mblock->memmap_on_memory ?
> + MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
> + if (rc)
> + goto out_unlock;
Do we have to undo the state change if __add_memory() fails here?
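Something like this, maybe (only a sketch, reusing the labels from your patch):

	rc = __add_memory(0, addr, block_size,
			  mblock->memmap_on_memory ?
			  MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
	if (rc) {
		/* Unassign the storage again, so the increments don't stay configured. */
		sclp_mem_change_state(addr, block_size, 0);
		goto out_unlock;
	}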
> + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
> + put_device(&mem->dev);
> + WRITE_ONCE(mblock->config, 1);
> + } else {
> + if (!mblock->config)
> + goto out_unlock;
> + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
> + if (mem->state != MEM_OFFLINE) {
> + put_device(&mem->dev);
> + rc = -EBUSY;
> + goto out_unlock;
> }
> - sclp_mem_change_state(start, size, 0);
> - break;
> - default:
> - break;
> + /* drop the ref just got via find_memory_block() */
> + put_device(&mem->dev);
> + sclp_mem_change_state(addr, block_size, 0);
> + __remove_memory(addr, block_size);
> + WRITE_ONCE(mblock->config, 0);
> }
> - mutex_unlock(&sclp_mem_mutex);
> - return rc ? NOTIFY_BAD : NOTIFY_OK;
> +out_unlock:
> + unlock_device_hotplug();
> +out:
> + return rc ? rc : count;
> +}
> +
> +static ssize_t memmap_on_memory_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> + struct mblock *mblock = container_of(kobj, struct mblock, kobj);
> +
> + return sysfs_emit(buf, "%u\n", READ_ONCE(mblock->memmap_on_memory));
> +}
> +
> +static ssize_t memmap_on_memory_store(struct kobject *kobj, struct kobj_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned long block_size;
> + struct memory_block *mem;
> + struct mblock *mblock;
> + bool value;
> + int rc;
> +
> + rc = kstrtobool(buf, &value);
> + if (rc)
> + return rc;
> + rc = lock_device_hotplug_sysfs();
> + if (rc)
> + return rc;
> + block_size = memory_block_size_bytes();
> + mblock = container_of(kobj, struct mblock, kobj);
> + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(mblock->id * block_size)));
> + if (!mem) {
> + WRITE_ONCE(mblock->memmap_on_memory, value);
> + } else {
> + put_device(&mem->dev);
> + rc = -EBUSY;
> + }
> + unlock_device_hotplug();
> + return rc ? rc : count;
> +}
> +
> +static void mblock_sysfs_release(struct kobject *kobj)
> +{
> + struct mblock *mblock = container_of(kobj, struct mblock, kobj);
> +
> + kfree(mblock);
> +}
> +
> +static const struct kobj_type ktype = {
> + .release = mblock_sysfs_release,
> + .sysfs_ops = &kobj_sysfs_ops,
> +};
> +
> +static struct kobj_attribute memmap_attr =
> + __ATTR(memmap_on_memory, 0644, memmap_on_memory_show, memmap_on_memory_store);
> +static struct kobj_attribute config_attr =
> + __ATTR(config, 0644, config_mblock_show, config_mblock_store);
> +
> +static struct attribute *mblock_attrs[] = {
> + &memmap_attr.attr,
> + &config_attr.attr,
> + NULL,
> +};
> +
> +static struct attribute_group mblock_attr_group = {
> + .attrs = mblock_attrs,
> +};
> +
> +static int create_mblock(struct mblock *mblock, struct kset *kset,
> + unsigned int id, bool config, bool memmap_on_memory)
> +{
> + int rc;
> +
> + mblock->memmap_on_memory = memmap_on_memory;
> + mblock->config = config;
> + mblock->id = id;
> + kobject_init(&mblock->kobj, &ktype);
> + rc = kobject_add(&mblock->kobj, &kset->kobj, "memory%d", id);
> + if (rc)
> + return rc;
> + rc = sysfs_create_group(&mblock->kobj, &mblock_attr_group);
> + if (rc)
> + kobject_put(&mblock->kobj);
> + return rc;
> +}
> +
> +/*
> + * Create /sys/firmware/memory/memoryX for boottime configured online memory
> + * blocks
> + */
> +static int create_online_mblock(struct memory_block *mem, void *argument)
"online" is conusing. It's "initial" / "configured". Same applies to the other functions
that mention "online".
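For example (matching the init_sclp_mem() sketch further down):

static int create_configured_sclp_mem(struct memory_block *mem, void *argument)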
> +{
> + struct memory_block_arg *arg;
> + struct mblock *mblocks;
> + struct kset *kset;
> + unsigned int id;
> +
> + id = mem->dev.id;
> + arg = (struct memory_block_arg *)argument;
> + mblocks = arg->mblocks;
> + kset = arg->kset;
> + return create_mblock(&mblocks[id], kset, id, true, false);
> +}
> +
> +static int __init create_initial_online_mblocks(struct mblock *mblocks, struct kset *kset)
> +{
> + struct memory_block_arg arg;
> +
> + arg.mblocks = mblocks;
> + arg.kset = kset;
> + return for_each_memory_block(&arg, create_online_mblock);
> +}
> +
> +static struct mblock * __init allocate_mblocks(void)
> +{
> + u64 max_mblocks;
Nit: why a u64? The block ids are "unsigned int id;".
> + u64 block_size;
> +
> + block_size = memory_block_size_bytes();
> + max_mblocks = roundup(sclp.rnmax * sclp.rzm, block_size) / block_size;
> + return kcalloc(max_mblocks, sizeof(struct mblock), GFP_KERNEL);
I think you should structure the code a bit differently, not splitting
the function up into tiny helpers.
static int __init init_sclp_mem(void)
{
	const u64 block_size = memory_block_size_bytes();
	const u64 max_mblocks = roundup(sclp.rnmax * sclp.rzm, block_size) / block_size;
	struct sclp_mem_arg arg;
	struct kset *kset;
	int rc;

	/* We'll allocate memory for all blocks ahead of time. */
	sclp_mem = kcalloc(max_mblocks, sizeof(struct mblock), GFP_KERNEL);
	if (!sclp_mem)
		return -ENOMEM;

	kset = kset_create_and_add("memory", NULL, firmware_kobj);
	if (!kset)
		return -ENOMEM;

	/* Initial memory is in the "configured" state already. */
	arg.sclp_mem = sclp_mem;
	arg.kset = kset;
	rc = for_each_memory_block(&arg, create_configured_sclp_mem);
	if (rc)
		return rc;

	/* Standby memory is "deconfigured". */
	return create_standby_sclp_mem(sclp_mem, kset);
}
Should still be quite readable.
> }
>
> static struct notifier_block sclp_mem_nb = {
> @@ -264,14 +430,17 @@ static void __init align_to_block_size(unsigned long *start,
> *size = size_align;
> }
>
> -static void __init add_memory_merged(u16 rn)
> +static int __init create_standby_mblocks_merged(struct mblock *mblocks,
> + struct kset *kset, u16 rn)
> {
> unsigned long start, size, addr, block_size;
> static u16 first_rn, num;
> + unsigned int id;
> + int rc = 0;
>
> if (rn && first_rn && (first_rn + num == rn)) {
> num++;
> - return;
> + return rc;
> }
> if (!first_rn)
> goto skip_add;
> @@ -286,24 +455,31 @@ static void __init add_memory_merged(u16 rn)
> if (!size)
> goto skip_add;
> for (addr = start; addr < start + size; addr += block_size) {
> - add_memory(0, addr, block_size,
> - cpu_has_edat1() ?
> - MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
> + id = addr / block_size;
> + rc = create_mblock(&mblocks[id], kset, id, false, mhp_supports_memmap_on_memory());
> + if (rc)
> + break;
> }
> skip_add:
> first_rn = rn;
> num = 1;
> + return rc;
> }
>
> -static void __init sclp_add_standby_memory(void)
> +static int __init create_standby_mblocks(struct mblock *mblocks, struct kset *kset)
> {
> struct memory_increment *incr;
> + int rc = 0;
>
> list_for_each_entry(incr, &sclp_mem_list, list) {
> if (incr->standby)
> - add_memory_merged(incr->rn);
> + rc = create_standby_mblocks_merged(mblocks, kset, incr->rn);
> + if (rc)
> + goto out;
> }
> - add_memory_merged(0);
> + rc = create_standby_mblocks_merged(mblocks, kset, 0);
> +out:
> + return rc;
> }
>
> static void __init insert_increment(u16 rn, int standby, int assigned)
> @@ -336,10 +512,12 @@ static void __init insert_increment(u16 rn, int standby, int assigned)
> list_add(&new_incr->list, prev);
> }
>
> -static int __init sclp_detect_standby_memory(void)
> +static int __init sclp_setup_memory(void)
> {
> struct read_storage_sccb *sccb;
> int i, id, assigned, rc;
> + struct mblock *mblocks;
> + struct kset *kset;
>
> /* No standby memory in kdump mode */
> if (oldmem_data.start)
Wouldn't we still want to create the ones for initial memory at least?
[...]
--
Cheers
David / dhildenb