Message-ID: <4e832570-32f8-46a5-80d0-40570db862b2@redhat.com>
Date: Tue, 7 Oct 2025 22:07:43 +0200
From: David Hildenbrand <david@...hat.com>
To: Sumanth Korikkar <sumanthk@...ux.ibm.com>,
Andrew Morton <akpm@...ux-foundation.org>, linux-mm <linux-mm@...ck.org>
Cc: LKML <linux-kernel@...r.kernel.org>,
linux-s390 <linux-s390@...r.kernel.org>,
Gerald Schaefer <gerald.schaefer@...ux.ibm.com>,
Heiko Carstens <hca@...ux.ibm.com>, Vasily Gorbik <gor@...ux.ibm.com>,
Alexander Gordeev <agordeev@...ux.ibm.com>
Subject: Re: [PATCH 2/4] s390/sclp: Add support for dynamic (de)configuration
of memory
[...]
> ---
> drivers/s390/char/sclp_mem.c | 291 +++++++++++++++++++++++++++++------
> 1 file changed, 241 insertions(+), 50 deletions(-)
>
> diff --git a/drivers/s390/char/sclp_mem.c b/drivers/s390/char/sclp_mem.c
> index 27f49f5fd358..802439230294 100644
> --- a/drivers/s390/char/sclp_mem.c
> +++ b/drivers/s390/char/sclp_mem.c
> @@ -9,9 +9,12 @@
> #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
>
> #include <linux/cpufeature.h>
> +#include <linux/container_of.h>
> #include <linux/err.h>
> #include <linux/errno.h>
> #include <linux/init.h>
> +#include <linux/kobject.h>
> +#include <linux/kstrtox.h>
> #include <linux/memory.h>
> #include <linux/memory_hotplug.h>
> #include <linux/mm.h>
> @@ -27,7 +30,6 @@
> #define SCLP_CMDW_ASSIGN_STORAGE 0x000d0001
> #define SCLP_CMDW_UNASSIGN_STORAGE 0x000c0001
>
> -static DEFINE_MUTEX(sclp_mem_mutex);
> static LIST_HEAD(sclp_mem_list);
> static u8 sclp_max_storage_id;
> static DECLARE_BITMAP(sclp_storage_ids, 256);
> @@ -38,6 +40,18 @@ struct memory_increment {
> int standby;
> };
>
> +struct mblock {
> + struct kobject kobj;
> + unsigned int id;
> + unsigned int memmap_on_memory;
> + unsigned int config;
> +};
> +
> +struct memory_block_arg {
> + struct mblock *mblocks;
> + struct kset *kset;
> +};
I would avoid the name "memory_block_arg", as it is too reminiscent of the core mm "struct memory_block".
Similarly, I'd not call this one "mblock".
What about incorporating the "sclp" side of things?
"struct sclp_mem" / "struct sclp_mem_arg"
Nicely fits "sclp_mem.c" ;)
Something like that might be better.
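Roughly, keeping the members from your patch and only changing the naming:

struct sclp_mem {
	struct kobject kobj;
	unsigned int id;
	unsigned int memmap_on_memory;
	unsigned int config;
};

struct sclp_mem_arg {
	struct sclp_mem *sclp_mem;
	struct kset *kset;
};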
> +
> struct assign_storage_sccb {
> struct sccb_header header;
> u16 rn;
> @@ -185,15 +199,11 @@ static int sclp_mem_notifier(struct notifier_block *nb,
> {
> unsigned long start, size;
> struct memory_notify *arg;
> - unsigned char id;
> int rc = 0;
>
> arg = data;
> start = arg->start_pfn << PAGE_SHIFT;
> size = arg->nr_pages << PAGE_SHIFT;
> - mutex_lock(&sclp_mem_mutex);
> - for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1)
> - sclp_attach_storage(id);
> switch (action) {
> case MEM_GOING_OFFLINE:
> /*
> @@ -204,45 +214,201 @@ static int sclp_mem_notifier(struct notifier_block *nb,
> if (contains_standby_increment(start, start + size))
> rc = -EPERM;
> break;
Is there any reason this notifier is still needed? I'd assume we can just allow
for offlining + re-onlining as we please now.
In fact, I'd assume we can get rid of the notifier entirely now?
> - case MEM_PREPARE_ONLINE:
> - /*
> - * Access the altmap_start_pfn and altmap_nr_pages fields
> - * within the struct memory_notify specifically when dealing
> - * with only MEM_PREPARE_ONLINE/MEM_FINISH_OFFLINE notifiers.
> - *
> - * When altmap is in use, take the specified memory range
> - * online, which includes the altmap.
> - */
> - if (arg->altmap_nr_pages) {
> - start = PFN_PHYS(arg->altmap_start_pfn);
> - size += PFN_PHYS(arg->altmap_nr_pages);
> - }
> - rc = sclp_mem_change_state(start, size, 1);
> - if (rc || !arg->altmap_nr_pages)
> - break;
> - /*
> - * Set CMMA state to nodat here, since the struct page memory
> - * at the beginning of the memory block will not go through the
> - * buddy allocator later.
> - */
> - __arch_set_page_nodat((void *)__va(start), arg->altmap_nr_pages);
> + default:
> break;
> - case MEM_FINISH_OFFLINE:
> + }
> + return rc ? NOTIFY_BAD : NOTIFY_OK;
> +}
> +
> +static ssize_t config_mblock_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> + struct mblock *mblock = container_of(kobj, struct mblock, kobj);
> +
> + return sysfs_emit(buf, "%u\n", READ_ONCE(mblock->config));
> +}
> +
> +static ssize_t config_mblock_store(struct kobject *kobj, struct kobj_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned long long addr, block_size;
"unsigned long" should be sufficient I'm sure :)
> + struct memory_block *mem;
> + struct mblock *mblock;
> + unsigned char id;
> + bool value;
> + int rc;
> +
> + rc = kstrtobool(buf, &value);
> + if (rc)
> + return rc;
> + mblock = container_of(kobj, struct mblock, kobj);
> + block_size = memory_block_size_bytes();
> + addr = mblock->id * block_size;
> + /*
> + * Hold device_hotplug_lock when adding/removing memory blocks.
> + * Additionally, also protect calls to find_memory_block() and
> + * sclp_attach_storage().
> + */
> + rc = lock_device_hotplug_sysfs();
> + if (rc)
> + goto out;
> + for_each_clear_bit(id, sclp_storage_ids, sclp_max_storage_id + 1)
> + sclp_attach_storage(id);
> + if (value) {
> + if (mblock->config)
> + goto out_unlock;
> + rc = sclp_mem_change_state(addr, block_size, 1);
> + if (rc)
> + goto out_unlock;
> /*
> - * When altmap is in use, take the specified memory range
> - * offline, which includes the altmap.
> + * Set entire memory block CMMA state to nodat. Later, when
> + * page tables pages are allocated via __add_memory(), those
> + * regions are marked __arch_set_page_dat().
> */
> - if (arg->altmap_nr_pages) {
> - start = PFN_PHYS(arg->altmap_start_pfn);
> - size += PFN_PHYS(arg->altmap_nr_pages);
> + __arch_set_page_nodat((void *)__va(addr), block_size >> PAGE_SHIFT);
> + rc = __add_memory(0, addr, block_size,
> + mblock->memmap_on_memory ?
> + MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
> + if (rc)
> + goto out_unlock;
Do we have to undo the state change if __add_memory() fails here?
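Something like this, maybe (only a sketch, reusing the labels from your patch):

	rc = __add_memory(0, addr, block_size,
			  mblock->memmap_on_memory ?
			  MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
	if (rc) {
		/* Unassign the storage again, so the increments don't stay configured. */
		sclp_mem_change_state(addr, block_size, 0);
		goto out_unlock;
	}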
> + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
> + put_device(&mem->dev);
> + WRITE_ONCE(mblock->config, 1);
> + } else {
> + if (!mblock->config)
> + goto out_unlock;
> + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(addr)));
> + if (mem->state != MEM_OFFLINE) {
> + put_device(&mem->dev);
> + rc = -EBUSY;
> + goto out_unlock;
> }
> - sclp_mem_change_state(start, size, 0);
> - break;
> - default:
> - break;
> + /* drop the ref just got via find_memory_block() */
> + put_device(&mem->dev);
> + sclp_mem_change_state(addr, block_size, 0);
> + __remove_memory(addr, block_size);
> + WRITE_ONCE(mblock->config, 0);
> }
> - mutex_unlock(&sclp_mem_mutex);
> - return rc ? NOTIFY_BAD : NOTIFY_OK;
> +out_unlock:
> + unlock_device_hotplug();
> +out:
> + return rc ? rc : count;
> +}
> +
> +static ssize_t memmap_on_memory_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
> +{
> + struct mblock *mblock = container_of(kobj, struct mblock, kobj);
> +
> + return sysfs_emit(buf, "%u\n", READ_ONCE(mblock->memmap_on_memory));
> +}
> +
> +static ssize_t memmap_on_memory_store(struct kobject *kobj, struct kobj_attribute *attr,
> + const char *buf, size_t count)
> +{
> + unsigned long block_size;
> + struct memory_block *mem;
> + struct mblock *mblock;
> + bool value;
> + int rc;
> +
> + rc = kstrtobool(buf, &value);
> + if (rc)
> + return rc;
> + rc = lock_device_hotplug_sysfs();
> + if (rc)
> + return rc;
> + block_size = memory_block_size_bytes();
> + mblock = container_of(kobj, struct mblock, kobj);
> + mem = find_memory_block(pfn_to_section_nr(PFN_DOWN(mblock->id * block_size)));
> + if (!mem) {
> + WRITE_ONCE(mblock->memmap_on_memory, value);
> + } else {
> + put_device(&mem->dev);
> + rc = -EBUSY;
> + }
> + unlock_device_hotplug();
> + return rc ? rc : count;
> +}
> +
> +static void mblock_sysfs_release(struct kobject *kobj)
> +{
> + struct mblock *mblock = container_of(kobj, struct mblock, kobj);
> +
> + kfree(mblock);
> +}
> +
> +static const struct kobj_type ktype = {
> + .release = mblock_sysfs_release,
> + .sysfs_ops = &kobj_sysfs_ops,
> +};
> +
> +static struct kobj_attribute memmap_attr =
> + __ATTR(memmap_on_memory, 0644, memmap_on_memory_show, memmap_on_memory_store);
> +static struct kobj_attribute config_attr =
> + __ATTR(config, 0644, config_mblock_show, config_mblock_store);
> +
> +static struct attribute *mblock_attrs[] = {
> + &memmap_attr.attr,
> + &config_attr.attr,
> + NULL,
> +};
> +
> +static struct attribute_group mblock_attr_group = {
> + .attrs = mblock_attrs,
> +};
> +
> +static int create_mblock(struct mblock *mblock, struct kset *kset,
> + unsigned int id, bool config, bool memmap_on_memory)
> +{
> + int rc;
> +
> + mblock->memmap_on_memory = memmap_on_memory;
> + mblock->config = config;
> + mblock->id = id;
> + kobject_init(&mblock->kobj, &ktype);
> + rc = kobject_add(&mblock->kobj, &kset->kobj, "memory%d", id);
> + if (rc)
> + return rc;
> + rc = sysfs_create_group(&mblock->kobj, &mblock_attr_group);
> + if (rc)
> + kobject_put(&mblock->kobj);
> + return rc;
> +}
> +
> +/*
> + * Create /sys/firmware/memory/memoryX for boottime configured online memory
> + * blocks
> + */
> +static int create_online_mblock(struct memory_block *mem, void *argument)
"online" is conusing. It's "initial" / "configured". Same applies to the other functions
that mention "online".
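For example (matching the init_sclp_mem() sketch further down):

static int create_configured_sclp_mem(struct memory_block *mem, void *argument)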
> +{
> + struct memory_block_arg *arg;
> + struct mblock *mblocks;
> + struct kset *kset;
> + unsigned int id;
> +
> + id = mem->dev.id;
> + arg = (struct memory_block_arg *)argument;
> + mblocks = arg->mblocks;
> + kset = arg->kset;
> + return create_mblock(&mblocks[id], kset, id, true, false);
> +}
> +
> +static int __init create_initial_online_mblocks(struct mblock *mblocks, struct kset *kset)
> +{
> + struct memory_block_arg arg;
> +
> + arg.mblocks = mblocks;
> + arg.kset = kset;
> + return for_each_memory_block(&arg, create_online_mblock);
> +}
> +
> +static struct mblock * __init allocate_mblocks(void)
> +{
> + u64 max_mblocks;
Nit: why a u64? The block ids are "unsigned int id;".
> + u64 block_size;
> +
> + block_size = memory_block_size_bytes();
> + max_mblocks = roundup(sclp.rnmax * sclp.rzm, block_size) / block_size;
> + return kcalloc(max_mblocks, sizeof(struct mblock), GFP_KERNEL);
I think you should structure the code a bit differently, not splitting
the function up into tiny helpers.
static int __init init_sclp_mem(void)
{
	const u64 block_size = memory_block_size_bytes();
	const u64 max_mblocks = roundup(sclp.rnmax * sclp.rzm, block_size) / block_size;
	struct sclp_mem_arg arg;
	struct kset *kset;
	int rc;

	/* We'll allocate memory for all blocks ahead of time. */
	sclp_mem = kcalloc(max_mblocks, sizeof(struct mblock), GFP_KERNEL);
	if (!sclp_mem)
		return -ENOMEM;

	kset = kset_create_and_add("memory", NULL, firmware_kobj);
	if (!kset)
		return -ENOMEM;

	/* Initial memory is in the "configured" state already. */
	arg.sclp_mem = sclp_mem;
	arg.kset = kset;
	rc = for_each_memory_block(&arg, create_configured_sclp_mem);
	if (rc)
		return rc;

	/* Standby memory is "deconfigured". */
	return create_standby_sclp_mem(sclp_mem, kset);
}
Should still be quite readable.
> }
>
> static struct notifier_block sclp_mem_nb = {
> @@ -264,14 +430,17 @@ static void __init align_to_block_size(unsigned long *start,
> *size = size_align;
> }
>
> -static void __init add_memory_merged(u16 rn)
> +static int __init create_standby_mblocks_merged(struct mblock *mblocks,
> + struct kset *kset, u16 rn)
> {
> unsigned long start, size, addr, block_size;
> static u16 first_rn, num;
> + unsigned int id;
> + int rc = 0;
>
> if (rn && first_rn && (first_rn + num == rn)) {
> num++;
> - return;
> + return rc;
> }
> if (!first_rn)
> goto skip_add;
> @@ -286,24 +455,31 @@ static void __init add_memory_merged(u16 rn)
> if (!size)
> goto skip_add;
> for (addr = start; addr < start + size; addr += block_size) {
> - add_memory(0, addr, block_size,
> - cpu_has_edat1() ?
> - MHP_MEMMAP_ON_MEMORY | MHP_OFFLINE_INACCESSIBLE : MHP_NONE);
> + id = addr / block_size;
> + rc = create_mblock(&mblocks[id], kset, id, false, mhp_supports_memmap_on_memory());
> + if (rc)
> + break;
> }
> skip_add:
> first_rn = rn;
> num = 1;
> + return rc;
> }
>
> -static void __init sclp_add_standby_memory(void)
> +static int __init create_standby_mblocks(struct mblock *mblocks, struct kset *kset)
> {
> struct memory_increment *incr;
> + int rc = 0;
>
> list_for_each_entry(incr, &sclp_mem_list, list) {
> if (incr->standby)
> - add_memory_merged(incr->rn);
> + rc = create_standby_mblocks_merged(mblocks, kset, incr->rn);
> + if (rc)
> + goto out;
> }
> - add_memory_merged(0);
> + rc = create_standby_mblocks_merged(mblocks, kset, 0);
> +out:
> + return rc;
> }
>
> static void __init insert_increment(u16 rn, int standby, int assigned)
> @@ -336,10 +512,12 @@ static void __init insert_increment(u16 rn, int standby, int assigned)
> list_add(&new_incr->list, prev);
> }
>
> -static int __init sclp_detect_standby_memory(void)
> +static int __init sclp_setup_memory(void)
> {
> struct read_storage_sccb *sccb;
> int i, id, assigned, rc;
> + struct mblock *mblocks;
> + struct kset *kset;
>
> /* No standby memory in kdump mode */
> if (oldmem_data.start)
Wouldn't we still want to create the ones for initial memory at least?
[...]
--
Cheers
David / dhildenb