[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20250820010415.699353-18-anthony.yznaga@oracle.com>
Date: Tue, 19 Aug 2025 18:04:10 -0700
From: Anthony Yznaga <anthony.yznaga@...cle.com>
To: linux-mm@...ck.org
Cc: akpm@...ux-foundation.org, andreyknvl@...il.com, arnd@...db.de,
bp@...en8.de, brauner@...nel.org, bsegall@...gle.com, corbet@....net,
dave.hansen@...ux.intel.com, david@...hat.com,
dietmar.eggemann@....com, ebiederm@...ssion.com, hpa@...or.com,
jakub.wartak@...lbox.org, jannh@...gle.com, juri.lelli@...hat.com,
khalid@...nel.org, liam.howlett@...cle.com, linyongting@...edance.com,
lorenzo.stoakes@...cle.com, luto@...nel.org, markhemm@...glemail.com,
maz@...nel.org, mhiramat@...nel.org, mgorman@...e.de, mhocko@...e.com,
mingo@...hat.com, muchun.song@...ux.dev, neilb@...e.de,
osalvador@...e.de, pcc@...gle.com, peterz@...radead.org,
pfalcato@...e.de, rostedt@...dmis.org, rppt@...nel.org,
shakeel.butt@...ux.dev, surenb@...gle.com, tglx@...utronix.de,
vasily.averin@...ux.dev, vbabka@...e.cz, vincent.guittot@...aro.org,
viro@...iv.linux.org.uk, vschneid@...hat.com, willy@...radead.org,
x86@...nel.org, xhao@...ux.alibaba.com, linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org, linux-arch@...r.kernel.org
Subject: [PATCH v3 17/22] sched/mshare: mshare ownership
Ownership of an mshare region is assigned to the process that creates
it. Establishing ownership ensures that accounting the memory in an
mshare region is applied to the owner and not spread among the processes
sharing the memory. It also provides a means for freeing mshare memory
in an OOM situation. Once an mshare owner exits, access to the memory by
a non-owner process results in a SIGSEGV. For this initial implementation,
ownership is not shared or transferred through forking or other means.
Signed-off-by: Anthony Yznaga <anthony.yznaga@...cle.com>
---
include/linux/mshare.h | 25 +++++++++++++
include/linux/sched.h | 5 +++
kernel/exit.c | 1 +
kernel/fork.c | 1 +
mm/mshare.c | 83 ++++++++++++++++++++++++++++++++++++++++++
5 files changed, 115 insertions(+)
create mode 100644 include/linux/mshare.h
diff --git a/include/linux/mshare.h b/include/linux/mshare.h
new file mode 100644
index 000000000000..b62f0e54cf84
--- /dev/null
+++ b/include/linux/mshare.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MSHARE_H_
+#define _LINUX_MSHARE_H_
+
+#include <linux/types.h>
+
+struct task_struct;
+
+#ifdef CONFIG_MSHARE
+
+void exit_mshare(struct task_struct *task);
+#define mshare_init_task(task) INIT_LIST_HEAD(&(task)->mshare_mem)	/* start with no owned mshare regions */
+
+#else	/* !CONFIG_MSHARE */
+
+static inline void exit_mshare(struct task_struct *task)
+{
+}
+static inline void mshare_init_task(struct task_struct *task)
+{
+}
+
+#endif	/* CONFIG_MSHARE */
+
+#endif /* _LINUX_MSHARE_H_ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2b272382673d..17f2f3c0b465 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -48,6 +48,7 @@
#include <linux/uidgid_types.h>
#include <linux/tracepoint-defs.h>
#include <linux/unwind_deferred_types.h>
+#include <linux/mshare.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
@@ -1654,6 +1655,10 @@ struct task_struct {
/* CPU-specific state of this task: */
struct thread_struct thread;
+#ifdef CONFIG_MSHARE
+ struct list_head mshare_mem;
+#endif
+
/*
* New fields for task_struct should be added above here, so that
* they are included in the randomized portion of task_struct.
diff --git a/kernel/exit.c b/kernel/exit.c
index 343eb97543d5..24445109865d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -951,6 +951,7 @@ void __noreturn do_exit(long code)
if (group_dead)
acct_process();
+ exit_mshare(tsk);
exit_sem(tsk);
exit_shm(tsk);
exit_files(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5115be549234..eba6bd709c6e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2143,6 +2143,7 @@ __latent_entropy struct task_struct *copy_process(
#endif
unwind_task_init(p);
+ mshare_init_task(p);
/* Perform scheduler related setup. Assign this task to a CPU. */
retval = sched_fork(clone_flags, p);
diff --git a/mm/mshare.c b/mm/mshare.c
index f7b7904f0405..8a23b391fa11 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -17,6 +17,7 @@
#include <linux/fs_context.h>
#include <linux/mman.h>
#include <linux/mmu_notifier.h>
+#include <linux/mshare.h>
#include <uapi/linux/magic.h>
#include <linux/falloc.h>
#include <asm/tlbflush.h>
@@ -27,6 +28,7 @@ const unsigned long mshare_align = P4D_SIZE;
const unsigned long mshare_base = mshare_align;
#define MSHARE_INITIALIZED 0x1
+#define MSHARE_HAS_OWNER 0x2
struct mshare_data {
struct mm_struct *mm;
@@ -35,6 +37,7 @@ struct mshare_data {
unsigned long size;
unsigned long flags;
struct mmu_notifier mn;
+ struct list_head list;
};
static inline bool mshare_is_initialized(struct mshare_data *m_data)
@@ -42,6 +45,65 @@ static inline bool mshare_is_initialized(struct mshare_data *m_data)
return test_bit(MSHARE_INITIALIZED, &m_data->flags);
}
+static inline bool mshare_has_owner(struct mshare_data *m_data)
+{
+	/* Set when the region is created; cleared in exit_mshare() when the owner exits. */
+	return test_bit(MSHARE_HAS_OWNER, &m_data->flags);
+}
+
+static bool mshare_data_getref(struct mshare_data *m_data);
+static void mshare_data_putref(struct mshare_data *m_data);
+
+void exit_mshare(struct task_struct *task)
+{
+	for (;;) {
+		struct mshare_data *m_data;
+		int error;
+
+		task_lock(task);
+
+		if (list_empty(&task->mshare_mem)) {
+			task_unlock(task);
+			break;
+		}
+
+		m_data = list_first_entry(&task->mshare_mem, struct mshare_data,
+					  list);
+
+		/* The msharefs inode holds a ref while the region is on this list. */
+		WARN_ON_ONCE(!mshare_data_getref(m_data));
+
+		list_del_init(&m_data->list);
+		task_unlock(task);
+
+		/*
+		 * The owner of an mshare region is going away. Unmap
+		 * everything in the region and prevent more mappings from
+		 * being created.
+		 *
+		 * XXX
+		 * The fact that the unmap can possibly fail is problematic.
+		 * One alternative is doing a subset of what exit_mmap() does.
+		 * If it's preferable to preserve the mappings then another
+		 * approach is to fail any further faults on the mshare region
+		 * and unlink the shared page tables from the page tables of
+		 * each sharing process by walking the rmap via the msharefs
+		 * inode.
+		 * Unmapping everything means mshare memory is freed up when
+		 * the owner exits which may be preferable for OOM situations.
+		 */
+
+		clear_bit(MSHARE_HAS_OWNER, &m_data->flags);
+
+		mmap_write_lock(m_data->mm);
+		error = do_munmap(m_data->mm, m_data->start, m_data->size, NULL);
+		mmap_write_unlock(m_data->mm);
+
+		if (error)
+			pr_warn("%s: do_munmap returned %d\n", __func__, error);
+
+		mshare_data_putref(m_data);
+	}
+}
+
static void mshare_invalidate_tlbs(struct mmu_notifier *mn, struct mm_struct *mm,
unsigned long start, unsigned long end)
{
@@ -362,6 +424,11 @@ msharefs_fill_mm(struct inode *inode)
ret = mmu_notifier_register(&m_data->mn, mm);
if (ret)
goto err_free;
+ INIT_LIST_HEAD(&m_data->list);
+ task_lock(current);
+ list_add(&m_data->list, ¤t->mshare_mem);
+ task_unlock(current);
+ set_bit(MSHARE_HAS_OWNER, &m_data->flags);
refcount_set(&m_data->ref, 1);
inode->i_private = m_data;
@@ -380,6 +447,11 @@ msharefs_delmm(struct mshare_data *m_data)
kfree(m_data);
}
+static bool mshare_data_getref(struct mshare_data *m_data)
+{
+	/* Returns false if the refcount already hit zero (teardown in progress). */
+	return refcount_inc_not_zero(&m_data->ref);
+}
+
static void mshare_data_putref(struct mshare_data *m_data)
{
if (!refcount_dec_and_test(&m_data->ref))
@@ -543,6 +615,17 @@ msharefs_evict_inode(struct inode *inode)
if (!m_data)
goto out;
+ rcu_read_lock();
+
+ if (!list_empty(&m_data->list)) {
+ struct task_struct *owner = m_data->mm->owner;
+
+ task_lock(owner);
+ list_del_init(&m_data->list);
+ task_unlock(owner);
+ }
+ rcu_read_unlock();
+
mshare_data_putref(m_data);
out:
clear_inode(inode);
--
2.47.1
Powered by blists - more mailing lists