lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <2b5378f3686fd2831468e65c49609fbb19072b43.1748594840.git.libo.gcs85@bytedance.com>
Date: Fri, 30 May 2025 17:27:37 +0800
From: Bo Li <libo.gcs85@...edance.com>
To: tglx@...utronix.de,
	mingo@...hat.com,
	bp@...en8.de,
	dave.hansen@...ux.intel.com,
	x86@...nel.org,
	luto@...nel.org,
	kees@...nel.org,
	akpm@...ux-foundation.org,
	david@...hat.com,
	juri.lelli@...hat.com,
	vincent.guittot@...aro.org,
	peterz@...radead.org
Cc: dietmar.eggemann@....com,
	hpa@...or.com,
	acme@...nel.org,
	namhyung@...nel.org,
	mark.rutland@....com,
	alexander.shishkin@...ux.intel.com,
	jolsa@...nel.org,
	irogers@...gle.com,
	adrian.hunter@...el.com,
	kan.liang@...ux.intel.com,
	viro@...iv.linux.org.uk,
	brauner@...nel.org,
	jack@...e.cz,
	lorenzo.stoakes@...cle.com,
	Liam.Howlett@...cle.com,
	vbabka@...e.cz,
	rppt@...nel.org,
	surenb@...gle.com,
	mhocko@...e.com,
	rostedt@...dmis.org,
	bsegall@...gle.com,
	mgorman@...e.de,
	vschneid@...hat.com,
	jannh@...gle.com,
	pfalcato@...e.de,
	riel@...riel.com,
	harry.yoo@...cle.com,
	linux-kernel@...r.kernel.org,
	linux-perf-users@...r.kernel.org,
	linux-fsdevel@...r.kernel.org,
	linux-mm@...ck.org,
	duanxiongchun@...edance.com,
	yinhongbo@...edance.com,
	dengliang.1214@...edance.com,
	xieyongji@...edance.com,
	chaiwen.cc@...edance.com,
	songmuchun@...edance.com,
	yuanzhu@...edance.com,
	chengguozhu@...edance.com,
	sunjiadong.lff@...edance.com,
	Bo Li <libo.gcs85@...edance.com>
Subject: [RFC v2 09/35] RPAL: enable address space sharing

RPAL implements memory sharing by copying p4d entries, which requires
implementing the corresponding interfaces. Meanwhile, copying p4d entries
can leave a process's page table containing p4d entries that do not belong
to it, so RPAL needs to resolve the compatibility issues this causes with
other kernel subsystems.

This patch implements the rpal_map_service() interface to complete the
mutual copying of p4d entries between two RPAL services. For the copied p4d
entries, RPAL adds a _PAGE_RPAL_IGN flag to them. This flag makes
p4d_none() return true and p4d_present() return false, ensuring that these
p4d entries are invisible to other kernel subsystems. The protection of p4d
entries is guaranteed by the memory balloon, which ensures that the address
space corresponding to the p4d entries is not used by the current service.

Signed-off-by: Bo Li <libo.gcs85@...edance.com>
---
 arch/x86/include/asm/pgtable.h       |  25 ++++
 arch/x86/include/asm/pgtable_types.h |  11 ++
 arch/x86/rpal/internal.h             |   2 +
 arch/x86/rpal/mm.c                   | 175 +++++++++++++++++++++++++++
 4 files changed, 213 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5ddba366d3b4..54351bfe4e47 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1137,12 +1137,37 @@ static inline int pud_bad(pud_t pud)
 #if CONFIG_PGTABLE_LEVELS > 3
 static inline int p4d_none(p4d_t p4d)
 {
+#if IS_ENABLED(CONFIG_RPAL)
+	p4dval_t p4dv = native_p4d_val(p4d);
+
+	/*
+	 * RPAL copies p4d entries between services to share address
+	 * space.  Other kernel code must never manipulate a copied
+	 * entry, so any entry tagged _PAGE_RPAL_IGN is reported as
+	 * "none" (p4d_none() returns true), hiding it from generic
+	 * page-table logic.
+	 */
+	return (p4dv & _PAGE_RPAL_IGN) ||
+	       ((p4dv & ~(_PAGE_KNL_ERRATUM_MASK)) == 0);
+#else
 	return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
+#endif
 }
 
 static inline int p4d_present(p4d_t p4d)
 {
+#if IS_ENABLED(CONFIG_RPAL)
+	p4dval_t p4df = p4d_flags(p4d);
+
+	/*
+	 * RPAL copies p4d entries between services to share address
+	 * space.  A copied entry carries _PAGE_RPAL_IGN; report it as
+	 * not present so no other kernel subsystem operates on it.
+	 */
+	return ((p4df & (_PAGE_PRESENT | _PAGE_RPAL_IGN)) == _PAGE_PRESENT);
+#else
 	return p4d_flags(p4d) & _PAGE_PRESENT;
+#endif
 }
 
 static inline pud_t *p4d_pgtable(p4d_t p4d)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b74ec5c3643b..781b0f5bc359 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -35,6 +35,13 @@
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_KERNEL_4K	_PAGE_BIT_SOFTW3 /* page must not be converted to large */
 #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
+/*
+ * _PAGE_BIT_SOFTW1 is also used by _PAGE_BIT_SPECIAL, but there is
+ * no conflict: RPAL uses this bit only at the p4d/pud level, while
+ * _PAGE_BIT_SPECIAL is used only at the pte level.
+ */
+#define _PAGE_BIT_RPAL_IGN	_PAGE_BIT_SOFTW1
 
 #ifdef CONFIG_X86_64
 #define _PAGE_BIT_SAVED_DIRTY	_PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */
@@ -95,6 +102,10 @@
 #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
+#if IS_ENABLED(CONFIG_RPAL)
+#define _PAGE_RPAL_IGN	(_AT(pteval_t, 1) << _PAGE_BIT_RPAL_IGN)
+#endif
+
 /*
  * Tracking soft dirty bit when a page goes to a swap is tricky.
  * We need a bit which can be stored in pte _and_ not conflict
diff --git a/arch/x86/rpal/internal.h b/arch/x86/rpal/internal.h
index 3559c9c6e868..65f2cf4baf8f 100644
--- a/arch/x86/rpal/internal.h
+++ b/arch/x86/rpal/internal.h
@@ -34,6 +34,8 @@ static inline void rpal_put_shared_page(struct rpal_shared_page *rsp)
 int rpal_mmap(struct file *filp, struct vm_area_struct *vma);
 struct rpal_shared_page *rpal_find_shared_page(struct rpal_service *rs,
 					       unsigned long addr);
+int rpal_map_service(struct rpal_service *tgt);
+void rpal_unmap_service(struct rpal_service *tgt);
 
 /* thread.c */
 int rpal_register_sender(unsigned long addr);
diff --git a/arch/x86/rpal/mm.c b/arch/x86/rpal/mm.c
index 8a738c502d1d..f1003baae001 100644
--- a/arch/x86/rpal/mm.c
+++ b/arch/x86/rpal/mm.c
@@ -215,3 +215,178 @@ void rpal_exit_mmap(struct mm_struct *mm)
 		rpal_put_service(rs);
 	}
 }
+
+/*
+ * Since the user address space size of rpal process is 512G, which
+ * is the size of one p4d, we assume p4d entry will never change after
+ * rpal process is created.
+ */
+/*
+ * mm_link_p4d - install a copy of @src_p4d into @dst_mm at @addr.
+ *
+ * The copied entry is tagged _PAGE_RPAL_IGN so that p4d_none() /
+ * p4d_present() hide it from the rest of the kernel.  Called with no
+ * locks held; takes dst_mm's mmap_lock (write) and page_table_lock.
+ *
+ * Returns 0 on success, -EINVAL if the destination pgd entry is
+ * missing or the destination p4d slot is already populated.
+ */
+static int mm_link_p4d(struct mm_struct *dst_mm, p4d_t src_p4d,
+		       unsigned long addr)
+{
+	spinlock_t *dst_ptl = &dst_mm->page_table_lock;
+	unsigned long flags;
+	pgd_t *dst_pgdp;
+	p4d_t p4d, *dst_p4dp;
+	p4dval_t p4dv;
+	int ret = 0;
+
+	/* Sharing a whole p4d requires a real p4d level in the build. */
+	BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
+
+	mmap_write_lock(dst_mm);
+	spin_lock_irqsave(dst_ptl, flags);
+	dst_pgdp = pgd_offset(dst_mm, addr);
+	/*
+	 * The destination pgd entry must already exist; otherwise we
+	 * would need to allocate one here (and free it again when
+	 * src_p4d is freed).  That should be supported in the future.
+	 */
+	if (unlikely(pgd_none_or_clear_bad(dst_pgdp))) {
+		rpal_err("cannot find pgd entry for addr 0x%016lx\n", addr);
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	dst_p4dp = p4d_offset(dst_pgdp, addr);
+	if (unlikely(!p4d_none_or_clear_bad(dst_p4dp))) {
+		rpal_err("p4d is previously mapped\n");
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	p4dv = p4d_val(src_p4d);
+
+	/*
+	 * Tag the copied entry so p4d_present() and p4d_none() ignore
+	 * it: other processes must never manipulate a copied p4d.
+	 */
+	p4dv |= _PAGE_RPAL_IGN;
+
+	/*
+	 * NOTE(review): with PTI the copy clears _PAGE_NX — presumably
+	 * required by the PTI page-table layout; confirm the rationale.
+	 */
+	if (boot_cpu_has(X86_FEATURE_PTI))
+		p4d = native_make_p4d((~_PAGE_NX) & p4dv);
+	else
+		p4d = native_make_p4d(p4dv);
+
+	set_p4d(dst_p4dp, p4d);
+	spin_unlock_irqrestore(dst_ptl, flags);
+	mmap_write_unlock(dst_mm);
+
+	return 0;
+unlock:
+	spin_unlock_irqrestore(dst_ptl, flags);
+	mmap_write_unlock(dst_mm);
+	return ret;
+}
+
+/*
+ * mm_unlink_p4d - clear the (copied) p4d entry covering @addr in @mm.
+ *
+ * Takes mm's mmap_lock (write) and page_table_lock, then flushes the
+ * TLB for the whole mm after dropping the locks so stale translations
+ * of the unlinked range cannot be used.
+ */
+static void mm_unlink_p4d(struct mm_struct *mm, unsigned long addr)
+{
+	spinlock_t *ptl = &mm->page_table_lock;
+	unsigned long flags;
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+
+	mmap_write_lock(mm);
+	spin_lock_irqsave(ptl, flags);
+	pgdp = pgd_offset(mm, addr);
+	p4dp = p4d_offset(pgdp, addr);
+	p4d_clear(p4dp);
+	spin_unlock_irqrestore(ptl, flags);
+	mmap_write_unlock(mm);
+
+	flush_tlb_mm(mm);
+}
+
+/*
+ * get_mm_p4d - read the p4d entry covering @addr in @mm into @srcp.
+ *
+ * Only mm->page_table_lock is held here, not mmap_lock; this relies
+ * on the assumption stated above that an RPAL process's p4d entry
+ * never changes after the process is created — TODO confirm that no
+ * caller needs mmap_lock protection as well.
+ *
+ * Returns 0 on success, -EINVAL if no valid pgd/p4d entry exists.
+ */
+static int get_mm_p4d(struct mm_struct *mm, unsigned long addr, p4d_t *srcp)
+{
+	spinlock_t *ptl;
+	unsigned long flags;
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+	int ret = 0;
+
+	ptl = &mm->page_table_lock;
+	spin_lock_irqsave(ptl, flags);
+	pgdp = pgd_offset(mm, addr);
+	if (pgd_none(*pgdp)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	p4dp = p4d_offset(pgdp, addr);
+	if (p4d_none(*p4dp) || p4d_bad(*p4dp)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	*srcp = *p4dp;
+
+out:
+	spin_unlock_irqrestore(ptl, flags);
+
+	return ret;
+}
+
+/*
+ * rpal_map_service - establish mutual address-space sharing with @tgt.
+ *
+ * Copies the target service's base p4d entry into the current mm and
+ * vice versa, so each service can see the other's 512G region.  Takes
+ * a temporary reference on tgt->mm (mmget_not_zero) so the target mm
+ * cannot be torn down while we link into it.
+ *
+ * On failure of the second link, the first link is rolled back so the
+ * operation is all-or-nothing.  Returns 0 on success or a negative
+ * errno (-EINVAL from the lookup/link helpers).
+ */
+int rpal_map_service(struct rpal_service *tgt)
+{
+	struct rpal_service *cur = rpal_current_service();
+	struct mm_struct *cur_mm, *tgt_mm;
+	unsigned long cur_addr, tgt_addr;
+	p4d_t cur_p4d, tgt_p4d;
+	int ret = 0;
+
+	cur_mm = current->mm;
+	tgt_mm = tgt->mm;
+	if (!mmget_not_zero(tgt_mm)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	cur_addr = rpal_get_base(cur);
+	tgt_addr = rpal_get_base(tgt);
+
+	ret = get_mm_p4d(tgt_mm, tgt_addr, &tgt_p4d);
+	if (ret)
+		goto put_tgt;
+
+	ret = get_mm_p4d(cur_mm, cur_addr, &cur_p4d);
+	if (ret)
+		goto put_tgt;
+
+	/* Link target's region into us, then ours into the target. */
+	ret = mm_link_p4d(cur_mm, tgt_p4d, tgt_addr);
+	if (ret)
+		goto put_tgt;
+
+	ret = mm_link_p4d(tgt_mm, cur_p4d, cur_addr);
+	if (ret) {
+		/* Roll back the first link to keep state symmetric. */
+		mm_unlink_p4d(cur_mm, tgt_addr);
+		goto put_tgt;
+	}
+
+put_tgt:
+	mmput(tgt_mm);
+out:
+	return ret;
+}
+
+/*
+ * rpal_unmap_service - tear down mutual address-space sharing with @tgt.
+ *
+ * Removes our p4d entry from the target's page table (if the target mm
+ * is still alive) and the target's entry from ours.  Mirrors
+ * rpal_map_service(): both sides address the shared regions via
+ * rpal_get_base().
+ */
+void rpal_unmap_service(struct rpal_service *tgt)
+{
+	struct rpal_service *cur = rpal_current_service();
+	struct mm_struct *cur_mm, *tgt_mm;
+	unsigned long cur_addr, tgt_addr;
+
+	cur_mm = current->mm;
+	tgt_mm = tgt->mm;
+
+	cur_addr = rpal_get_base(cur);
+	tgt_addr = rpal_get_base(tgt);
+
+	if (mmget_not_zero(tgt_mm)) {
+		mm_unlink_p4d(tgt_mm, cur_addr);
+		mmput(tgt_mm);
+	} else {
+		/* If tgt has exited, then we get a NULL tgt_mm */
+		pr_debug("rpal: [%d] cannot find target mm\n", current->pid);
+	}
+	/*
+	 * Use tgt_addr (rpal_get_base(tgt)) rather than tgt->base: the
+	 * cur_mm side was linked at rpal_get_base(tgt) in
+	 * rpal_map_service(), and tgt_addr was otherwise computed but
+	 * unused.
+	 */
+	mm_unlink_p4d(cur_mm, tgt_addr);
+}
-- 
2.20.1


Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ