Message-ID: <ZkJJflk6Lhx3QSji@debug.ba.rivosinc.com>
Date: Mon, 13 May 2024 10:10:22 -0700
From: Deepak Gupta <debug@...osinc.com>
To: Alexandre Ghiti <alex@...ti.fr>
Cc: paul.walmsley@...ive.com, rick.p.edgecombe@...el.com,
broonie@...nel.org, Szabolcs.Nagy@....com, kito.cheng@...ive.com,
keescook@...omium.org, ajones@...tanamicro.com,
conor.dooley@...rochip.com, cleger@...osinc.com,
atishp@...shpatra.org, bjorn@...osinc.com, alexghiti@...osinc.com,
samuel.holland@...ive.com, conor@...nel.org,
linux-doc@...r.kernel.org, linux-riscv@...ts.infradead.org,
linux-kernel@...r.kernel.org, devicetree@...r.kernel.org,
linux-mm@...ck.org, linux-arch@...r.kernel.org,
linux-kselftest@...r.kernel.org, corbet@....net, palmer@...belt.com,
aou@...s.berkeley.edu, robh+dt@...nel.org,
krzysztof.kozlowski+dt@...aro.org, oleg@...hat.com,
akpm@...ux-foundation.org, arnd@...db.de, ebiederm@...ssion.com,
Liam.Howlett@...cle.com, vbabka@...e.cz, lstoakes@...il.com,
shuah@...nel.org, brauner@...nel.org, andy.chiu@...ive.com,
jerry.shih@...ive.com, hankuan.chen@...ive.com,
greentime.hu@...ive.com, evan@...osinc.com, xiao.w.wang@...el.com,
charlie@...osinc.com, apatel@...tanamicro.com,
mchitale@...tanamicro.com, dbarboza@...tanamicro.com,
sameo@...osinc.com, shikemeng@...weicloud.com, willy@...radead.org,
vincent.chen@...ive.com, guoren@...nel.org, samitolvanen@...gle.com,
songshuaishuai@...ylab.org, gerg@...nel.org, heiko@...ech.de,
bhe@...hat.com, jeeheng.sia@...rfivetech.com, cyy@...self.name,
maskray@...gle.com, ancientmodern4@...il.com,
mathis.salmen@...sal.de, cuiyunhui@...edance.com,
bgray@...ux.ibm.com, mpe@...erman.id.au, baruch@...s.co.il,
alx@...nel.org, david@...hat.com, catalin.marinas@....com,
revest@...omium.org, josh@...htriplett.org, shr@...kernel.io,
deller@....de, omosnace@...hat.com, ojeda@...nel.org,
jhubbard@...dia.com
Subject: Re: [PATCH v3 15/29] riscv/shstk: If needed allocate a new shadow
stack on clone
On Sun, May 12, 2024 at 07:05:27PM +0200, Alexandre Ghiti wrote:
>On 04/04/2024 01:35, Deepak Gupta wrote:
>>Userspace specifies VM_CLONE to share the address space and spawn a new thread.
>
>
>CLONE_VM?
Yes, I meant CLONE_VM, will fix it.
>
>
>>`clone` allows userspace to specify a new stack for the new thread. However,
>>there is no way to specify a new shadow stack base address without changing
>>the API. This patch allocates a new shadow stack whenever VM_CLONE is given.
>>
>>In case of VM_FORK, the parent is suspended until the child finishes and thus the
>
>
>You mean CLONE_VFORK here right?
Yes, I meant CLONE_VFORK, will fix it.
>
>
>>child can use the parent's shadow stack. In case of !VM_CLONE, COW kicks in
>>because the entire address space is copied from parent to child.
>>
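(Just for context, not part of the patch: a rough userspace sketch of the
CLONE_VFORK case described above. The child borrows the parent's stack, and
with this patch the parent's shadow stack as well, only until it execs or
exits, while the parent stays suspended.)

#include <sys/types.h>
#include <unistd.h>

static void spawn(char *const argv[], char *const envp[])
{
	pid_t pid = vfork();

	if (pid == 0) {
		/* Child: runs briefly on the parent's stack / shadow stack. */
		execve(argv[0], argv, envp);
		_exit(127);	/* only reached if execve() fails */
	}
	/* Parent resumes here once the child has exec'd or exited. */
}
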
>>`clone3` is extensible and can provide mechanisms using which shadow stack
>>as an input parameter can be provided. This is not settled yet and being
>>extensively discussed on mailing list. Once that's settled, this commit
>>will adapt to that.
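To make the current userspace view concrete (a sketch only, not part of the
patch): with the unextended clone3 ABI a thread is created roughly as below.
struct clone_args lets the caller pass a new stack, but there is no shadow
stack field yet, so with this patch the kernel allocates the shadow stack
implicitly whenever CLONE_VM is set. Raw syscall(2) is used since libc may
not provide a clone3 wrapper.

#define _GNU_SOURCE
#include <linux/sched.h>	/* struct clone_args, CLONE_* flags */
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>
#include <string.h>

static pid_t spawn_thread(void *stack, size_t stack_size)
{
	struct clone_args args;

	memset(&args, 0, sizeof(args));
	args.flags      = CLONE_VM | CLONE_FS | CLONE_FILES |
			  CLONE_SIGHAND | CLONE_THREAD | CLONE_SYSVSEM;
	args.stack      = (__u64)(uintptr_t)stack;	/* new user stack */
	args.stack_size = stack_size;
	/*
	 * No shadow stack field exists here; with this patch the kernel
	 * allocates a new shadow stack for the child because CLONE_VM is set.
	 * A real thread library would also make the child jump to its thread
	 * entry function instead of returning here.
	 */
	return syscall(__NR_clone3, &args, sizeof(args));
}
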
>>
>>Signed-off-by: Deepak Gupta <debug@...osinc.com>
>>---
>> arch/riscv/include/asm/usercfi.h | 39 ++++++++++
>> arch/riscv/kernel/process.c | 12 ++-
>> arch/riscv/kernel/usercfi.c | 121 +++++++++++++++++++++++++++++++
>> 3 files changed, 171 insertions(+), 1 deletion(-)
>>
>>diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
>>index 4fa201b4fc4e..b47574a7a8c9 100644
>>--- a/arch/riscv/include/asm/usercfi.h
>>+++ b/arch/riscv/include/asm/usercfi.h
>>@@ -8,6 +8,9 @@
>> #ifndef __ASSEMBLY__
>> #include <linux/types.h>
>>+struct task_struct;
>>+struct kernel_clone_args;
>>+
>> #ifdef CONFIG_RISCV_USER_CFI
>> struct cfi_status {
>> unsigned long ubcfi_en : 1; /* Enable for backward cfi. */
>>@@ -17,6 +20,42 @@ struct cfi_status {
>> unsigned long shdw_stk_size; /* size of shadow stack */
>> };
>>+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
>>+ const struct kernel_clone_args *args);
>>+void shstk_release(struct task_struct *tsk);
>>+void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size);
>>+void set_active_shstk(struct task_struct *task, unsigned long shstk_addr);
>>+bool is_shstk_enabled(struct task_struct *task);
>>+
>>+#else
>>+
>>+static inline unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
>>+ const struct kernel_clone_args *args)
>>+{
>>+ return 0;
>>+}
>>+
>>+static inline void shstk_release(struct task_struct *tsk)
>>+{
>>+
>>+}
>>+
>>+static inline void set_shstk_base(struct task_struct *task, unsigned long shstk_addr,
>>+ unsigned long size)
>>+{
>>+
>>+}
>>+
>>+static inline void set_active_shstk(struct task_struct *task, unsigned long shstk_addr)
>>+{
>>+
>>+}
>>+
>>+static inline bool is_shstk_enabled(struct task_struct *task)
>>+{
>>+ return false;
>>+}
>>+
>> #endif /* CONFIG_RISCV_USER_CFI */
>> #endif /* __ASSEMBLY__ */
>>diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
>>index ce577cdc2af3..ef48a25b0eff 100644
>>--- a/arch/riscv/kernel/process.c
>>+++ b/arch/riscv/kernel/process.c
>>@@ -26,6 +26,7 @@
>> #include <asm/cpuidle.h>
>> #include <asm/vector.h>
>> #include <asm/cpufeature.h>
>>+#include <asm/usercfi.h>
>> register unsigned long gp_in_global __asm__("gp");
>>@@ -202,7 +203,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
>> void exit_thread(struct task_struct *tsk)
>> {
>>-
>>+ if (IS_ENABLED(CONFIG_RISCV_USER_CFI))
>>+ shstk_release(tsk);
>> }
>> int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
>>@@ -210,6 +212,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
>> unsigned long clone_flags = args->flags;
>> unsigned long usp = args->stack;
>> unsigned long tls = args->tls;
>>+ unsigned long ssp = 0;
>> struct pt_regs *childregs = task_pt_regs(p);
>> memset(&p->thread.s, 0, sizeof(p->thread.s));
>>@@ -225,11 +228,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
>> p->thread.s[0] = (unsigned long)args->fn;
>> p->thread.s[1] = (unsigned long)args->fn_arg;
>> } else {
>>+ /* Allocate a new shadow stack if needed; in the CLONE_VM case we have to. */
>>+ ssp = shstk_alloc_thread_stack(p, args);
>>+ if (IS_ERR_VALUE(ssp))
>>+ return PTR_ERR((void *)ssp);
>>+
>> *childregs = *(current_pt_regs());
>> /* Turn off status.VS */
>> riscv_v_vstate_off(childregs);
>> if (usp) /* User fork */
>> childregs->sp = usp;
>>+ if (ssp) /* if needed, set new ssp */
>>+ set_active_shstk(p, ssp);
>> if (clone_flags & CLONE_SETTLS)
>> childregs->tp = tls;
>> childregs->a0 = 0; /* Return value of fork() */
>>diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
>>index c4ed0d4e33d6..11ef7ab925c9 100644
>>--- a/arch/riscv/kernel/usercfi.c
>>+++ b/arch/riscv/kernel/usercfi.c
>>@@ -19,6 +19,41 @@
>> #define SHSTK_ENTRY_SIZE sizeof(void *)
>>+bool is_shstk_enabled(struct task_struct *task)
>>+{
>>+ return task->thread_info.user_cfi_state.ubcfi_en ? true : false;
>>+}
>>+
>>+void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size)
>>+{
>>+ task->thread_info.user_cfi_state.shdw_stk_base = shstk_addr;
>>+ task->thread_info.user_cfi_state.shdw_stk_size = size;
>>+}
>>+
>>+unsigned long get_shstk_base(struct task_struct *task, unsigned long *size)
>>+{
>>+ if (size)
>>+ *size = task->thread_info.user_cfi_state.shdw_stk_size;
>>+ return task->thread_info.user_cfi_state.shdw_stk_base;
>>+}
>>+
>>+void set_active_shstk(struct task_struct *task, unsigned long shstk_addr)
>>+{
>>+ task->thread_info.user_cfi_state.user_shdw_stk = shstk_addr;
>>+}
>>+
>>+/*
>>+ * If size is 0, then to be compatible with the regular stack we want it to be as big
>>+ * as the regular stack. Else, PAGE_ALIGN it and return it.
>>+ */
>>+static unsigned long calc_shstk_size(unsigned long size)
>>+{
>>+ if (size)
>>+ return PAGE_ALIGN(size);
>>+
>>+ return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
>>+}
>>+
>> /*
>> * Writes on shadow stack can either be `sspush` or `ssamoswap`. `sspush` can happen
>> * implicitly on current shadow stack pointed to by CSR_SSP. `ssamoswap` takes pointer to
>>@@ -147,3 +182,89 @@ SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsi
>> return allocate_shadow_stack(addr, aligned_size, size, set_tok);
>> }
>>+
>>+/*
>>+ * This gets called during clone/clone3/fork and is needed to allocate a shadow stack
>>+ * for cases where CLONE_VM is specified and thus a different stack is specified by the
>>+ * user. We thus need a separate shadow stack too. How a separate shadow stack is
>>+ * specified by the user is still being debated. Once that's settled, remove this part
>>+ * of the comment. This function simply returns 0 if shadow stacks are not supported or
>>+ * if a separate shadow stack allocation is not needed (like in the case of !CLONE_VM).
>>+ */
>>+unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
>>+ const struct kernel_clone_args *args)
>>+{
>>+ unsigned long addr, size;
>>+
>>+ /* If shadow stack is not supported, return 0 */
>>+ if (!cpu_supports_shadow_stack())
>>+ return 0;
>>+
>>+ /*
>>+ * If shadow stack is not enabled on the new thread, skip any
>>+ * switch to a new shadow stack.
>>+ */
>>+ if (!is_shstk_enabled(tsk))
>>+ return 0;
>>+
>>+ /*
>>+ * For CLONE_VFORK the child will share the parent's shadow stack.
>>+ * Set base = 0 and size = 0; this is a special means to track this state
>>+ * so the freeing logic run for the child knows to leave it alone.
>>+ */
>>+ if (args->flags & CLONE_VFORK) {
>>+ set_shstk_base(tsk, 0, 0);
>>+ return 0;
>>+ }
>>+
>>+ /*
>>+ * For !CLONE_VM the child will use a copy of the parent's shadow
>>+ * stack.
>>+ */
>>+ if (!(args->flags & CLONE_VM))
>>+ return 0;
>>+
>>+ /*
>>+ * Reaching here means CLONE_VM was specified and thus a separate shadow
>>+ * stack is needed for the newly cloned thread. Note: the allocation below
>>+ * happens using the current mm.
>>+ */
>>+ size = calc_shstk_size(args->stack_size);
>>+ addr = allocate_shadow_stack(0, size, 0, false);
>>+ if (IS_ERR_VALUE(addr))
>>+ return addr;
>>+
>>+ set_shstk_base(tsk, addr, size);
>>+
>>+ return addr + size;
>>+}
>>+
>>+void shstk_release(struct task_struct *tsk)
>>+{
>>+ unsigned long base = 0, size = 0;
>>+ /* If shadow stack is not supported or not enabled, nothing to release */
>>+ if (!cpu_supports_shadow_stack() ||
>>+ !is_shstk_enabled(tsk))
>>+ return;
>>+
>>+ /*
>>+ * When fork() with CLONE_VM fails, the child (tsk) already has a
>>+ * shadow stack allocated, and exit_thread() calls this function to
>>+ * free it. In this case the parent (current) and the child share
>>+ * the same mm struct. Move forward only when they're the same.
>>+ */
>>+ if (!tsk->mm || tsk->mm != current->mm)
>>+ return;
>>+
>>+ /*
>>+ * We know shadow stack is enabled but if base is NULL, then
>>+ * this task is not managing its own shadow stack (CLONE_VFORK). So
>>+ * skip freeing it.
>>+ */
>>+ base = get_shstk_base(tsk, &size);
>>+ if (!base)
>>+ return;
>>+
>>+ vm_munmap(base, size);
>>+ set_shstk_base(tsk, 0, 0);
>>+}