linux-kernel - Re: [PATCH v10 09/13] x86/um: nommu: signal handling

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <m2zfdb38cn.wl-thehajime@gmail.com>
Date: Fri, 11 Jul 2025 08:59:52 +0900
From: Hajime Tazaki <thehajime@...il.com>
To: benjamin@...solutions.net
Cc: linux-um@...ts.infradead.org,
	ricarkol@...gle.com,
	Liam.Howlett@...cle.com,
	linux-kernel@...r.kernel.org
Subject: Re: [PATCH v10 09/13] x86/um: nommu: signal handling


Hello Benjamin,

below is the updated patch of this patch.
I didn't follow your suggestion to use host handler to execute
userspace handlers.  instead, setup stack and %rip register to call
userspace handlers at the end of the host handler.

It would be great to hear your opinion.

---
 arch/um/include/shared/kern_util.h      |   4 +
 arch/um/nommu/Makefile                  |   2 +-
 arch/um/nommu/os-Linux/signal.c         |   8 +
 arch/um/nommu/trap.c                    | 201 ++++++++++++++++++++++++
 arch/um/os-Linux/signal.c               |   3 +-
 arch/x86/um/nommu/do_syscall_64.c       |   6 +
 arch/x86/um/nommu/entry_64.S            |  14 ++
 arch/x86/um/nommu/os-Linux/mcontext.c   |   5 +
 arch/x86/um/shared/sysdep/mcontext.h    |   1 +
 arch/x86/um/shared/sysdep/ptrace.h      |   2 +-
 arch/x86/um/shared/sysdep/syscalls_64.h |   1 +
 11 files changed, 244 insertions(+), 3 deletions(-)
 create mode 100644 arch/um/nommu/trap.c

diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h
index ec8ba1f13c58..7f55402b6385 100644
--- a/arch/um/include/shared/kern_util.h
+++ b/arch/um/include/shared/kern_util.h
@@ -73,4 +73,8 @@ void um_idle_sleep(void);
 
 void kasan_map_memory(void *start, size_t len);
 
+#ifndef CONFIG_MMU
+extern void nommu_relay_signal(void *ptr);
+#endif
+
 #endif
diff --git a/arch/um/nommu/Makefile b/arch/um/nommu/Makefile
index baab7c2f57c2..096221590cfd 100644
--- a/arch/um/nommu/Makefile
+++ b/arch/um/nommu/Makefile
@@ -1,3 +1,3 @@
 # SPDX-License-Identifier: GPL-2.0
 
-obj-y := os-Linux/
+obj-y := trap.o os-Linux/
diff --git a/arch/um/nommu/os-Linux/signal.c b/arch/um/nommu/os-Linux/signal.c
index 19043b9652e2..27b6b37744b7 100644
--- a/arch/um/nommu/os-Linux/signal.c
+++ b/arch/um/nommu/os-Linux/signal.c
@@ -5,6 +5,7 @@
 #include <os.h>
 #include <sysdep/mcontext.h>
 #include <sys/ucontext.h>
+#include <as-layout.h>
 
 void sigsys_handler(int sig, struct siginfo *si,
 		    struct uml_pt_regs *regs, void *ptr)
@@ -14,3 +15,10 @@ void sigsys_handler(int sig, struct siginfo *si,
 	/* hook syscall via SIGSYS */
 	set_mc_sigsys_hook(mc);
 }
+
+void nommu_relay_signal(void *ptr)
+{
+	mcontext_t *mc = (mcontext_t *) ptr;
+
+	set_mc_return_address(mc);
+}
diff --git a/arch/um/nommu/trap.c b/arch/um/nommu/trap.c
new file mode 100644
index 000000000000..430297517455
--- /dev/null
+++ b/arch/um/nommu/trap.c
@@ -0,0 +1,201 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/mm.h>
+#include <linux/sched/signal.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/sched/debug.h>
+#include <asm/current.h>
+#include <asm/tlbflush.h>
+#include <arch.h>
+#include <as-layout.h>
+#include <kern_util.h>
+#include <os.h>
+#include <skas.h>
+
+/*
+ * Note this is constrained to return 0, -EFAULT, -EACCES, -ENOMEM by
+ * segv().
+ */
+int handle_page_fault(unsigned long address, unsigned long ip,
+		      int is_write, int is_user, int *code_out)
+{
+	/* !MMU has no pagefault */
+	return -EFAULT;
+}
+
+static void show_segv_info(struct uml_pt_regs *regs)
+{
+	struct task_struct *tsk = current;
+	struct faultinfo *fi = UPT_FAULTINFO(regs);
+
+	if (!unhandled_signal(tsk, SIGSEGV))
+		return;
+
+	pr_warn_ratelimited("%s%s[%d]: segfault at %lx ip %p sp %p error %x",
+			    task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
+			    tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
+			    (void *)UPT_IP(regs), (void *)UPT_SP(regs),
+			    fi->error_code);
+}
+
+static void bad_segv(struct faultinfo fi, unsigned long ip)
+{
+	current->thread.arch.faultinfo = fi;
+	force_sig_fault(SIGSEGV, SEGV_ACCERR, (void __user *) FAULT_ADDRESS(fi));
+}
+
+void fatal_sigsegv(void)
+{
+	force_fatal_sig(SIGSEGV);
+	do_signal(&current->thread.regs);
+	/*
+	 * This is to tell gcc that we're not returning - do_signal
+	 * can, in general, return, but in this case, it's not, since
+	 * we just got a fatal SIGSEGV queued.
+	 */
+	os_dump_core();
+}
+
+/**
+ * segv_handler() - the SIGSEGV handler
+ * @sig:	the signal number
+ * @unused_si:	the signal info struct; unused in this handler
+ * @regs:	the ptrace register information
+ *
+ * The handler first extracts the faultinfo from the UML ptrace regs struct.
+ * If the userfault did not happen in an UML userspace process, bad_segv is called.
+ * Otherwise the signal did happen in a cloned userspace process, handle it.
+ */
+void segv_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
+		  void *mc)
+{
+	struct faultinfo *fi = UPT_FAULTINFO(regs);
+
+	/* !MMU specific part; detection of userspace */
+	/* mark is_user=1 when the IP is from userspace code. */
+	if (UPT_IP(regs) > uml_reserved && UPT_IP(regs) < high_physmem)
+		regs->is_user = 1;
+
+	if (UPT_IS_USER(regs) && !SEGV_IS_FIXABLE(fi)) {
+		show_segv_info(regs);
+		bad_segv(*fi, UPT_IP(regs));
+		return;
+	}
+	segv(*fi, UPT_IP(regs), UPT_IS_USER(regs), regs, mc);
+
+	/* !MMU specific part; detection of userspace */
+	relay_signal(sig, unused_si, regs, mc);
+}
+
+/*
+ * We give a *copy* of the faultinfo in the regs to segv.
+ * This must be done, since nesting SEGVs could overwrite
+ * the info in the regs. A pointer to the info then would
+ * give us bad data!
+ */
+unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
+		   struct uml_pt_regs *regs, void *mc)
+{
+	int si_code;
+	int err;
+	int is_write = FAULT_WRITE(fi);
+	unsigned long address = FAULT_ADDRESS(fi);
+
+	if (!is_user && regs)
+		current->thread.segv_regs = container_of(regs, struct pt_regs, regs);
+
+	if (current->mm == NULL) {
+		show_regs(container_of(regs, struct pt_regs, regs));
+		panic("Segfault with no mm");
+	} else if (!is_user && address > PAGE_SIZE && address < TASK_SIZE) {
+		show_regs(container_of(regs, struct pt_regs, regs));
+		panic("Kernel tried to access user memory at addr 0x%lx, ip 0x%lx",
+		       address, ip);
+	}
+
+	if (SEGV_IS_FIXABLE(&fi))
+		err = handle_page_fault(address, ip, is_write, is_user,
+					&si_code);
+	else {
+		err = -EFAULT;
+		/*
+		 * A thread accessed NULL, we get a fault, but CR2 is invalid.
+		 * This code is used in __do_copy_from_user() of TT mode.
+		 * XXX tt mode is gone, so maybe this isn't needed any more
+		 */
+		address = 0;
+	}
+
+	if (!err)
+		goto out;
+	else if (!is_user && arch_fixup(ip, regs))
+		goto out;
+
+	if (!is_user) {
+		show_regs(container_of(regs, struct pt_regs, regs));
+		panic("Kernel mode fault at addr 0x%lx, ip 0x%lx",
+		      address, ip);
+	}
+
+	show_segv_info(regs);
+
+	if (err == -EACCES) {
+		current->thread.arch.faultinfo = fi;
+		force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
+	} else {
+		WARN_ON_ONCE(err != -EFAULT);
+		current->thread.arch.faultinfo = fi;
+		force_sig_fault(SIGSEGV, si_code, (void __user *) address);
+	}
+
+out:
+	if (regs)
+		current->thread.segv_regs = NULL;
+
+	return 0;
+}
+
+void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs,
+		  void *mc)
+{
+	int code, err;
+
+	/* !MMU specific part; detection of userspace */
+	/* mark is_user=1 when the IP is from userspace code. */
+	if (UPT_IP(regs) > uml_reserved && UPT_IP(regs) < high_physmem)
+		regs->is_user = 1;
+
+	if (!UPT_IS_USER(regs)) {
+		if (sig == SIGBUS)
+			pr_err("Bus error - the host /dev/shm or /tmp mount likely just ran out of space\n");
+		panic("Kernel mode signal %d", sig);
+	}
+	/* if is_user==1, set return to userspace sig handler to relay signal */
+	nommu_relay_signal(mc);
+
+	arch_examine_signal(sig, regs);
+
+	/* Is the signal layout for the signal known?
+	 * Signal data must be scrubbed to prevent information leaks.
+	 */
+	code = si->si_code;
+	err = si->si_errno;
+	if ((err == 0) && (siginfo_layout(sig, code) == SIL_FAULT)) {
+		struct faultinfo *fi = UPT_FAULTINFO(regs);
+
+		current->thread.arch.faultinfo = *fi;
+		force_sig_fault(sig, code, (void __user *)FAULT_ADDRESS(*fi));
+	} else {
+		pr_err("Attempted to relay unknown signal %d (si_code = %d) with errno %d\n",
+		       sig, code, err);
+		force_sig(sig);
+	}
+}
+
+void winch(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs,
+	   void *mc)
+{
+	do_IRQ(WINCH_IRQ, regs);
+}
diff --git a/arch/um/os-Linux/signal.c b/arch/um/os-Linux/signal.c
index 53e276e81b37..67dcd88b45b1 100644
--- a/arch/um/os-Linux/signal.c
+++ b/arch/um/os-Linux/signal.c
@@ -40,9 +40,10 @@ static void sig_handler_common(int sig, struct siginfo *si, mcontext_t *mc)
 	int save_errno = errno;
 
 	r.is_user = 0;
+	if (mc)
+		get_regs_from_mc(&r, mc);
 	if (sig == SIGSEGV) {
 		/* For segfaults, we want the data from the sigcontext. */
-		get_regs_from_mc(&r, mc);
 		GET_FAULTINFO_FROM_MC(r.faultinfo, mc);
 	}
 
diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c
index 74d5bcc4508d..d77e69e097c1 100644
--- a/arch/x86/um/nommu/do_syscall_64.c
+++ b/arch/x86/um/nommu/do_syscall_64.c
@@ -44,6 +44,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
 	/* set fs register to the original host one */
 	os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs);
 
+	/* save fp registers */
+	asm volatile("fxsaveq %0" : "=m"(*(struct _xstate *)regs->regs.fp));
+
 	if (likely(syscall < NR_syscalls)) {
 		PT_REGS_SET_SYSCALL_RETURN(regs,
 				EXECUTE_SYSCALL(syscall, regs));
@@ -54,6 +57,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
 	/* handle tasks and signals at the end */
 	interrupt_end();
 
+	/* restore fp registers */
+	asm volatile("fxrstorq %0" : : "m"((current->thread.regs.regs.fp)));
+
 	/* restore back fs register to userspace configured one */
 	os_x86_arch_prctl(0, ARCH_SET_FS,
 		      (void *)(current->thread.regs.regs.gp[FS_BASE
diff --git a/arch/x86/um/nommu/entry_64.S b/arch/x86/um/nommu/entry_64.S
index 950447dfa66b..e038bc7b53ac 100644
--- a/arch/x86/um/nommu/entry_64.S
+++ b/arch/x86/um/nommu/entry_64.S
@@ -111,3 +111,17 @@ ENTRY(userspace)
 	jmp	*%r11
 
 END(userspace)
+
+/*
+ * this routine prepares the stack to return via host-generated
+ * signals (e.g., SEGV, FPE) via do_signal() from interrupt_end().
+ */
+ENTRY(__prep_sigreturn)
+	/*
+	 * Switch to current top of stack, so "current->" points
+	 * to the right task.
+	 */
+	movq	current_top_of_stack, %rsp
+
+	jmp	userspace
+END(__prep_sigreturn)
diff --git a/arch/x86/um/nommu/os-Linux/mcontext.c b/arch/x86/um/nommu/os-Linux/mcontext.c
index c4ef877d5ea0..87fb2a35e7ff 100644
--- a/arch/x86/um/nommu/os-Linux/mcontext.c
+++ b/arch/x86/um/nommu/os-Linux/mcontext.c
@@ -6,6 +6,11 @@
 #include <sysdep/mcontext.h>
 #include <sysdep/syscalls.h>
 
+void set_mc_return_address(mcontext_t *mc)
+{
+	mc->gregs[REG_RIP] = (unsigned long) __prep_sigreturn;
+}
+
 void set_mc_sigsys_hook(mcontext_t *mc)
 {
 	mc->gregs[REG_RCX] = mc->gregs[REG_RIP];
diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h
index 9a0d6087f357..de4041b758f3 100644
--- a/arch/x86/um/shared/sysdep/mcontext.h
+++ b/arch/x86/um/shared/sysdep/mcontext.h
@@ -19,6 +19,7 @@ extern int set_stub_state(struct uml_pt_regs *regs, struct stub_data *data,
 
 #ifndef CONFIG_MMU
 extern void set_mc_sigsys_hook(mcontext_t *mc);
+extern void set_mc_return_address(mcontext_t *mc);
 #endif
 
 #ifdef __i386__
diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h
index 8f7476ff6e95..7d553d9f05be 100644
--- a/arch/x86/um/shared/sysdep/ptrace.h
+++ b/arch/x86/um/shared/sysdep/ptrace.h
@@ -65,7 +65,7 @@ struct uml_pt_regs {
 	int is_user;
 
 	/* Dynamically sized FP registers (holds an XSTATE) */
-	unsigned long fp[];
+	unsigned long fp[] __attribute__((aligned(16)));
 };
 
 #define EMPTY_UML_PT_REGS { }
diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
index ffd80ee3b9dc..bd152422cdfb 100644
--- a/arch/x86/um/shared/sysdep/syscalls_64.h
+++ b/arch/x86/um/shared/sysdep/syscalls_64.h
@@ -29,6 +29,7 @@ extern syscall_handler_t sys_arch_prctl;
 extern void do_syscall_64(struct pt_regs *regs);
 extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
 			      int64_t a4, int64_t a5, int64_t a6);
+extern void __prep_sigreturn(void);
 #endif
 
 #endif
-- 
2.43.0



thanks, and have a nice day,
-- Hajime

On Wed, 02 Jul 2025 13:37:50 +0900,
Hajime Tazaki wrote:
> 
> 
> Hello Benjamin,
> 
> On Tue, 01 Jul 2025 21:03:36 +0900,
> Benjamin Berg wrote:
> > 
> > Hi Hajim,
> > 
> > On Mon, 2025-06-30 at 10:04 +0900, Hajime Tazaki wrote:
> > > 
> > > Hello Benjamin,
> > > 
> > > On Sat, 28 Jun 2025 00:02:05 +0900,
> > > Benjamin Berg wrote:
> > > > 
> > > > Hi,
> > > > 
> > > > On Fri, 2025-06-27 at 22:50 +0900, Hajime Tazaki wrote:
> > > > > thanks for the comment on the complicated part of the kernel (signal).
> > > > 
> > > > This stuff isn't simple.
> > > > 
> > > > Actually, I am starting to think that the current MMU UML kernel also
> > > > needs a redesign with regard to signal handling and stack use in that
> > > > case. My current impression is that the design right now only permits
> > > > voluntarily scheduling. More specifically, scheduling in response to an
> > > > interrupt is impossible.
> > > > 
> > > > I suppose that works fine, but it also does not seem quite right.
> > > 
> > > thanks for the info.  it's very useful to understand what's going on.
> > > 
> > > (snip)
> > > 
> > > > > > > +void set_mc_userspace_relay_signal(mcontext_t *mc)
> > > > > > > +{
> > > > > > > + mc->gregs[REG_RIP] = (unsigned long) __userspace_relay_signal;
> > > > > > > +}
> > > > > > > +
> > > > > 
> > > > > This is a bit scary code which I tried to handle when SIGSEGV is
> > > > > raised by host for a userspace program running on UML (nommu).
> > > > > 
> > > > > # and I should remember my XXX tag is important to fix....
> > > > > 
> > > > > let me try to explain what happens and what I tried to solve.
> > > > > 
> > > > > The SEGV signal from userspace program is delivered to userspace but
> > > > > if we don't fix the code raising the signal, after (um) rt_sigreturn,
> > > > > it will restart from $rip and raise SIGSEGV again.
> > > > > 
> > > > > # so, yes, we've already relied on host and um's rt_sigreturn to
> > > > >   restore various things.
> > > > > 
> > > > > when a uml userspace crashes with SIGSEGV,
> > > > > 
> > > > > - host kernel raises SIGSEGV (at original $rip)
> > > > > - caught by uml process (hard_handler)
> > > > > - raise a signal to uml userspace process (segv_handler)
> > > > > - handler ends (hard_handler)
> > > > > - (host) run restorer (rt_sigreturn, registered by (libc)sigaction,
> > > > >   not (host) rt_sigaction)
> > > > > - return back to the original $rip
> > > > > - (back to top)
> > > > > 
> > > > > this is the case where endless loop is happened.
> > > > > um's sa_handler isn't called as rt_sigreturn (um) isn't called.
> > > > > and the my original attempt (__userspace_relay_signal) is what I tried.
> > > > > 
> > > > > I agree that it is lazy to call a dummy syscall (indeed, getpid).
> > > > > I'm trying to introduce another routine to jump into userspace and
> > > > > call (um) rt_sigreturn after (host) rt_sigreturn.
> > > > > 
> > > > > > And this is really confusing me. The way I am reading it, the code
> > > > > > tries to do:
> > > > > >    1. Rewrite RIP to jump to __userspace_relay_signal
> > > > > >    2. Trigger a getpid syscall (to do "nothing"?)
> > > > > >    3. Let do_syscall_64 fire the signal from interrupt_end
> > > > > 
> > > > > correct.
> > > > > 
> > > > > > However, then that really confuses me, because:
> > > > > >  * If I am reading it correctly, then this approach will destroy the
> > > > > >    contents of various registers (RIP, RAX and likely more)
> > > > > >  * This would result in an incorrect mcontext in the userspace signal
> > > > > >    handler (which could be relevant if userspace is inspecting it)
> > > > > >  * However, worst, rt_sigreturn will eventually jump back
> > > > > >    into__userspace_relay_signal, which has nothing to return to.
> > > > > >  * Also, relay_signal doesn't use this? What happens for a SIGFPE, how
> > > > > >    is userspace interrupted immediately in that case?
> > > > > 
> > > > > relay_signal shares the same goal of this, indeed.
> > > > > but the issue with `mc->gregs[REG_RIP]` (endless signals) still exists
> > > > > I guess.
> > > > 
> > > > Well, endless signals only exist as long as you exit to the same
> > > > location. My suggestion was to read the user state from the mcontext
> > > > (as SECCOMP mode does it) and executing the signal right away, i.e.:
> > > 
> > > thanks too;  below is my understanding.
> > > 
> > > >  * Fetch the current registers from the mcontext
> > > 
> > > I guess this is already done in sig_handler_common().
> > 
> > Well, not really?
> > 
> > It does seem to fetch the general purpose registers. But the code
> > pretty much assumes we will return to the same location and only stores
> > them on the stack for the signal handler itself. Also, remember that it
> > might be userspace or kernel space in your case. The kernel task
> > registers are in "switch_buf" while the userspace registers are in
> > "regs" of "struct task_struct" (effectively "struct uml_pt_regs").
> 
> indeed, the handler returns to the same location.
> here is what the current patchset does for the signal handling.
> 
> # sorry i might be writing same things several times, but I hope
>   this will help to understand/discuss what it should be.
> 
> receive signal (from host)
> - > call host sa_handler (hard_handler)
>  - > sig_handler_common => get_regs_from_mc (fetch host mcontext to um)
>  - > set TIF_SIGPENDING (um kernel)
>  - > set host mcontext[RIP] to __userspace_relay_signal
> (host sa_handler ends)
> - call host sa_restorer => return to mcontext[RIP]
>  - > call __userspace_relay_signal from mcontext[RIP]
>  - > call interrupt_end()
>  - > do_signal => handle_signal => setup_signal_stack_si
>      (because TIF_SIGPENDING is on above)
>  - > call userspace sa_handler
>  - > call userspace sa_restorer
> 
> instead of set mcontext[RIP] to userspace sa_handler, it uses
> __userspace_relay_signal, which configures stack and mcontext (via
> interrupt_end, setup_signal_stack_si, etc) and call userspace
> sa_handler/restorer after that.
> 
> in this way, programs runs userspace sa_handler not in the host
> sa_handler context.  I guess this means we don't have to configure
> host register/mcontext with the userspace one ?
> 
> I agree that the current __userspace_relay_signal can be shrunk not
> to call __kernel_vsyscall and focus on interrupt_end and stack
> preparation.
> 
> > > >  * Push the signal context onto the userspace stack
> > > 
> > > (guess) this is already done on handle_signal() => setup_signal_stack_si().
> > > 
> > > >  * Modify the host mcontext to set registers for the signal handler
> > > 
> > > this is something which I'm not well understanding.
> > > - do you mean the host handler when you say "for the signal handler" ?
> > >   or the userspace handler ?
> > 
> > Both in a way ;-)
> > 
> > I mean modify the registers in the host mcontext so that the UML
> > userspace will continue executing inside its signal handler.
> >
> > > - if former (the host one), maybe mcontext is already there so, it
> > >   might not be the one you mentioned.
> > > - if the latter, how the original handler (the host one,
> > >   hard_handler()) works ? even if we can call userspace handler
> > >   instead of the host one, we need to call the host handler (and
> > >   restorer).  do we call both ?
> > > - and by "to set registers", what register do you mean ? for the
> > >   registers inspected by userspace signal handler ?  but if you set a
> > >   register, for instance RIP, as the fault location to the host
> > >   register, it will return to RIP after handler and restart the fault
> > >   again ?
> > 
> > I am confused, why would the fault handler be restarted? If you modify
> > RIP, then the host kernel will not return to the faulting location. You
> > were using that already to jump into __userspace_relay_signal. All I am
> > arguing that instead of jumping to __userspace_relay_signal you can
> > prepare everything and directly jump into the users signal handler.
> 
> what I meant in that example is; set host mcontext[RIP] to the fault
> location, as a userspace information, which will lead to the fault
> again.  But this doesn't change RIP before and after so, I guess this
> isn't a good example..
> Sorry for the confusion.
> 
> > > >  * Jump back to userspace by doing a "return"
> > > 
> > > this is still also unclear to me.
> > > 
> > > it would be very helpful if you point the location of the code (at
> > > uml/next tree) on how SECCOMP mode does.  I'm also looking at but
> > > really hard to map what you described and the code (sorry).
> > 
> > "stub_signal_interrupt" simply returns, which means it jumps into the
> > restorer "stub_signal_restorer" which does the rt_sigreturn syscall.
> > This means the host kernel restores the userspace state from the
> > mcontext. As the mcontext resides in shared memory, the UML kernel can
> > update it to specify where the process should continue running (thread
> > switching, signals, syscall return value, …).
> 
> thanks !
> 
> so, stub_signal_interrupt runs on a different host process.
> nommu mode tries to reuse existing host sa_handler (hard_handler) to
> do the job (handle SEGV etc).
> 
> If there are something missing on hard_handler and co on nommmu mode
> for what userspace_tramp does on seccomp mode, I've been trying to
> update it.
> 
> -- Hajime
> 
> > 
> > Benjamin
> > 
> > > all of above runs within hard_handler() in nommu mode on SIGSEGV.
> > > my best guess is this is different from what ptrace/seccomp do.
> > > 
> > > > Said differently, I really prefer deferring as much logic as possible
> > > > to the host. This is both safer and easier to understand. Plus, it also
> > > > has the advantage of making it simpler to port UML to other
> > > > architectures.
> > > 
> > > okay.
> > > 
> > > > 
> > > > > > Honestly, I really think we should take a step back and swap the
> > > > > > current syscall entry/exit code. That would likely also simplify
> > > > > > floating point register handling, which I think is currently
> > > > > > insufficient do deal with the odd special cases caused by different
> > > > > > x86_64 hardware extensions.
> > > > > > 
> > > > > > Basically, I think nommu mode should use the same general approach as
> > > > > > the current SECCOMP mode. Which is to use rt_sigreturn to jump into
> > > > > > userspace and let the host kernel deal with the ugly details of how to
> > > > > > do that.
> > > > > 
> > > > > I looked at how MMU mode (ptrace/seccomp) does handle this case.
> > > > > 
> > > > > In nommu mode, we don't have external process to catch signals so, the
> > > > > nommu mode uses hard_handler() to catch SEGV/FPE of userspace
> > > > > programs.  While mmu mode calls segv_handler not in a context of
> > > > > signal handler.
> > > > > 
> > > > > # correct me if I'm wrong.
> > > > > 
> > > > > thus, mmu mode doesn't have this situation.
> > > > 
> > > > Yes, it does not have this specific issue. But see the top of the mail
> > > > for other issues that are somewhat related.
> > > > 
> > > > > I'm attempting various ways; calling um's rt_sigreturn instead of
> > > > > host's one, which doesn't work as host restore procedures (unblocking
> > > > > masked signals, restoring register states, etc) aren't called.
> > > > > 
> > > > > I'll update here if I found a good direction, but would be great if
> > > > > you see how it should be handled.
> > > > 
> > > > Can we please discuss possible solutions? We can figure out the details
> > > > once it is clear how the interaction with the host should work.
> > > 
> > > I was wishing to update to you that I'm working on it.  So, your
> > > comments are always helpful to me.  Thanks.
> > > 
> > > -- Hajime
> > > 
> > > > I still think that the idea of using the kernel task stack as the
> > > > signal stack is really elegant. Actually, doing that in normal UML may
> > > > be how we can fix the issues mentioned at the top of my mail. And for
> > > > nommu, we can also use the host mcontext to jump back into userspace
> > > > using a simple "return".
> > > > 
> > > > Conceptually it seems so simple.
> > > > 
> > > > Benjamin
> > > > 
> > > > 
> > > > > 
> > > > > -- Hajime
> > > > > 
> > > > > > I believe that this requires a second "userspace" sigaltstack in
> > > > > > addition to the current "IRQ" sigaltstack. Then switching in between
> > > > > > the two (note that the "userspace" one is also used for IRQs if those
> > > > > > happen while userspace is executing).
> > > > > > 
> > > > > > So, in principle I would think something like:
> > > > > >  * to jump into userspace, you would:
> > > > > >     - block all signals
> > > > > >     - set "userspace" sigaltstack
> > > > > >     - setup mcontext for rt_sigreturn
> > > > > >     - setup RSP for rt_sigreturn
> > > > > >     - call rt_sigreturn syscall
> > > > > >  * all signal handlers can (except pure IRQs):
> > > > > >     - check on which stack they are
> > > > > >       -> easy to detect whether we are in kernel mode
> > > > > >     - for IRQs one can probably handle them directly (and return)
> > > > > >     - in user mode:
> > > > > >        + store mcontext location and information needed for rt_sigreturn
> > > > > >        + jump back into kernel task stack
> > > > > >  * kernel task handler to continue would:
> > > > > >     - set sigaltstack to IRQ stack
> > > > > >     - fetch register from mcontext
> > > > > >     - unblock all signals
> > > > > >     - handle syscall/signal in whatever way needed
> > > > > > 
> > > > > > Now that I wrote about it, I am thinking that it might be possible to
> > > > > > just use the kernel task stack for the signal stack. One would probably
> > > > > > need to increase the kernel stack size a bit, but it would also mean
> > > > > > that no special code is needed for "rt_sigreturn" handling. The rest
> > > > > > would remain the same.
> > > > > > 
> > > > > > Thoughts?
> > > > > > 
> > > > > > Benjamin
> > > > > > 
> > > > > > > [SNIP]
> > > > > > 
> > > > > 
> > > > 
> > > 
> >