[<prev] [next>] [<thread-prev] [day] [month] [year] [list]
Message-ID: <d1d08a207844b98d0beba96f48beaf73e30c3bd2.camel@sipsolutions.net>
Date: Thu, 19 Jun 2025 12:31:53 +0200
From: Benjamin Berg <benjamin@...solutions.net>
To: Hajime Tazaki <thehajime@...il.com>, linux-um@...ts.infradead.org
Cc: ricarkol@...gle.com, Liam.Howlett@...cle.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v9 04/13] x86/um: nommu: syscall handling
Hi,
On Thu, 2025-06-19 at 10:04 +0900, Hajime Tazaki wrote:
> This commit introduces an entry point of syscall interface for !MMU
> mode. It uses an entry function, __kernel_vsyscall, a kernel-wide global
> symbol accessible from any location.
>
> Although it isn't in the scope of this commit, it can be also exposed
> via vdso image which is directly accessible from userspace. A standard
> library (i.e., libc) can utilize this entry point to implement syscall
> wrapper; we can also use this by hooking syscall for unmodified userspace
> applications/libraries, which will be implemented in the subsequent
> commit.
>
> This only supports 64-bit mode of x86 architecture.
>
> Signed-off-by: Hajime Tazaki <thehajime@...il.com>
> Signed-off-by: Ricardo Koller <ricarkol@...gle.com>
> ---
> arch/x86/um/Makefile | 4 ++
> arch/x86/um/nommu/Makefile | 8 +++
> arch/x86/um/nommu/do_syscall_64.c | 37 ++++++++++
> arch/x86/um/nommu/entry_64.S | 91 +++++++++++++++++++++++++
> arch/x86/um/nommu/syscalls.h | 16 +++++
> arch/x86/um/shared/sysdep/syscalls_64.h | 6 ++
> 6 files changed, 162 insertions(+)
> create mode 100644 arch/x86/um/nommu/Makefile
> create mode 100644 arch/x86/um/nommu/do_syscall_64.c
> create mode 100644 arch/x86/um/nommu/entry_64.S
> create mode 100644 arch/x86/um/nommu/syscalls.h
>
> diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
> index b42c31cd2390..227af2a987e2 100644
> --- a/arch/x86/um/Makefile
> +++ b/arch/x86/um/Makefile
> @@ -32,6 +32,10 @@ obj-y += syscalls_64.o vdso/
> subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o \
> ../lib/memmove_64.o ../lib/memset_64.o
>
> +ifneq ($(CONFIG_MMU),y)
> +obj-y += nommu/
> +endif
> +
> endif
>
> subarch-$(CONFIG_MODULES) += ../kernel/module.o
> diff --git a/arch/x86/um/nommu/Makefile b/arch/x86/um/nommu/Makefile
> new file mode 100644
> index 000000000000..d72c63afffa5
> --- /dev/null
> +++ b/arch/x86/um/nommu/Makefile
> @@ -0,0 +1,8 @@
> +# SPDX-License-Identifier: GPL-2.0
> +ifeq ($(CONFIG_X86_32),y)
> + BITS := 32
> +else
> + BITS := 64
> +endif
> +
> +obj-y = do_syscall_$(BITS).o entry_$(BITS).o
> diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c
> new file mode 100644
> index 000000000000..5d0fa83e7fdc
> --- /dev/null
> +++ b/arch/x86/um/nommu/do_syscall_64.c
> @@ -0,0 +1,37 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/kernel.h>
> +#include <linux/ptrace.h>
> +#include <kern_util.h>
> +#include <sysdep/syscalls.h>
> +#include <os.h>
> +
> +__visible void do_syscall_64(struct pt_regs *regs)
> +{
> + int syscall;
> +
> + syscall = PT_SYSCALL_NR(regs->regs.gp);
> + UPT_SYSCALL_NR(&regs->regs) = syscall;
> +
> + pr_debug("syscall(%d) (current=%lx) (fn=%lx)\n",
> + syscall, (unsigned long)current,
> + (unsigned long)sys_call_table[syscall]);
You probably want to drop the pr_debug from the syscall path.
> + if (likely(syscall < NR_syscalls)) {
> + PT_REGS_SET_SYSCALL_RETURN(regs,
> + EXECUTE_SYSCALL(syscall, regs));
> + }
> +
> + pr_debug("syscall(%d) --> %lx\n", syscall,
> + regs->regs.gp[HOST_AX]);
> +
> + PT_REGS_SYSCALL_RET(regs) = regs->regs.gp[HOST_AX];
> +
> + /* execve succeeded */
> + if (syscall == __NR_execve && regs->regs.gp[HOST_AX] == 0)
> + userspace(&current->thread.regs.regs);
That said, this is what I am stumbling over. Why do you need to jump
into userspace() here? It seems odd to me to need a special case in the
syscall path itself.
Aren't there other possibilities to hook/override the kernel task
state?
> +
> + /* force do_signal() --> is_syscall() */
> + set_thread_flag(TIF_SIGPENDING);
> + interrupt_end();
Same here. The MMU UML code seems to also do this, but restricted to
ptrace'd processes? Maybe I am just missing something obvious …
Benjamin
> +}
> diff --git a/arch/x86/um/nommu/entry_64.S b/arch/x86/um/nommu/entry_64.S
> new file mode 100644
> index 000000000000..e9bfc7b93c84
> --- /dev/null
> +++ b/arch/x86/um/nommu/entry_64.S
> @@ -0,0 +1,91 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#include <asm/errno.h>
> +
> +#include <linux/linkage.h>
> +#include <asm/percpu.h>
> +#include <asm/desc.h>
> +
> +#include "../entry/calling.h"
> +
> +#ifdef CONFIG_SMP
> +#error need to stash these variables somewhere else
> +#endif
> +
> +#define UM_GLOBAL_VAR(x) .data; .align 8; .globl x; x:; .long 0
> +
> +UM_GLOBAL_VAR(current_top_of_stack)
> +UM_GLOBAL_VAR(current_ptregs)
> +
> +.code64
> +.section .entry.text, "ax"
> +
> +.align 8
> +#undef ENTRY
> +#define ENTRY(x) .text; .globl x; .type x,%function; x:
> +#undef END
> +#define END(x) .size x, . - x
> +
> +/*
> + * %rcx has the return address (we set it before entering __kernel_vsyscall).
> + *
> + * Registers on entry:
> + * rax system call number
> + * rcx return address
> + * rdi arg0
> + * rsi arg1
> + * rdx arg2
> + * r10 arg3
> + * r8 arg4
> + * r9 arg5
> + *
> + * (note: we are allowed to mess with r11: r11 is callee-clobbered
> + * register in C ABI)
> + */
> +ENTRY(__kernel_vsyscall)
> +
> + movq %rsp, %r11
> +
> + /* Point rsp to the top of the ptregs array, so we can
> + just fill it with a bunch of push'es. */
> + movq current_ptregs, %rsp
> +
> + /* 8 bytes * 20 registers (plus 8 for the push) */
> + addq $168, %rsp
> +
> + /* Construct struct pt_regs on stack */
> + pushq $0 /* pt_regs->ss (index 20) */
> + pushq %r11 /* pt_regs->sp */
> + pushfq /* pt_regs->flags */
> + pushq $0 /* pt_regs->cs */
> + pushq %rcx /* pt_regs->ip */
> + pushq %rax /* pt_regs->orig_ax */
> +
> + PUSH_AND_CLEAR_REGS rax=$-ENOSYS
> +
> + mov %rsp, %rdi
> +
> + /*
> + * Switch to current top of stack, so "current->" points
> + * to the right task.
> + */
> + movq current_top_of_stack, %rsp
> +
> + call do_syscall_64
> +
> + movq current_ptregs, %rsp
> +
> + POP_REGS
> +
> + addq $8, %rsp /* skip orig_ax */
> + popq %rcx /* pt_regs->ip */
> + addq $8, %rsp /* skip cs */
> + addq $8, %rsp /* skip flags */
> + popq %rsp
> +
> + /*
> + * not return w/ ret but w/ jmp as the stack is already popped before
> + * entering __kernel_vsyscall
> + */
> + jmp *%rcx
> +
> +END(__kernel_vsyscall)
> diff --git a/arch/x86/um/nommu/syscalls.h b/arch/x86/um/nommu/syscalls.h
> new file mode 100644
> index 000000000000..a2433756b1fc
> --- /dev/null
> +++ b/arch/x86/um/nommu/syscalls.h
> @@ -0,0 +1,16 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __UM_NOMMU_SYSCALLS_H
> +#define __UM_NOMMU_SYSCALLS_H
> +
> +
> +#define task_top_of_stack(task) \
> +({ \
> + unsigned long __ptr = (unsigned long)task->stack; \
> + __ptr += THREAD_SIZE; \
> + __ptr; \
> +})
> +
> +extern long current_top_of_stack;
> +extern long current_ptregs;
> +
> +#endif
> diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h
> index b6b997225841..ffd80ee3b9dc 100644
> --- a/arch/x86/um/shared/sysdep/syscalls_64.h
> +++ b/arch/x86/um/shared/sysdep/syscalls_64.h
> @@ -25,4 +25,10 @@ extern syscall_handler_t *sys_call_table[];
> extern syscall_handler_t sys_modify_ldt;
> extern syscall_handler_t sys_arch_prctl;
>
> +#ifndef CONFIG_MMU
> +extern void do_syscall_64(struct pt_regs *regs);
> +extern long __kernel_vsyscall(int64_t a0, int64_t a1, int64_t a2, int64_t a3,
> + int64_t a4, int64_t a5, int64_t a6);
> +#endif
> +
> #endif
Powered by blists - more mailing lists