[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <412c514334ac14a992cab3e7b86170b96d60be1c.camel@sipsolutions.net>
Date: Thu, 19 Jun 2025 12:40:49 +0200
From: Benjamin Berg <benjamin@...solutions.net>
To: Hajime Tazaki <thehajime@...il.com>, linux-um@...ts.infradead.org
Cc: ricarkol@...gle.com, Liam.Howlett@...cle.com,
linux-kernel@...r.kernel.org
Subject: Re: [PATCH v9 07/13] um: nommu: configure fs register on host
syscall invocation
On Thu, 2025-06-19 at 10:04 +0900, Hajime Tazaki wrote:
> As userspace on UML/!MMU also needs to configure the %fs register while it
> is running in order to correctly access its thread structure, host syscalls
> implemented in os-Linux drivers may get confused when they are called. Thus
> the %fs register has to be configured via arch_prctl(ARCH_SET_FS) on every
> host syscall.
Really, I still think that we should "just" get rid of libc entirely
inside UML. That would avoid so many weird/potential issues …
That doesn't change the fact that FS/GS need to be restored when doing
thread switches and such, though one might then be able to do it entirely
within arch_switch_to.
Benjamin
>
> Signed-off-by: Hajime Tazaki <thehajime@...il.com>
> Signed-off-by: Ricardo Koller <ricarkol@...gle.com>
> ---
> arch/um/include/shared/os.h | 6 +++
> arch/um/os-Linux/process.c | 6 +++
> arch/um/os-Linux/start_up.c | 21 +++++++++
> arch/x86/um/nommu/do_syscall_64.c | 37 ++++++++++++++++
> arch/x86/um/nommu/syscalls_64.c | 71 +++++++++++++++++++++++++++++++
> 5 files changed, 141 insertions(+)
>
> diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
> index 1251f08e26d0..7c6a8bc0447c 100644
> --- a/arch/um/include/shared/os.h
> +++ b/arch/um/include/shared/os.h
> @@ -189,6 +189,7 @@ extern void check_host_supports_tls(int *supports_tls, int *tls_min);
> extern void get_host_cpu_features(
> void (*flags_helper_func)(char *line),
> void (*cache_helper_func)(char *line));
> +extern int host_has_fsgsbase;
>
> /* mem.c */
> extern int create_mem_file(unsigned long long len);
> @@ -213,6 +214,11 @@ extern int os_protect_memory(void *addr, unsigned long len,
> extern int os_unmap_memory(void *addr, int len);
> extern int os_drop_memory(void *addr, int length);
> extern int can_drop_memory(void);
> +extern int os_arch_prctl(int pid, int option, unsigned long *arg);
> +#ifndef CONFIG_MMU
> +extern long long host_fs;
> +#endif
> +
>
> void os_set_pdeathsig(void);
>
> diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
> index 8a1ab59a089f..3a6d34ccd12b 100644
> --- a/arch/um/os-Linux/process.c
> +++ b/arch/um/os-Linux/process.c
> @@ -16,6 +16,7 @@
> #include <sys/prctl.h>
> #include <sys/wait.h>
> #include <asm/unistd.h>
> +#include <sys/syscall.h> /* For SYS_xxx definitions */
> #include <linux/threads.h>
> #include <init.h>
> #include <longjmp.h>
> @@ -178,6 +179,11 @@ int __init can_drop_memory(void)
> return ok;
> }
>
> +int os_arch_prctl(int pid, int option, unsigned long *arg2)
> +{
> + return syscall(SYS_arch_prctl, option, arg2);
> +}
> +
> void init_new_thread_signals(void)
> {
> set_handler(SIGSEGV);
> diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
> index 4e1f05360c49..55dd92bd2a0b 100644
> --- a/arch/um/os-Linux/start_up.c
> +++ b/arch/um/os-Linux/start_up.c
> @@ -20,6 +20,8 @@
> #include <sys/resource.h>
> #include <asm/ldt.h>
> #include <asm/unistd.h>
> +#include <sys/auxv.h>
> +#include <asm/hwcap2.h>
> #include <init.h>
> #include <os.h>
> #include <kern_util.h>
> @@ -36,6 +38,8 @@
> #include <skas.h>
> #include "internal.h"
>
> +int host_has_fsgsbase;
> +
> static void ptrace_child(void)
> {
> int ret;
> @@ -459,6 +463,20 @@ __uml_setup("seccomp=", uml_seccomp_config,
> " This is insecure and should only be used with a trusted userspace\n\n"
> );
>
> +static void __init check_fsgsbase(void)
> +{
> + unsigned long auxv = getauxval(AT_HWCAP2);
> +
> + os_info("Checking FSGSBASE instructions...");
> + if (auxv & HWCAP2_FSGSBASE) {
> + host_has_fsgsbase = 1;
> + os_info("OK\n");
> + } else {
> + host_has_fsgsbase = 0;
> + os_info("disabled\n");
> + }
> +}
> +
> void __init os_early_checks(void)
> {
> int pid;
> @@ -484,6 +502,9 @@ void __init os_early_checks(void)
> using_seccomp = 0;
> check_ptrace();
>
> + /* probe fsgsbase instruction */
> + check_fsgsbase();
> +
> pid = start_ptraced_child();
> if (init_pid_registers(pid))
> fatal("Failed to initialize default registers");
> diff --git a/arch/x86/um/nommu/do_syscall_64.c b/arch/x86/um/nommu/do_syscall_64.c
> index 5d0fa83e7fdc..796beb0089fc 100644
> --- a/arch/x86/um/nommu/do_syscall_64.c
> +++ b/arch/x86/um/nommu/do_syscall_64.c
> @@ -2,10 +2,38 @@
>
> #include <linux/kernel.h>
> #include <linux/ptrace.h>
> +#include <asm/fsgsbase.h>
> +#include <asm/prctl.h>
> #include <kern_util.h>
> #include <sysdep/syscalls.h>
> #include <os.h>
>
> +static int os_x86_arch_prctl(int pid, int option, unsigned long *arg2)
> +{
> + if (!host_has_fsgsbase)
> + return os_arch_prctl(pid, option, arg2);
> +
> + switch (option) {
> + case ARCH_SET_FS:
> + wrfsbase(*arg2);
> + break;
> + case ARCH_SET_GS:
> + wrgsbase(*arg2);
> + break;
> + case ARCH_GET_FS:
> + *arg2 = rdfsbase();
> + break;
> + case ARCH_GET_GS:
> + *arg2 = rdgsbase();
> + break;
> + default:
> + pr_warn("%s: unsupported option: 0x%x", __func__, option);
> + break;
> + }
> +
> + return 0;
> +}
> +
> __visible void do_syscall_64(struct pt_regs *regs)
> {
> int syscall;
> @@ -17,6 +45,9 @@ __visible void do_syscall_64(struct pt_regs *regs)
> syscall, (unsigned long)current,
> (unsigned long)sys_call_table[syscall]);
>
> + /* set fs register to the original host one */
> + os_x86_arch_prctl(0, ARCH_SET_FS, (void *)host_fs);
> +
> if (likely(syscall < NR_syscalls)) {
> PT_REGS_SET_SYSCALL_RETURN(regs,
> EXECUTE_SYSCALL(syscall, regs));
> @@ -34,4 +65,10 @@ __visible void do_syscall_64(struct pt_regs *regs)
> /* force do_signal() --> is_syscall() */
> set_thread_flag(TIF_SIGPENDING);
> interrupt_end();
> +
> + /* restore back fs register to userspace configured one */
> + os_x86_arch_prctl(0, ARCH_SET_FS,
> + (void *)(current->thread.regs.regs.gp[FS_BASE
> + / sizeof(unsigned long)]));
> +
> }
> diff --git a/arch/x86/um/nommu/syscalls_64.c b/arch/x86/um/nommu/syscalls_64.c
> index c78c442aed1d..5bb6d55b4bb5 100644
> --- a/arch/x86/um/nommu/syscalls_64.c
> +++ b/arch/x86/um/nommu/syscalls_64.c
> @@ -13,8 +13,70 @@
> #include <asm/prctl.h> /* XXX This should get the constants from libc */
> #include <registers.h>
> #include <os.h>
> +#include <asm/thread_info.h>
> +#include <asm/mman.h>
> #include "syscalls.h"
>
> +/*
> + * The guest libc can change FS, which confuses the host libc.
> + * In fact, changing FS directly is not supported (check
> + * man arch_prctl). So, whenever we make a host syscall,
> + * we should be changing FS to the original FS (not the
> + * one set by the guest libc). This original FS is stored
> + * in host_fs.
> + */
> +long long host_fs = -1;
> +
> +long arch_prctl(struct task_struct *task, int option,
> + unsigned long __user *arg2)
> +{
> + long ret = -EINVAL;
> + unsigned long *ptr = arg2, tmp;
> +
> + switch (option) {
> + case ARCH_SET_FS:
> + if (host_fs == -1)
> + os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs);
> + ret = 0;
> + break;
> + case ARCH_SET_GS:
> + ret = 0;
> + break;
> + case ARCH_GET_FS:
> + case ARCH_GET_GS:
> + ptr = &tmp;
> + break;
> + }
> +
> + ret = os_arch_prctl(0, option, ptr);
> + if (ret)
> + return ret;
> +
> + switch (option) {
> + case ARCH_SET_FS:
> + current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)] =
> + (unsigned long) arg2;
> + break;
> + case ARCH_SET_GS:
> + current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)] =
> + (unsigned long) arg2;
> + break;
> + case ARCH_GET_FS:
> + ret = put_user(current->thread.regs.regs.gp[FS_BASE / sizeof(unsigned long)], arg2);
> + break;
> + case ARCH_GET_GS:
> + ret = put_user(current->thread.regs.regs.gp[GS_BASE / sizeof(unsigned long)], arg2);
> + break;
> + }
> +
> + return ret;
> +}
> +
> +SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
> +{
> + return arch_prctl(current, option, (unsigned long __user *) arg2);
> +}
> +
> void arch_switch_to(struct task_struct *to)
> {
> /*
> @@ -42,3 +104,12 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
>
> return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
> }
> +
> +static int __init um_nommu_setup_hostfs(void)
> +{
> + /* initialize the host_fs value at boottime */
> + os_arch_prctl(0, ARCH_GET_FS, (void *)&host_fs);
> +
> + return 0;
> +}
> +arch_initcall(um_nommu_setup_hostfs);
Powered by blists - more mailing lists