[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210510185316.3307264-8-hpa@zytor.com>
Date: Mon, 10 May 2021 11:53:16 -0700
From: "H. Peter Anvin" <hpa@...or.com>
To: Ingo Molnar <mingo@...hat.com>,
Thomas Gleixner <tglx@...utronix.de>,
Borislav Petkov <bp@...en8.de>,
Andy Lutomirski <luto@...nel.org>
Cc: "H. Peter Anvin" <hpa@...or.com>,
Linux Kernel Mailing List <linux-kernel@...r.kernel.org>
Subject: [RFC v2 PATCH 7/7] x86/entry: use int for syscall number; handle all invalid syscall nrs
From: "H. Peter Anvin (Intel)" <hpa@...or.com>
Redefine the system call number consistently to be "int". The value -1
is a non-system call (which can be poked in by ptrace/seccomp to
indicate that no further processing should be done and that the return
value should be the current value in regs->ax, default to -ENOSYS; any
other value which does not correspond to a valid system call
unconditionally calls sys_ni_syscall() and returns -ENOSYS just like
any system call that corresponds to a hole in the system call table.
This is the defined semantics of syscall_get_nr(), so that is what all
the architecture-independent code already expects. As documented in
<asm-generic/syscall.h> (which is simply the documentation file for
<asm/syscall.h>):
/**
* syscall_get_nr - find what system call a task is executing
* @task: task of interest, must be blocked
* @regs: task_pt_regs() of @task
*
* If @task is executing a system call or is at system call
* tracing about to attempt one, returns the system call number.
* If @task is not executing a system call, i.e. it's blocked
* inside the kernel for a fault or signal, returns -1.
*
* Note this returns int even on 64-bit machines. Only 32 bits of
* system call number can be meaningful. If the actual arch value
* is 64 bits, this truncates to 32 bits so 0xffffffff means -1.
*
* It's only valid to call this when @task is known to be blocked.
*/
int syscall_get_nr(struct task_struct *task, struct pt_regs *regs);
Signed-off-by: H. Peter Anvin (Intel) <hpa@...or.com>
---
arch/x86/entry/common.c | 79 +++++++++++++++++++++++-----------
arch/x86/entry/entry_64.S | 2 +-
arch/x86/include/asm/syscall.h | 2 +-
3 files changed, 55 insertions(+), 28 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 00da0f5420de..bf1ccaf101d7 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -36,61 +36,89 @@
#include <asm/irq_stack.h>
#ifdef CONFIG_X86_64
-__visible noinstr void do_syscall_64(struct pt_regs *regs, unsigned long nr)
+
+static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
+{
+ unsigned long unr = nr;
+
+ if (likely(unr < NR_syscalls)) {
+ unr = array_index_nospec(unr, NR_syscalls);
+ regs->ax = sys_call_table[unr](regs);
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
+{
+ unsigned long xnr = nr;
+
+ xnr -= __X32_SYSCALL_BIT;
+
+ if (IS_ENABLED(CONFIG_X86_X32_ABI) &&
+ likely(xnr < X32_NR_syscalls)) {
+ xnr = array_index_nospec(xnr, X32_NR_syscalls);
+ regs->ax = x32_sys_call_table[xnr](regs);
+ return true;
+ }
+ return false;
+}
+
+__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
- if (likely(nr < NR_syscalls)) {
- nr = array_index_nospec(nr, NR_syscalls);
- regs->ax = sys_call_table[nr](regs);
-#ifdef CONFIG_X86_X32_ABI
- } else if (likely((nr & __X32_SYSCALL_BIT) &&
- (nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
- nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
- X32_NR_syscalls);
- regs->ax = x32_sys_call_table[nr](regs);
-#endif
+
+ if (!do_syscall_x64(regs, nr) &&
+ !do_syscall_x32(regs, nr) &&
+ nr != -1) {
+ /* Invalid system call, but still a system call? */
+ regs->ax = __x64_sys_ni_syscall(regs);
}
+
instrumentation_end();
syscall_exit_to_user_mode(regs);
}
#endif
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
-static __always_inline unsigned int syscall_32_enter(struct pt_regs *regs)
+static __always_inline int syscall_32_enter(struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_IA32_EMULATION))
current_thread_info()->status |= TS_COMPAT;
- return (unsigned int)regs->orig_ax;
+ return (int)regs->orig_ax;
}
/*
* Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
*/
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs,
- unsigned int nr)
+static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs, int nr)
{
- if (likely(nr < IA32_NR_syscalls)) {
- nr = array_index_nospec(nr, IA32_NR_syscalls);
- regs->ax = ia32_sys_call_table[nr](regs);
+ unsigned long unr = nr;
+
+ if (likely(unr < IA32_NR_syscalls)) {
+ unr = array_index_nospec(unr, IA32_NR_syscalls);
+ regs->ax = ia32_sys_call_table[unr](regs);
+ } else if (nr != -1) {
+ regs->ax = __ia32_sys_ni_syscall(regs);
}
}
/* Handles int $0x80 */
__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
- unsigned int nr = syscall_32_enter(regs);
+ int nr = syscall_32_enter(regs);
add_random_kstack_offset();
/*
- * Subtlety here: if ptrace pokes something larger than 2^32-1 into
- * orig_ax, the unsigned int return value truncates it. This may
- * or may not be necessary, but it matches the old asm behavior.
+ * Subtlety here: if ptrace pokes something larger than 2^31-1 into
+ * orig_ax, the int return value truncates it. This matches
+ * the semantics of syscall_get_nr().
*/
- nr = (unsigned int)syscall_enter_from_user_mode(regs, nr);
+ nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
do_syscall_32_irqs_on(regs, nr);
@@ -101,7 +129,7 @@ __visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
{
- unsigned int nr = syscall_32_enter(regs);
+ int nr = syscall_32_enter(regs);
int res;
add_random_kstack_offset();
@@ -136,8 +164,7 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
return false;
}
- /* The case truncates any ptrace induced syscall nr > 2^32 -1 */
- nr = (unsigned int)syscall_enter_from_user_mode_work(regs, nr);
+ nr = syscall_enter_from_user_mode_work(regs, nr);
/* Now this is just like a normal syscall. */
do_syscall_32_irqs_on(regs, nr);
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 1d9db15fdc69..85f04ea0e368 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -108,7 +108,7 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
/* IRQs are off. */
movq %rsp, %rdi
- movq %rax, %rsi
+ movslq %eax, %rsi
call do_syscall_64 /* returns with IRQs disabled */
/*
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index f6593cafdbd9..f7e2d82d24fb 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -159,7 +159,7 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}
-void do_syscall_64(struct pt_regs *regs, unsigned long nr);
+void do_syscall_64(struct pt_regs *regs, int nr);
void do_int80_syscall_32(struct pt_regs *regs);
long do_fast_syscall_32(struct pt_regs *regs);
--
2.31.1
Powered by blists - more mailing lists