[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <53A88606.2050108@mit.edu>
Date: Mon, 23 Jun 2014 12:54:46 -0700
From: Andy Lutomirski <luto@...capital.net>
To: Qiaowei Ren <qiaowei.ren@...el.com>,
"H. Peter Anvin" <hpa@...or.com>,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...hat.com>,
Dave Hansen <dave.hansen@...el.com>
CC: x86@...nel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH v6 04/10] x86, mpx: hook #BR exception handler to allocate
bound tables
On 06/18/2014 02:44 AM, Qiaowei Ren wrote:
> This patch handles a #BR exception for non-existent tables by
> carving the space out of the normal processes address space
> (essentially calling mmap() from inside the kernel) and then
> pointing the bounds-directory over to it.
>
> The tables need to be accessed and controlled by userspace
> because the compiler generates instructions for MPX-enabled
> code which frequently store and retrieve entries from the bounds
> tables. Any direct kernel involvement (like a syscall) to access
> the tables would destroy performance since these are so frequent.
>
> The tables are carved out of userspace because we have no better
> spot to put them. For each pointer which is being tracked by MPX,
> the bounds tables contain 4 longs worth of data, and the tables
> are indexed virtually. If we were to preallocate the tables, we
> would theoretically need to allocate 4x the virtual space that
> we have available for userspace somewhere else. We don't have
> that room in the kernel address space.
>
> Signed-off-by: Qiaowei Ren <qiaowei.ren@...el.com>
> ---
> arch/x86/include/asm/mpx.h | 20 ++++++++++++++
> arch/x86/kernel/Makefile | 1 +
> arch/x86/kernel/mpx.c | 63 ++++++++++++++++++++++++++++++++++++++++++++
> arch/x86/kernel/traps.c | 56 ++++++++++++++++++++++++++++++++++++++-
> 4 files changed, 139 insertions(+), 1 deletions(-)
> create mode 100644 arch/x86/kernel/mpx.c
>
> diff --git a/arch/x86/include/asm/mpx.h b/arch/x86/include/asm/mpx.h
> index 5725ac4..b7598ac 100644
> --- a/arch/x86/include/asm/mpx.h
> +++ b/arch/x86/include/asm/mpx.h
> @@ -18,6 +18,8 @@
> #define MPX_BT_ENTRY_SHIFT 5
> #define MPX_IGN_BITS 3
>
> +#define MPX_BD_ENTRY_TAIL 3
> +
> #else
>
> #define MPX_BD_ENTRY_OFFSET 20
> @@ -26,13 +28,31 @@
> #define MPX_BT_ENTRY_SHIFT 4
> #define MPX_IGN_BITS 2
>
> +#define MPX_BD_ENTRY_TAIL 2
> +
> #endif
>
> +#define MPX_BNDSTA_TAIL 2
> +#define MPX_BNDCFG_TAIL 12
> +#define MPX_BNDSTA_ADDR_MASK (~((1UL<<MPX_BNDSTA_TAIL)-1))
> +#define MPX_BNDCFG_ADDR_MASK (~((1UL<<MPX_BNDCFG_TAIL)-1))
> +#define MPX_BT_ADDR_MASK (~((1UL<<MPX_BD_ENTRY_TAIL)-1))
> +
> #define MPX_BD_SIZE_BYTES (1UL<<(MPX_BD_ENTRY_OFFSET+MPX_BD_ENTRY_SHIFT))
> #define MPX_BT_SIZE_BYTES (1UL<<(MPX_BT_ENTRY_OFFSET+MPX_BT_ENTRY_SHIFT))
>
> #define MPX_BNDSTA_ERROR_CODE 0x3
> +#define MPX_BD_ENTRY_VALID_FLAG 0x1
>
> unsigned long mpx_mmap(unsigned long len);
>
> +#ifdef CONFIG_X86_INTEL_MPX
> +int do_mpx_bt_fault(struct xsave_struct *xsave_buf);
> +#else
> +static inline int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
> +{
> + return -EINVAL;
> +}
> +#endif /* CONFIG_X86_INTEL_MPX */
> +
> #endif /* _ASM_X86_MPX_H */
> diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
> index f4d9600..3e81aed 100644
> --- a/arch/x86/kernel/Makefile
> +++ b/arch/x86/kernel/Makefile
> @@ -41,6 +41,7 @@ obj-$(CONFIG_PREEMPT) += preempt.o
>
> obj-y += process.o
> obj-y += i387.o xsave.o
> +obj-$(CONFIG_X86_INTEL_MPX) += mpx.o
> obj-y += ptrace.o
> obj-$(CONFIG_X86_32) += tls.o
> obj-$(CONFIG_IA32_EMULATION) += tls.o
> diff --git a/arch/x86/kernel/mpx.c b/arch/x86/kernel/mpx.c
> new file mode 100644
> index 0000000..4230c7b
> --- /dev/null
> +++ b/arch/x86/kernel/mpx.c
> @@ -0,0 +1,63 @@
> +#include <linux/kernel.h>
> +#include <linux/syscalls.h>
> +#include <asm/mpx.h>
> +
> +static int allocate_bt(long __user *bd_entry)
> +{
> + unsigned long bt_addr, old_val = 0;
> + int ret = 0;
> +
> + bt_addr = mpx_mmap(MPX_BT_SIZE_BYTES);
> + if (IS_ERR((void *)bt_addr)) {
> + pr_err("Bounds table allocation failed at entry addr %p\n",
> + bd_entry);
> + return bt_addr;
> + }
> + bt_addr = (bt_addr & MPX_BT_ADDR_MASK) | MPX_BD_ENTRY_VALID_FLAG;
> +
> + ret = user_atomic_cmpxchg_inatomic(&old_val, bd_entry, 0, bt_addr);
> + if (ret)
> + goto out;
> +
> + /*
> + * there is a existing bounds table pointed at this bounds
> + * directory entry, and so we need to free the bounds table
> + * allocated just now.
> + */
> + if (old_val)
> + goto out;
> +
> + pr_debug("Allocate bounds table %lx at entry %p\n",
> + bt_addr, bd_entry);
> + return 0;
> +
> +out:
> + vm_munmap(bt_addr & MPX_BT_ADDR_MASK, MPX_BT_SIZE_BYTES);
> + return ret;
> +}
> +
> +/*
> + * When a BNDSTX instruction attempts to save bounds to a BD entry
> + * with the lack of the valid bit being set, a #BR is generated.
> + * This is an indication that no BT exists for this entry. In this
> + * case the fault handler will allocate a new BT.
> + *
> + * With 32-bit mode, the size of BD is 4MB, and the size of each
> + * bound table is 16KB. With 64-bit mode, the size of BD is 2GB,
> + * and the size of each bound table is 4MB.
> + */
> +int do_mpx_bt_fault(struct xsave_struct *xsave_buf)
> +{
> + unsigned long status;
> + unsigned long bd_entry, bd_base;
> +
> + bd_base = xsave_buf->bndcsr.cfg_reg_u & MPX_BNDCFG_ADDR_MASK;
> + status = xsave_buf->bndcsr.status_reg;
> +
> + bd_entry = status & MPX_BNDSTA_ADDR_MASK;
> + if ((bd_entry < bd_base) ||
> + (bd_entry >= bd_base + MPX_BD_SIZE_BYTES))
> + return -EINVAL;
> +
> + return allocate_bt((long __user *)bd_entry);
> +}
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index f73b5d4..35b9b29 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -59,6 +59,7 @@
> #include <asm/fixmap.h>
> #include <asm/mach_traps.h>
> #include <asm/alternative.h>
> +#include <asm/mpx.h>
>
> #ifdef CONFIG_X86_64
> #include <asm/x86_init.h>
> @@ -213,7 +214,6 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
>
> DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip )
> DO_ERROR (X86_TRAP_OF, SIGSEGV, "overflow", overflow )
> -DO_ERROR (X86_TRAP_BR, SIGSEGV, "bounds", bounds )
> DO_ERROR_INFO(X86_TRAP_UD, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip )
> DO_ERROR (X86_TRAP_OLD_MF, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun )
> DO_ERROR (X86_TRAP_TS, SIGSEGV, "invalid TSS", invalid_TSS )
> @@ -263,6 +263,60 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
> }
> #endif
>
> +dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
> +{
> + enum ctx_state prev_state;
> + unsigned long status;
> + struct xsave_struct *xsave_buf;
> + struct task_struct *tsk = current;
> +
> + prev_state = exception_enter();
> + if (notify_die(DIE_TRAP, "bounds", regs, error_code,
> + X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
> + goto exit;
> + conditional_sti(regs);
> +
> + if (!user_mode(regs))
> + die("bounds", regs, error_code);
Does this need to be user_mode_vm?
> +
> + if (!cpu_has_mpx) {
> + /* The exception is not from Intel MPX */
> + do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
> + goto exit;
> + }
> +
> + fpu_xsave(&tsk->thread.fpu);
> + xsave_buf = &(tsk->thread.fpu.state->xsave);
> + status = xsave_buf->bndcsr.status_reg;
> +
> + /*
> + * The error code field of the BNDSTATUS register communicates status
> + * information of a bound range exception #BR or operation involving
> + * bound directory.
> + */
> + switch (status & MPX_BNDSTA_ERROR_CODE) {
> + case 2:
> + /*
> + * Bound directory has invalid entry.
> + * No signal will be sent to the user space.
This comment is a lie.
> + */
> + if (do_mpx_bt_fault(xsave_buf))
> + force_sig(SIGBUS, tsk);
Would it make sense to assign and use a new si_code value here?
--Andy
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists