[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <55AEBF76.4040501@oracle.com>
Date: Tue, 21 Jul 2015 17:53:58 -0400
From: Boris Ostrovsky <boris.ostrovsky@...cle.com>
To: Andy Lutomirski <luto@...nel.org>,
Peter Zijlstra <peterz@...radead.org>,
Steven Rostedt <rostedt@...dmis.org>
CC: "security@...nel.org" <security@...nel.org>,
X86 ML <x86@...nel.org>, Borislav Petkov <bp@...en8.de>,
Sasha Levin <sasha.levin@...cle.com>,
linux-kernel@...r.kernel.org,
Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
stable@...r.kernel.org, Jan Beulich <jbeulich@...e.com>,
Andrew Cooper <andrew.cooper3@...rix.com>,
xen-devel <xen-devel@...ts.xen.org>
Subject: Re: [PATCH v2 1/3] x86/ldt: Make modify_ldt synchronous
On 07/21/2015 03:59 PM, Andy Lutomirski wrote:
> modify_ldt has questionable locking and does not synchronize
> threads. Improve it: redesign the locking and synchronize all
> threads' LDTs using an IPI on all modifications.
>
> This will dramatically slow down modify_ldt in multithreaded
> programs, but there shouldn't be any multithreaded programs that
> care about modify_ldt's performance in the first place.
>
> Cc: stable@...r.kernel.org
> Signed-off-by: Andy Lutomirski <luto@...nel.org>
> ---
> arch/x86/include/asm/desc.h | 15 ---
> arch/x86/include/asm/mmu.h | 3 +-
> arch/x86/include/asm/mmu_context.h | 48 ++++++-
> arch/x86/kernel/cpu/common.c | 4 +-
> arch/x86/kernel/cpu/perf_event.c | 12 +-
> arch/x86/kernel/ldt.c | 247 +++++++++++++++++++------------------
> arch/x86/kernel/process_64.c | 4 +-
> arch/x86/kernel/step.c | 6 +-
> arch/x86/power/cpu.c | 3 +-
> 9 files changed, 192 insertions(+), 150 deletions(-)
>
> diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
> index a0bf89fd2647..4e10d73cf018 100644
> --- a/arch/x86/include/asm/desc.h
> +++ b/arch/x86/include/asm/desc.h
> @@ -280,21 +280,6 @@ static inline void clear_LDT(void)
> set_ldt(NULL, 0);
> }
>
> -/*
> - * load one particular LDT into the current CPU
> - */
> -static inline void load_LDT_nolock(mm_context_t *pc)
> -{
> - set_ldt(pc->ldt, pc->size);
> -}
> -
> -static inline void load_LDT(mm_context_t *pc)
> -{
> - preempt_disable();
> - load_LDT_nolock(pc);
> - preempt_enable();
> -}
> -
> static inline unsigned long get_desc_base(const struct desc_struct *desc)
> {
> return (unsigned)(desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24));
> diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
> index 09b9620a73b4..364d27481a52 100644
> --- a/arch/x86/include/asm/mmu.h
> +++ b/arch/x86/include/asm/mmu.h
> @@ -9,8 +9,7 @@
> * we put the segment information here.
> */
> typedef struct {
> - void *ldt;
> - int size;
> + struct ldt_struct *ldt;
>
> #ifdef CONFIG_X86_64
> /* True if mm supports a task running in 32 bit compatibility mode. */
> diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
> index 5e8daee7c5c9..1ff121fbf366 100644
> --- a/arch/x86/include/asm/mmu_context.h
> +++ b/arch/x86/include/asm/mmu_context.h
> @@ -34,6 +34,44 @@ static inline void load_mm_cr4(struct mm_struct *mm) {}
> #endif
>
> /*
> + * ldt_structs can be allocated, used, and freed, but they are never
> + * modified while live.
> + */
> +struct ldt_struct {
> + int size;
> + int __pad; /* keep the descriptors naturally aligned. */
> + struct desc_struct entries[];
> +};
This breaks Xen which expects LDT to be page-aligned. Not sure why.
Jan, Andrew?
-boris
> +
> +static inline void load_mm_ldt(struct mm_struct *mm)
> +{
> + struct ldt_struct *ldt;
> + DEBUG_LOCKS_WARN_ON(!irqs_disabled());
> +
> + /* lockless_dereference synchronizes with smp_store_release */
> + ldt = lockless_dereference(mm->context.ldt);
> +
> + /*
> + * Any change to mm->context.ldt is followed by an IPI to all
> + * CPUs with the mm active. The LDT will not be freed until
> + * after the IPI is handled by all such CPUs. This means that,
> + * if the ldt_struct changes before we return, the values we see
> + * will be safe, and the new values will be loaded before we run
> + * any user code.
> + *
> + * NB: don't try to convert this to use RCU without extreme care.
> + * We would still need IRQs off, because we don't want to change
> + * the local LDT after an IPI loaded a newer value than the one
> + * that we can see.
> + */
> +
> + if (unlikely(ldt))
> + set_ldt(ldt->entries, ldt->size);
> + else
> + clear_LDT();
> +}
> +
> +/*
> * Used for LDT copy/destruction.
> */
> int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
> @@ -78,12 +116,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> * was called and then modify_ldt changed
> * prev->context.ldt but suppressed an IPI to this CPU.
> * In this case, prev->context.ldt != NULL, because we
> - * never free an LDT while the mm still exists. That
> - * means that next->context.ldt != prev->context.ldt,
> - * because mms never share an LDT.
> + * never set context.ldt to NULL while the mm still
> + * exists. That means that next->context.ldt !=
> + * prev->context.ldt, because mms never share an LDT.
> */
> if (unlikely(prev->context.ldt != next->context.ldt))
> - load_LDT_nolock(&next->context);
> + load_mm_ldt(next);
> }
> #ifdef CONFIG_SMP
> else {
> @@ -106,7 +144,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
> load_cr3(next->pgd);
> trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
> load_mm_cr4(next);
> - load_LDT_nolock(&next->context);
> + load_mm_ldt(next);
> }
> }
> #endif
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 922c5e0cea4c..cb9e5df42dd2 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -1410,7 +1410,7 @@ void cpu_init(void)
> load_sp0(t, ¤t->thread);
> set_tss_desc(cpu, t);
> load_TR_desc();
> - load_LDT(&init_mm.context);
> + load_mm_ldt(&init_mm);
>
> clear_all_debug_regs();
> dbg_restore_debug_regs();
> @@ -1459,7 +1459,7 @@ void cpu_init(void)
> load_sp0(t, thread);
> set_tss_desc(cpu, t);
> load_TR_desc();
> - load_LDT(&init_mm.context);
> + load_mm_ldt(&init_mm);
>
> t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
>
> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
> index 3658de47900f..9469dfa55607 100644
> --- a/arch/x86/kernel/cpu/perf_event.c
> +++ b/arch/x86/kernel/cpu/perf_event.c
> @@ -2179,21 +2179,25 @@ static unsigned long get_segment_base(unsigned int segment)
> int idx = segment >> 3;
>
> if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
> + struct ldt_struct *ldt;
> +
> if (idx > LDT_ENTRIES)
> return 0;
>
> - if (idx > current->active_mm->context.size)
> + /* IRQs are off, so this synchronizes with smp_store_release */
> + ldt = lockless_dereference(current->active_mm->context.ldt);
> + if (!ldt || idx > ldt->size)
> return 0;
>
> - desc = current->active_mm->context.ldt;
> + desc = &ldt->entries[idx];
> } else {
> if (idx > GDT_ENTRIES)
> return 0;
>
> - desc = raw_cpu_ptr(gdt_page.gdt);
> + desc = raw_cpu_ptr(gdt_page.gdt) + idx;
> }
>
> - return get_desc_base(desc + idx);
> + return get_desc_base(desc);
> }
>
> #ifdef CONFIG_COMPAT
> diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
> index c37886d759cc..d65e6ec90338 100644
> --- a/arch/x86/kernel/ldt.c
> +++ b/arch/x86/kernel/ldt.c
> @@ -20,82 +20,70 @@
> #include <asm/mmu_context.h>
> #include <asm/syscalls.h>
>
> -#ifdef CONFIG_SMP
> static void flush_ldt(void *current_mm)
> {
> - if (current->active_mm == current_mm)
> - load_LDT(¤t->active_mm->context);
> + if (current->active_mm == current_mm) {
> + /* context.lock is held for us, so we don't need any locking. */
> + mm_context_t *pc = ¤t->active_mm->context;
> + set_ldt(pc->ldt->entries, pc->ldt->size);
> + }
> }
> -#endif
>
> -static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
> +static struct ldt_struct *alloc_ldt_struct(int size)
> {
> - void *oldldt, *newldt;
> - int oldsize;
> + struct ldt_struct *new_ldt;
> + int alloc_size;
>
> - if (mincount <= pc->size)
> - return 0;
> - oldsize = pc->size;
> - mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
> - (~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
> - if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
> - newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
> + if (size > LDT_ENTRIES)
> + return NULL;
> +
> + BUILD_BUG_ON(LDT_ENTRY_SIZE != sizeof(struct desc_struct));
> + alloc_size = sizeof(struct ldt_struct) + size * LDT_ENTRY_SIZE;
> +
> + if (alloc_size > PAGE_SIZE)
> + new_ldt = vmalloc(alloc_size);
> else
> - newldt = (void *)__get_free_page(GFP_KERNEL);
> + new_ldt = (void *)__get_free_page(GFP_KERNEL);
> + if (!new_ldt)
> + return NULL;
>
> - if (!newldt)
> - return -ENOMEM;
> + new_ldt->size = size;
> + paravirt_alloc_ldt(new_ldt->entries, size);
> + return new_ldt;
> +}
>
> - if (oldsize)
> - memcpy(newldt, pc->ldt, oldsize * LDT_ENTRY_SIZE);
> - oldldt = pc->ldt;
> - memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
> - (mincount - oldsize) * LDT_ENTRY_SIZE);
> +static void install_ldt(struct mm_struct *current_mm,
> + struct ldt_struct *ldt)
> +{
> + /* context.lock is held */
> + preempt_disable();
>
> - paravirt_alloc_ldt(newldt, mincount);
> + /* Synchronizes with lockless_dereference in load_mm_ldt. */
> + smp_store_release(¤t_mm->context.ldt, ldt);
>
> -#ifdef CONFIG_X86_64
> - /* CHECKME: Do we really need this ? */
> - wmb();
> -#endif
> - pc->ldt = newldt;
> - wmb();
> - pc->size = mincount;
> - wmb();
> + /* Activate for this CPU. */
> + flush_ldt(current->mm);
>
> - if (reload) {
> #ifdef CONFIG_SMP
> - preempt_disable();
> - load_LDT(pc);
> - if (!cpumask_equal(mm_cpumask(current->mm),
> - cpumask_of(smp_processor_id())))
> - smp_call_function(flush_ldt, current->mm, 1);
> - preempt_enable();
> -#else
> - load_LDT(pc);
> + /* Synchronize with other CPUs. */
> + if (!cpumask_equal(mm_cpumask(current_mm),
> + cpumask_of(smp_processor_id())))
> + smp_call_function(flush_ldt, current_mm, 1);
> #endif
> - }
> - if (oldsize) {
> - paravirt_free_ldt(oldldt, oldsize);
> - if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
> - vfree(oldldt);
> - else
> - put_page(virt_to_page(oldldt));
> - }
> - return 0;
> + preempt_enable();
> }
>
> -static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
> +static void free_ldt_struct(struct ldt_struct *ldt)
> {
> - int err = alloc_ldt(new, old->size, 0);
> - int i;
> -
> - if (err < 0)
> - return err;
> -
> - for (i = 0; i < old->size; i++)
> - write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
> - return 0;
> + if (unlikely(ldt)) {
> + int alloc_size = sizeof(struct ldt_struct) +
> + ldt->size * LDT_ENTRY_SIZE;
> + paravirt_free_ldt(ldt->entries, ldt->size);
> + if (alloc_size > PAGE_SIZE)
> + vfree(ldt);
> + else
> + put_page(virt_to_page(ldt));
> + }
> }
>
> /*
> @@ -106,15 +94,35 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
> {
> struct mm_struct *old_mm;
> int retval = 0;
> + int i;
>
> mutex_init(&mm->context.lock);
> - mm->context.size = 0;
> old_mm = current->mm;
> - if (old_mm && old_mm->context.size > 0) {
> - mutex_lock(&old_mm->context.lock);
> - retval = copy_ldt(&mm->context, &old_mm->context);
> - mutex_unlock(&old_mm->context.lock);
> + if (!old_mm) {
> + mm->context.ldt = NULL;
> + return 0;
> }
> +
> + mutex_lock(&old_mm->context.lock);
> + if (old_mm->context.ldt) {
> + struct ldt_struct *new_ldt =
> + alloc_ldt_struct(old_mm->context.ldt->size);
> + if (!new_ldt) {
> + retval = -ENOMEM;
> + goto out_unlock;
> + }
> +
> + for (i = 0; i < old_mm->context.ldt->size; i++)
> + write_ldt_entry(new_ldt->entries, i,
> + &old_mm->context.ldt->entries[i]);
> +
> + mm->context.ldt = new_ldt;
> + } else {
> + mm->context.ldt = NULL;
> + }
> +
> +out_unlock:
> + mutex_unlock(&old_mm->context.lock);
> return retval;
> }
>
> @@ -125,53 +133,47 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm)
> */
> void destroy_context(struct mm_struct *mm)
> {
> - if (mm->context.size) {
> -#ifdef CONFIG_X86_32
> - /* CHECKME: Can this ever happen ? */
> - if (mm == current->active_mm)
> - clear_LDT();
> -#endif
> - paravirt_free_ldt(mm->context.ldt, mm->context.size);
> - if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
> - vfree(mm->context.ldt);
> - else
> - put_page(virt_to_page(mm->context.ldt));
> - mm->context.size = 0;
> - }
> + free_ldt_struct(mm->context.ldt);
> + mm->context.ldt = NULL;
> }
>
> static int read_ldt(void __user *ptr, unsigned long bytecount)
> {
> - int err;
> + int retval;
> unsigned long size;
> struct mm_struct *mm = current->mm;
>
> - if (!mm->context.size)
> - return 0;
> + mutex_lock(&mm->context.lock);
> +
> + if (!mm->context.ldt) {
> + retval = 0;
> + goto out_unlock;
> + }
> +
> if (bytecount > LDT_ENTRY_SIZE * LDT_ENTRIES)
> bytecount = LDT_ENTRY_SIZE * LDT_ENTRIES;
>
> - mutex_lock(&mm->context.lock);
> - size = mm->context.size * LDT_ENTRY_SIZE;
> + size = mm->context.ldt->size * LDT_ENTRY_SIZE;
> if (size > bytecount)
> size = bytecount;
>
> - err = 0;
> - if (copy_to_user(ptr, mm->context.ldt, size))
> - err = -EFAULT;
> - mutex_unlock(&mm->context.lock);
> - if (err < 0)
> - goto error_return;
> + if (copy_to_user(ptr, mm->context.ldt->entries, size)) {
> + retval = -EFAULT;
> + goto out_unlock;
> + }
> +
> if (size != bytecount) {
> - /* zero-fill the rest */
> + /* Zero-fill the rest and pretend we read bytecount bytes. */
> if (clear_user(ptr + size, bytecount - size) != 0) {
> - err = -EFAULT;
> - goto error_return;
> + retval = -EFAULT;
> + goto out_unlock;
> }
> }
> - return bytecount;
> -error_return:
> - return err;
> + retval = bytecount;
> +
> +out_unlock:
> + mutex_unlock(&mm->context.lock);
> + return retval;
> }
>
> static int read_default_ldt(void __user *ptr, unsigned long bytecount)
> @@ -189,12 +191,16 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount)
> return bytecount;
> }
>
> +static struct desc_struct blank_desc;
> +
> static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
> {
> struct mm_struct *mm = current->mm;
> struct desc_struct ldt;
> - int error;
> + int error, i;
> struct user_desc ldt_info;
> + int oldsize, newsize;
> + struct ldt_struct *new_ldt, *old_ldt;
>
> error = -EINVAL;
> if (bytecount != sizeof(ldt_info))
> @@ -213,34 +219,41 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
> goto out;
> }
>
> - mutex_lock(&mm->context.lock);
> - if (ldt_info.entry_number >= mm->context.size) {
> - error = alloc_ldt(¤t->mm->context,
> - ldt_info.entry_number + 1, 1);
> - if (error < 0)
> - goto out_unlock;
> - }
> -
> - /* Allow LDTs to be cleared by the user. */
> - if (ldt_info.base_addr == 0 && ldt_info.limit == 0) {
> - if (oldmode || LDT_empty(&ldt_info)) {
> - memset(&ldt, 0, sizeof(ldt));
> - goto install;
> + if ((oldmode && ldt_info.base_addr == 0 && ldt_info.limit == 0) ||
> + LDT_empty(&ldt_info)) {
> + /* The user wants to clear the entry. */
> + memset(&ldt, 0, sizeof(ldt));
> + } else {
> + if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
> + error = -EINVAL;
> + goto out;
> }
> +
> + fill_ldt(&ldt, &ldt_info);
> + if (oldmode)
> + ldt.avl = 0;
> }
>
> - if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) {
> - error = -EINVAL;
> + mutex_lock(&mm->context.lock);
> +
> + old_ldt = mm->context.ldt;
> + oldsize = old_ldt ? old_ldt->size : 0;
> + newsize = max((int)(ldt_info.entry_number + 1), oldsize);
> +
> + error = -ENOMEM;
> + new_ldt = alloc_ldt_struct(newsize);
> + if (!new_ldt)
> goto out_unlock;
> - }
>
> - fill_ldt(&ldt, &ldt_info);
> - if (oldmode)
> - ldt.avl = 0;
> + for (i = 0; i < oldsize; i++)
> + write_ldt_entry(new_ldt->entries, i,
> + &old_ldt->entries[i]);
> + for (i = oldsize; i < newsize; i++)
> + write_ldt_entry(new_ldt->entries, i, &blank_desc);
> + write_ldt_entry(new_ldt->entries, ldt_info.entry_number, &ldt);
>
> - /* Install the new entry ... */
> -install:
> - write_ldt_entry(mm->context.ldt, ldt_info.entry_number, &ldt);
> + install_ldt(mm, new_ldt);
> + free_ldt_struct(old_ldt);
> error = 0;
>
> out_unlock:
> diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
> index 71d7849a07f7..f6b916387590 100644
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -121,11 +121,11 @@ void __show_regs(struct pt_regs *regs, int all)
> void release_thread(struct task_struct *dead_task)
> {
> if (dead_task->mm) {
> - if (dead_task->mm->context.size) {
> + if (dead_task->mm->context.ldt) {
> pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
> dead_task->comm,
> dead_task->mm->context.ldt,
> - dead_task->mm->context.size);
> + dead_task->mm->context.ldt->size);
> BUG();
> }
> }
> diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
> index 9b4d51d0c0d0..6273324186ac 100644
> --- a/arch/x86/kernel/step.c
> +++ b/arch/x86/kernel/step.c
> @@ -5,6 +5,7 @@
> #include <linux/mm.h>
> #include <linux/ptrace.h>
> #include <asm/desc.h>
> +#include <asm/mmu_context.h>
>
> unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs)
> {
> @@ -30,10 +31,11 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
> seg &= ~7UL;
>
> mutex_lock(&child->mm->context.lock);
> - if (unlikely((seg >> 3) >= child->mm->context.size))
> + if (unlikely(!child->mm->context.ldt ||
> + (seg >> 3) >= child->mm->context.ldt->size))
> addr = -1L; /* bogus selector, access would fault */
> else {
> - desc = child->mm->context.ldt + seg;
> + desc = &child->mm->context.ldt->entries[seg];
> base = get_desc_base(desc);
>
> /* 16-bit code segment? */
> diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
> index 0d7dd1f5ac36..9ab52791fed5 100644
> --- a/arch/x86/power/cpu.c
> +++ b/arch/x86/power/cpu.c
> @@ -22,6 +22,7 @@
> #include <asm/fpu/internal.h>
> #include <asm/debugreg.h>
> #include <asm/cpu.h>
> +#include <asm/mmu_context.h>
>
> #ifdef CONFIG_X86_32
> __visible unsigned long saved_context_ebx;
> @@ -153,7 +154,7 @@ static void fix_processor_context(void)
> syscall_init(); /* This sets MSR_*STAR and related */
> #endif
> load_TR_desc(); /* This does ltr */
> - load_LDT(¤t->active_mm->context); /* This does lldt */
> + load_mm_ldt(current->active_mm); /* This does lldt */
>
> fpu__resume_cpu();
> }
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists