[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <706E9982-24E3-442B-808A-172909449DD4@zytor.com>
Date: Sat, 26 Sep 2015 12:50:38 -0700
From: "H. Peter Anvin" <hpa@...or.com>
To: Denys Vlasenko <dvlasenk@...hat.com>,
Ingo Molnar <mingo@...nel.org>
CC: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
Boris Ostrovsky <boris.ostrovsky@...cle.com>,
David Vrabel <david.vrabel@...rix.com>,
Joerg Roedel <joro@...tes.org>, Gleb Natapov <gleb@...nel.org>,
Paolo Bonzini <pbonzini@...hat.com>, kvm@...r.kernel.org,
x86@...nel.org, linux-kernel@...r.kernel.org
Subject: Re: [PATCH] x86: Use entire page for the per-cpu GDT only if paravirt-enabled
NAK. We really should map the GDT read-only on all 64 bit systems, since we can't hide the address from SLDT. Same with the IDT.
On September 26, 2015 11:00:40 AM PDT, Denys Vlasenko <dvlasenk@...hat.com> wrote:
>We have our GDT in a page-sized per-cpu structure, gdt_page.
>
>On x86_64 kernel, GDT is 128 bytes - only ~3% of that page is used.
>
>It is page-sized because of paravirt. Hypervisors need to know when
>GDT is changed, so they remap it read-only and handle write faults.
>If it's not in its own page, other writes nearby will cause
>those faults too.
>
>In other words, we need GDT to live in a separate page
>only if CONFIG_HYPERVISOR_GUEST=y.
>
>This patch reduces GDT alignment to cacheline-aligned
>if CONFIG_HYPERVISOR_GUEST is not set.
>
>Patch also renames gdt_page to cpu_gdt (mimicking naming of existing
>cpu_tss per-cpu variable), since now it is not always a full page.
>
> $ objdump -x vmlinux | grep .data..percpu | sort
>Before:
> (offset) (size)
>0000000000000000 w O .data..percpu 0000000000004000 irq_stack_union
>0000000000004000 w O .data..percpu 0000000000005000 exception_stacks
>0000000000009000 w O .data..percpu 0000000000001000 gdt_page <<< HERE
> 000000000000a000 w O .data..percpu 0000000000000008 espfix_waddr
> 000000000000a008 w O .data..percpu 0000000000000008 espfix_stack
> ...
> 0000000000019398 g .data..percpu 0000000000000000 __per_cpu_end
>After:
>0000000000000000 w O .data..percpu 0000000000004000 irq_stack_union
>0000000000004000 w O .data..percpu 0000000000005000 exception_stacks
> 0000000000009000 w O .data..percpu 0000000000000008 espfix_waddr
> 0000000000009008 w O .data..percpu 0000000000000008 espfix_stack
> ...
> 0000000000013c80 w O .data..percpu 0000000000000040 cyc2ns
> 0000000000013cc0 w O .data..percpu 00000000000022c0 cpu_tss
>0000000000015f80 w O .data..percpu 0000000000000080 cpu_gdt <<< HERE
> 0000000000016000 w O .data..percpu 0000000000000018 cpu_tlbstate
> ...
> 0000000000018418 g .data..percpu 0000000000000000 __per_cpu_end
>
>Run-tested on a 144 CPU machine.
>
>Signed-off-by: Denys Vlasenko <dvlasenk@...hat.com>
>CC: Ingo Molnar <mingo@...nel.org>
>CC: H. Peter Anvin <hpa@...or.com>
>CC: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
>CC: Boris Ostrovsky <boris.ostrovsky@...cle.com>
>CC: David Vrabel <david.vrabel@...rix.com>
>CC: Joerg Roedel <joro@...tes.org>
>CC: Gleb Natapov <gleb@...nel.org>
>CC: Paolo Bonzini <pbonzini@...hat.com>
>CC: kvm@...r.kernel.org
>CC: x86@...nel.org
>CC: linux-kernel@...r.kernel.org
>---
> arch/x86/entry/entry_32.S | 2 +-
> arch/x86/include/asm/desc.h | 16 +++++++++++-----
> arch/x86/kernel/cpu/common.c | 10 ++++++++--
> arch/x86/kernel/cpu/perf_event.c | 2 +-
> arch/x86/kernel/head_32.S | 4 ++--
> arch/x86/kernel/head_64.S | 2 +-
> arch/x86/kernel/vmlinux.lds.S | 2 +-
> arch/x86/tools/relocs.c | 2 +-
> arch/x86/xen/enlighten.c | 4 ++--
> 9 files changed, 28 insertions(+), 16 deletions(-)
>
>diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
>index b2909bf..bc6ae1c 100644
>--- a/arch/x86/entry/entry_32.S
>+++ b/arch/x86/entry/entry_32.S
>@@ -429,7 +429,7 @@ ldt_ss:
> * compensating for the offset by changing to the ESPFIX segment with
> * a base address that matches for the difference.
> */
>-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
>+#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8)
> mov %esp, %edx /* load kernel esp */
> mov PT_OLDESP(%esp), %eax /* load userspace esp */
> mov %dx, %ax /* eax: new kernel esp */
>diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
>index 4e10d73..76de300 100644
>--- a/arch/x86/include/asm/desc.h
>+++ b/arch/x86/include/asm/desc.h
>@@ -39,15 +39,21 @@ extern gate_desc idt_table[];
> extern struct desc_ptr debug_idt_descr;
> extern gate_desc debug_idt_table[];
>
>-struct gdt_page {
>+struct cpu_gdt {
> struct desc_struct gdt[GDT_ENTRIES];
>-} __attribute__((aligned(PAGE_SIZE)));
>-
>-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
>+}
>+#ifdef CONFIG_HYPERVISOR_GUEST
>+/* Xen et al want GDT to have its own page. They remap it read-only */
>+__attribute__((aligned(PAGE_SIZE)));
>+DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt);
>+#else
>+____cacheline_aligned;
>+DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt);
>+#endif
>
> static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
> {
>- return per_cpu(gdt_page, cpu).gdt;
>+ return per_cpu(cpu_gdt, cpu).gdt;
> }
>
> #ifdef CONFIG_X86_64
>diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
>index de22ea7..6b90785 100644
>--- a/arch/x86/kernel/cpu/common.c
>+++ b/arch/x86/kernel/cpu/common.c
>@@ -92,7 +92,13 @@ static const struct cpu_dev default_cpu = {
>
> static const struct cpu_dev *this_cpu = &default_cpu;
>
>-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
>+#ifdef CONFIG_HYPERVISOR_GUEST
>+/* Xen et al want GDT to have its own page. They remap it read-only */
>+DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) =
>+#else
>+DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) =
>+#endif
>+{ .gdt = {
> #ifdef CONFIG_X86_64
> /*
> * We need valid kernel segments for data and code in long mode too
>@@ -144,7 +150,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
> GDT_STACK_CANARY_INIT
> #endif
> } };
>-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
>+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
>
> static int __init x86_mpx_setup(char *s)
> {
>diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
>index 66dd3fe9..41ebc94 100644
>--- a/arch/x86/kernel/cpu/perf_event.c
>+++ b/arch/x86/kernel/cpu/perf_event.c
>@@ -2198,7 +2198,7 @@ static unsigned long get_segment_base(unsigned int segment)
> if (idx > GDT_ENTRIES)
> return 0;
>
>- desc = raw_cpu_ptr(gdt_page.gdt) + idx;
>+ desc = raw_cpu_ptr(cpu_gdt.gdt) + idx;
> }
>
> return get_desc_base(desc);
>diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
>index 0e2d96f..33c9d10 100644
>--- a/arch/x86/kernel/head_32.S
>+++ b/arch/x86/kernel/head_32.S
>@@ -521,7 +521,7 @@ setup_once:
> * relocation. Manually set base address in stack canary
> * segment descriptor.
> */
>- movl $gdt_page,%eax
>+ movl $cpu_gdt,%eax
> movl $stack_canary,%ecx
> movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
> shrl $16, %ecx
>@@ -758,7 +758,7 @@ idt_descr:
> .word 0 # 32 bit align gdt_desc.address
> ENTRY(early_gdt_descr)
> .word GDT_ENTRIES*8-1
>- .long gdt_page /* Overwritten for secondary CPUs */
>+ .long cpu_gdt /* Overwritten for secondary CPUs */
>
> /*
> * The boot_gdt must mirror the equivalent in setup.S and is
>diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
>index 1d40ca8..5a80e8c 100644
>--- a/arch/x86/kernel/head_64.S
>+++ b/arch/x86/kernel/head_64.S
>@@ -510,7 +510,7 @@ NEXT_PAGE(level1_fixmap_pgt)
> early_gdt_descr:
> .word GDT_ENTRIES*8-1
> early_gdt_descr_base:
>- .quad INIT_PER_CPU_VAR(gdt_page)
>+ .quad INIT_PER_CPU_VAR(cpu_gdt)
>
> ENTRY(phys_base)
> /* This must match the first entry in level2_kernel_pgt */
>diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
>index 74e4bf1..198e46b 100644
>--- a/arch/x86/kernel/vmlinux.lds.S
>+++ b/arch/x86/kernel/vmlinux.lds.S
>@@ -348,7 +348,7 @@ SECTIONS
> * for the boot processor.
> */
> #define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
>-INIT_PER_CPU(gdt_page);
>+INIT_PER_CPU(cpu_gdt);
> INIT_PER_CPU(irq_stack_union);
>
> /*
>diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
>index 0c2fae8..5a1da82 100644
>--- a/arch/x86/tools/relocs.c
>+++ b/arch/x86/tools/relocs.c
>@@ -736,7 +736,7 @@ static void percpu_init(void)
> *
> * The "gold" linker incorrectly associates:
> * init_per_cpu__irq_stack_union
>- * init_per_cpu__gdt_page
>+ * init_per_cpu__cpu_gdt
> */
> static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
> {
>diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
>index 30d12af..baecfc6 100644
>--- a/arch/x86/xen/enlighten.c
>+++ b/arch/x86/xen/enlighten.c
>@@ -1592,10 +1592,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
>
> /*
> * The only reliable way to retain the initial address of the
>- * percpu gdt_page is to remember it here, so we can go and
>+ * percpu cpu_gdt is to remember it here, so we can go and
> * mark it RW later, when the initial percpu area is freed.
> */
>- xen_initial_gdt = &per_cpu(gdt_page, 0);
>+ xen_initial_gdt = &per_cpu(cpu_gdt, 0);
>
> xen_smp_init();
>
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists