[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1443290440-14930-1-git-send-email-dvlasenk@redhat.com>
Date: Sat, 26 Sep 2015 20:00:40 +0200
From: Denys Vlasenko <dvlasenk@...hat.com>
To: Ingo Molnar <mingo@...nel.org>
Cc: Denys Vlasenko <dvlasenk@...hat.com>,
"H. Peter Anvin" <hpa@...or.com>,
Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
Boris Ostrovsky <boris.ostrovsky@...cle.com>,
David Vrabel <david.vrabel@...rix.com>,
Joerg Roedel <joro@...tes.org>, Gleb Natapov <gleb@...nel.org>,
Paolo Bonzini <pbonzini@...hat.com>, kvm@...r.kernel.org,
x86@...nel.org, linux-kernel@...r.kernel.org
Subject: [PATCH] x86: Use entire page for the per-cpu GDT only if paravirt-enabled
We have our GDT in a page-sized per-cpu structure, gdt_page.
On an x86_64 kernel, the GDT is 128 bytes - only ~3% of that page is used.
It is page-sized because of paravirt. Hypervisors need to know when
GDT is changed, so they remap it read-only and handle write faults.
If it's not in its own page, other writes nearby will cause
those faults too.
In other words, we need GDT to live in a separate page
only if CONFIG_HYPERVISOR_GUEST=y.
This patch reduces the GDT's alignment to cacheline alignment
when CONFIG_HYPERVISOR_GUEST is not set.
Patch also renames gdt_page to cpu_gdt (mimicking naming of existing
cpu_tss per-cpu variable), since now it is not always a full page.
$ objdump -x vmlinux | grep .data..percpu | sort
Before:
(offset) (size)
0000000000000000 w O .data..percpu 0000000000004000 irq_stack_union
0000000000004000 w O .data..percpu 0000000000005000 exception_stacks
0000000000009000 w O .data..percpu 0000000000001000 gdt_page <<< HERE
000000000000a000 w O .data..percpu 0000000000000008 espfix_waddr
000000000000a008 w O .data..percpu 0000000000000008 espfix_stack
...
0000000000019398 g .data..percpu 0000000000000000 __per_cpu_end
After:
0000000000000000 w O .data..percpu 0000000000004000 irq_stack_union
0000000000004000 w O .data..percpu 0000000000005000 exception_stacks
0000000000009000 w O .data..percpu 0000000000000008 espfix_waddr
0000000000009008 w O .data..percpu 0000000000000008 espfix_stack
...
0000000000013c80 w O .data..percpu 0000000000000040 cyc2ns
0000000000013cc0 w O .data..percpu 00000000000022c0 cpu_tss
0000000000015f80 w O .data..percpu 0000000000000080 cpu_gdt <<< HERE
0000000000016000 w O .data..percpu 0000000000000018 cpu_tlbstate
...
0000000000018418 g .data..percpu 0000000000000000 __per_cpu_end
Run-tested on a 144 CPU machine.
Signed-off-by: Denys Vlasenko <dvlasenk@...hat.com>
CC: Ingo Molnar <mingo@...nel.org>
CC: H. Peter Anvin <hpa@...or.com>
CC: Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>
CC: Boris Ostrovsky <boris.ostrovsky@...cle.com>
CC: David Vrabel <david.vrabel@...rix.com>
CC: Joerg Roedel <joro@...tes.org>
CC: Gleb Natapov <gleb@...nel.org>
CC: Paolo Bonzini <pbonzini@...hat.com>
CC: kvm@...r.kernel.org
CC: x86@...nel.org
CC: linux-kernel@...r.kernel.org
---
arch/x86/entry/entry_32.S | 2 +-
arch/x86/include/asm/desc.h | 16 +++++++++++-----
arch/x86/kernel/cpu/common.c | 10 ++++++++--
arch/x86/kernel/cpu/perf_event.c | 2 +-
arch/x86/kernel/head_32.S | 4 ++--
arch/x86/kernel/head_64.S | 2 +-
arch/x86/kernel/vmlinux.lds.S | 2 +-
arch/x86/tools/relocs.c | 2 +-
arch/x86/xen/enlighten.c | 4 ++--
9 files changed, 28 insertions(+), 16 deletions(-)
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b2909bf..bc6ae1c 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -429,7 +429,7 @@ ldt_ss:
* compensating for the offset by changing to the ESPFIX segment with
* a base address that matches for the difference.
*/
-#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
+#define GDT_ESPFIX_SS PER_CPU_VAR(cpu_gdt) + (GDT_ENTRY_ESPFIX_SS * 8)
mov %esp, %edx /* load kernel esp */
mov PT_OLDESP(%esp), %eax /* load userspace esp */
mov %dx, %ax /* eax: new kernel esp */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 4e10d73..76de300 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -39,15 +39,21 @@ extern gate_desc idt_table[];
extern struct desc_ptr debug_idt_descr;
extern gate_desc debug_idt_table[];
-struct gdt_page {
+struct cpu_gdt {
struct desc_struct gdt[GDT_ENTRIES];
-} __attribute__((aligned(PAGE_SIZE)));
-
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+}
+#ifdef CONFIG_HYPERVISOR_GUEST
+/* Xen et al want GDT to have its own page. They remap it read-only */
+__attribute__((aligned(PAGE_SIZE)));
+DECLARE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt);
+#else
+____cacheline_aligned;
+DECLARE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt);
+#endif
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
- return per_cpu(gdt_page, cpu).gdt;
+ return per_cpu(cpu_gdt, cpu).gdt;
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index de22ea7..6b90785 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -92,7 +92,13 @@ static const struct cpu_dev default_cpu = {
static const struct cpu_dev *this_cpu = &default_cpu;
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+#ifdef CONFIG_HYPERVISOR_GUEST
+/* Xen et al want GDT to have its own page. They remap it read-only */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct cpu_gdt, cpu_gdt) =
+#else
+DEFINE_PER_CPU_ALIGNED(struct cpu_gdt, cpu_gdt) =
+#endif
+{ .gdt = {
#ifdef CONFIG_X86_64
/*
* We need valid kernel segments for data and code in long mode too
@@ -144,7 +150,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
GDT_STACK_CANARY_INIT
#endif
} };
-EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_gdt);
static int __init x86_mpx_setup(char *s)
{
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 66dd3fe9..41ebc94 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2198,7 +2198,7 @@ static unsigned long get_segment_base(unsigned int segment)
if (idx > GDT_ENTRIES)
return 0;
- desc = raw_cpu_ptr(gdt_page.gdt) + idx;
+ desc = raw_cpu_ptr(cpu_gdt.gdt) + idx;
}
return get_desc_base(desc);
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 0e2d96f..33c9d10 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -521,7 +521,7 @@ setup_once:
* relocation. Manually set base address in stack canary
* segment descriptor.
*/
- movl $gdt_page,%eax
+ movl $cpu_gdt,%eax
movl $stack_canary,%ecx
movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
shrl $16, %ecx
@@ -758,7 +758,7 @@ idt_descr:
.word 0 # 32 bit align gdt_desc.address
ENTRY(early_gdt_descr)
.word GDT_ENTRIES*8-1
- .long gdt_page /* Overwritten for secondary CPUs */
+ .long cpu_gdt /* Overwritten for secondary CPUs */
/*
* The boot_gdt must mirror the equivalent in setup.S and is
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 1d40ca8..5a80e8c 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -510,7 +510,7 @@ NEXT_PAGE(level1_fixmap_pgt)
early_gdt_descr:
.word GDT_ENTRIES*8-1
early_gdt_descr_base:
- .quad INIT_PER_CPU_VAR(gdt_page)
+ .quad INIT_PER_CPU_VAR(cpu_gdt)
ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 74e4bf1..198e46b 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -348,7 +348,7 @@ SECTIONS
* for the boot processor.
*/
#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(cpu_gdt);
INIT_PER_CPU(irq_stack_union);
/*
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index 0c2fae8..5a1da82 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -736,7 +736,7 @@ static void percpu_init(void)
*
* The "gold" linker incorrectly associates:
* init_per_cpu__irq_stack_union
- * init_per_cpu__gdt_page
+ * init_per_cpu__cpu_gdt
*/
static int is_percpu_sym(ElfW(Sym) *sym, const char *symname)
{
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 30d12af..baecfc6 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1592,10 +1592,10 @@ asmlinkage __visible void __init xen_start_kernel(void)
/*
* The only reliable way to retain the initial address of the
- * percpu gdt_page is to remember it here, so we can go and
+ * percpu cpu_gdt is to remember it here, so we can go and
* mark it RW later, when the initial percpu area is freed.
*/
- xen_initial_gdt = &per_cpu(gdt_page, 0);
+ xen_initial_gdt = &per_cpu(cpu_gdt, 0);
xen_smp_init();
--
1.8.1.4
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists