Use %gs as the PDA base-segment in the kernel. This patch is the meat of the PDA change. This patch makes several related changes: 1: Most significantly, %gs is now used in the kernel. This means that on entry, the old value of %gs is saved away, and it is reloaded with __KERNEL_PDA. 2: entry.S constructs the stack in the shape of struct pt_regs, and this is passed around the kernel so that the process's saved register state can be accessed. Unfortunately struct pt_regs doesn't currently have space for %gs (or %fs). This patch extends pt_regs to add space for gs (no space is allocated for %fs, since it won't be used, and it would just complicate the code in entry.S to work around the space). 3: Because %gs is now saved on the stack like %ds, %es and the integer registers, there are a number of places where it no longer needs to be handled specially; namely context switch, and saving/restoring the register state in a signal context. 4: And since kernel threads run in kernel space and call normal kernel code, they need to be created with their %gs == __KERNEL_PDA. NOTE: even though it's called "ptrace-abi.h", this file does not define a user-space visible ABI. Signed-off-by: Jeremy Fitzhardinge Cc: Chuck Ebbert <76306.1226@compuserve.com> Cc: Zachary Amsden Cc: Jan Beulich Cc: Andi Kleen --- arch/i386/kernel/asm-offsets.c | 1 arch/i386/kernel/cpu/common.c | 21 +++++++++- arch/i386/kernel/entry.S | 81 +++++++++++++++++++++++----------------- arch/i386/kernel/head.S | 33 +++++++++++++--- arch/i386/kernel/process.c | 27 ++++++------- arch/i386/kernel/signal.c | 6 -- include/asm-i386/mmu_context.h | 4 - include/asm-i386/processor.h | 4 + include/asm-i386/ptrace.h | 2 kernel/fork.c | 2 10 files changed, 119 insertions(+), 62 deletions(-) diff -r 1b8b053ad03f arch/i386/kernel/asm-offsets.c --- a/arch/i386/kernel/asm-offsets.c Mon Sep 25 23:31:01 2006 -0700 +++ b/arch/i386/kernel/asm-offsets.c Mon Sep 25 23:31:04 2006 -0700 @@ -68,6 +68,7 @@ void foo(void) OFFSET(PT_EAX, pt_regs, eax); OFFSET(PT_DS, pt_regs, xds); OFFSET(PT_ES, pt_regs, xes); + OFFSET(PT_GS, pt_regs, xgs); OFFSET(PT_ORIG_EAX, pt_regs, orig_eax); OFFSET(PT_EIP, pt_regs, eip); OFFSET(PT_CS, pt_regs, xcs); diff -r 1b8b053ad03f arch/i386/kernel/cpu/common.c --- a/arch/i386/kernel/cpu/common.c Mon Sep 25 23:31:01 2006 -0700 +++ b/arch/i386/kernel/cpu/common.c Mon Sep 25 23:31:04 2006 -0700 @@ -587,6 +587,14 @@ void __init early_cpu_init(void) #endif } +/* Make sure %gs is initialized properly in idle threads */ +struct pt_regs * __devinit idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + regs->xgs = __KERNEL_PDA; + return regs; +} + __cpuinit int alloc_gdt(int cpu) { struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); @@ -640,6 +648,14 @@ struct i386_pda boot_pda = { .pcurrent = &init_task, }; +static inline void set_kernel_gs(void) +{ + /* Set %gs for this CPU's PDA. Memory clobber is to create a + barrier with respect to any PDA operations, so the compiler + doesn't move any before here. */ + asm volatile ("mov %0, %%gs" : : "r" (__KERNEL_PDA) : "memory"); +} + /* Initialize the CPU's GDT and PDA. The boot CPU does this for itself, but secondaries find this done for them. */ __cpuinit int init_gdt(int cpu, struct task_struct *idle) @@ -698,6 +714,7 @@ static void __cpuinit _cpu_init(int cpu, the boot CPU, this will transition from the boot gdt+pda to the real ones). */ load_gdt(cpu_gdt_descr); + set_kernel_gs(); if (cpu_test_and_set(cpu, cpu_initialized)) { printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); @@ -736,8 +753,8 @@ static void __cpuinit _cpu_init(int cpu, __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); #endif - /* Clear %fs and %gs. */ - asm volatile ("movl %0, %%fs; movl %0, %%gs" : : "r" (0)); + /* Clear %fs. */ + asm volatile ("mov %0, %%fs" : : "r" (0)); /* Clear all 6 debug registers: */ set_debugreg(0, 0); diff -r 1b8b053ad03f arch/i386/kernel/entry.S --- a/arch/i386/kernel/entry.S Mon Sep 25 23:31:01 2006 -0700 +++ b/arch/i386/kernel/entry.S Mon Sep 25 23:32:16 2006 -0700 @@ -30,12 +30,13 @@ * 18(%esp) - %eax * 1C(%esp) - %ds * 20(%esp) - %es - * 24(%esp) - orig_eax - * 28(%esp) - %eip - * 2C(%esp) - %cs - * 30(%esp) - %eflags - * 34(%esp) - %oldesp - * 38(%esp) - %oldss + * 24(%esp) - %gs + * 28(%esp) - orig_eax + * 2C(%esp) - %eip + * 30(%esp) - %cs + * 34(%esp) - %eflags + * 38(%esp) - %oldesp + * 3C(%esp) - %oldss * * "current" is in register %ebx during any slow entries. */ @@ -91,6 +92,9 @@ 1: #define SAVE_ALL \ cld; \ + pushl %gs; \ + CFI_ADJUST_CFA_OFFSET 4;\ + /*CFI_REL_OFFSET gs, 0;*/\ pushl %es; \ CFI_ADJUST_CFA_OFFSET 4;\ /*CFI_REL_OFFSET es, 0;*/\ @@ -120,8 +124,10 @@ 1: CFI_REL_OFFSET ebx, 0;\ movl $(__USER_DS), %edx; \ movl %edx, %ds; \ - movl %edx, %es; - + movl %edx, %es; \ + movl $(__KERNEL_PDA), %edx; \ + movl %edx, %gs + #define RESTORE_INT_REGS \ popl %ebx; \ CFI_ADJUST_CFA_OFFSET -4;\ @@ -153,17 +159,22 @@ 2: popl %es; \ 2: popl %es; \ CFI_ADJUST_CFA_OFFSET -4;\ /*CFI_RESTORE es;*/\ -.section .fixup,"ax"; \ -3: movl $0,(%esp); \ +3: popl %gs; \ + CFI_ADJUST_CFA_OFFSET -4;\ + /*CFI_RESTORE gs;*/\ +.pushsection .fixup,"ax"; \ +4: movl $0,(%esp); \ jmp 1b; \ -4: movl $0,(%esp); \ +5: movl $0,(%esp); \ jmp 2b; \ -.previous; \ +6: movl $0,(%esp); \ + jmp 3b; \ .section __ex_table,"a";\ .align 4; \ - .long 1b,3b; \ - .long 2b,4b; \ -.previous + .long 1b,4b; \ + .long 2b,5b; \ + .long 3b,6b; \ +.popsection #define RING0_INT_FRAME \ CFI_STARTPROC simple;\ @@ -321,10 +332,17 @@ 1: movl (%ebp),%ebp movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx xorl %ebp,%ebp +1: mov PT_GS(%esp), %gs TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT CFI_ENDPROC - +.pushsection .fixup,"ax" +2: movl $0,PT_GS(%esp) + jmp 1b +.section __ex_table,"a" + .align 4 + .long 1b,2b +.popsection # system call handler stub ENTRY(system_call) @@ -370,7 +388,7 @@ restore_nocheck: TRACE_IRQS_IRET restore_nocheck_notrace: RESTORE_REGS - addl $4, %esp + addl $4, %esp # skip orig_eax/error_code CFI_ADJUST_CFA_OFFSET -4 1: INTERRUPT_RETURN .section .fixup,"ax" @@ -512,14 +530,12 @@ syscall_badsys: /* put ESP to the proper location */ \ movl %eax, %esp; #define UNWIND_ESPFIX_STACK \ - pushl %eax; \ CFI_ADJUST_CFA_OFFSET 4; \ movl %ss, %eax; \ /* see if on 16bit stack */ \ - cmpw $__ESPFIX_SS, %ax; \ + cmp $__ESPFIX_SS, %eax; \ je 28f; \ -27: popl %eax; \ - CFI_ADJUST_CFA_OFFSET -4; \ +27: CFI_ADJUST_CFA_OFFSET -4; \ .section .fixup,"ax"; \ 28: movl $__KERNEL_DS, %eax; \ movl %eax, %ds; \ @@ -588,13 +604,15 @@ KPROBE_ENTRY(page_fault) CFI_ADJUST_CFA_OFFSET 4 ALIGN error_code: + /* the function address is in %gs's slot on the stack */ + pushl %es + CFI_ADJUST_CFA_OFFSET 4 pushl %ds CFI_ADJUST_CFA_OFFSET 4 /*CFI_REL_OFFSET ds, 0*/ pushl %eax CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET eax, 0 - xorl %eax, %eax pushl %ebp CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebp, 0 @@ -607,7 +625,6 @@ error_code: pushl %edx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET edx, 0 - decl %eax # eax = -1 pushl %ecx CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ecx, 0 @@ -615,21 +632,17 @@ error_code: CFI_ADJUST_CFA_OFFSET 4 CFI_REL_OFFSET ebx, 0 cld - pushl %es - CFI_ADJUST_CFA_OFFSET 4 - /*CFI_REL_OFFSET es, 0*/ UNWIND_ESPFIX_STACK - popl %ecx - CFI_ADJUST_CFA_OFFSET -4 - /*CFI_REGISTER es, ecx*/ - movl PT_ES(%esp), %edi # get the function address + movl PT_GS(%esp), %edi # get the function address movl PT_ORIG_EAX(%esp), %edx # get the error code - movl %eax, PT_ORIG_EAX(%esp) - movl %ecx, PT_ES(%esp) - /*CFI_REL_OFFSET es, ES*/ + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart + mov %gs, PT_GS(%esp) + /*CFI_REL_OFFSET gs, GS*/ movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es + movl $(__KERNEL_PDA), %ecx + movl %ecx, %gs movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception @@ -939,6 +952,7 @@ ENTRY(arch_unwind_init_running) movl %ebx, PT_EAX(%edx) movl $__USER_DS, PT_DS(%edx) movl $__USER_DS, PT_ES(%edx) + movl $0, PT_GS(%edx) movl %ebx, PT_ORIG_EAX(%edx) movl %ecx, PT_EIP(%edx) movl 12(%esp), %ecx diff -r 1b8b053ad03f arch/i386/kernel/head.S --- a/arch/i386/kernel/head.S Mon Sep 25 23:31:01 2006 -0700 +++ b/arch/i386/kernel/head.S Mon Sep 25 23:31:04 2006 -0700 @@ -302,6 +302,7 @@ 2: movl %cr0,%eax movl %eax,%cr0 call check_x87 + call setup_pda lgdt cpu_gdt_descr lidt idt_descr ljmp $(__KERNEL_CS),$1f @@ -312,10 +313,13 @@ 1: movl $(__KERNEL_DS),%eax # reload all movl %eax,%ds movl %eax,%es - xorl %eax,%eax # Clear FS/GS and LDT + xorl %eax,%eax # Clear FS and LDT movl %eax,%fs - movl %eax,%gs lldt %ax + + movl $(__KERNEL_PDA),%eax + mov %eax,%gs + cld # gcc2 wants the direction flag cleared at all times pushl %eax # fake return address #ifdef CONFIG_SMP @@ -345,6 +349,23 @@ 1: movb $1,X86_HARD_MATH .byte 0xDB,0xE4 /* fsetpm for 287, ignored by 387 */ ret +/* + * Point the GDT at this CPU's PDA. On boot this will be + * cpu_gdt_table and boot_pda; for secondary CPUs, these will be + * that CPU's GDT and PDA. + */ +setup_pda: + /* get the PDA pointer */ + movl start_pda, %eax + + /* slot the PDA address into the GDT */ + mov cpu_gdt_descr+2, %ecx + mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ + shr $16, %eax + mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ + mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ + ret + /* * setup_idt * @@ -424,7 +445,9 @@ ENTRY(empty_zero_page) * This starts the data section. */ .data - +ENTRY(start_pda) + .long boot_pda + ENTRY(stack_start) .long init_thread_union+THREAD_SIZE .long __BOOT_DS @@ -458,7 +481,7 @@ idt_descr: # boot GDT descriptor (later on used by CPU#0): .word 0 # 32 bit align gdt_desc.address -cpu_gdt_descr: +ENTRY(cpu_gdt_descr) .word GDT_ENTRIES*8-1 .long cpu_gdt_table @@ -518,7 +541,7 @@ ENTRY(cpu_gdt_table) .quad 0x004092000000ffff /* 0xc8 APM DS data */ .quad 0x0000920000000000 /* 0xd0 - ESPFIX 16-bit SS */ - .quad 0x0000000000000000 /* 0xd8 - PDA */ + .quad 0x00cf92000000ffff /* 0xd8 - PDA */ .quad 0x0000000000000000 /* 0xe0 - unused */ .quad 0x0000000000000000 /* 0xe8 - unused */ .quad 0x0000000000000000 /* 0xf0 - unused */ diff -r 1b8b053ad03f arch/i386/kernel/process.c --- a/arch/i386/kernel/process.c Mon Sep 25 23:31:01 2006 -0700 +++ b/arch/i386/kernel/process.c Mon Sep 25 23:31:04 2006 -0700 @@ -56,6 +56,7 @@ #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); @@ -336,6 +337,7 @@ int kernel_thread(int (*fn)(void *), voi regs.xds = __USER_DS; regs.xes = __USER_DS; + regs.xgs = __KERNEL_PDA; regs.orig_eax = -1; regs.eip = (unsigned long) kernel_thread_helper; regs.xcs = __KERNEL_CS | get_kernel_rpl(); @@ -421,7 +423,6 @@ int copy_thread(int nr, unsigned long cl p->thread.eip = (unsigned long) ret_from_fork; savesegment(fs,p->thread.fs); - savesegment(gs,p->thread.gs); tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { @@ -645,16 +646,16 @@ struct task_struct fastcall * __switch_t load_esp0(tss, next); /* - * Save away %fs and %gs. No need to save %es and %ds, as - * those are always kernel segments while inside the kernel. - * Doing this before setting the new TLS descriptors avoids - * the situation where we temporarily have non-reloadable - * segments in %fs and %gs. This could be an issue if the - * NMI handler ever used %fs or %gs (it does not today), or - * if the kernel is running inside of a hypervisor layer. + * Save away %fs. No need to save %gs, as it was saved on the + * stack on entry. No need to save %es and %ds, as those are + * always kernel segments while inside the kernel. Doing this + * before setting the new TLS descriptors avoids the situation + * where we temporarily have non-reloadable segments in %fs + * and %gs. This could be an issue if the NMI handler ever + * used %fs or %gs (it does not today), or if the kernel is + * running inside of a hypervisor layer. */ savesegment(fs, prev->fs); - savesegment(gs, prev->gs); /* * Load the per-thread Thread-Local Storage descriptor. @@ -662,16 +663,14 @@ struct task_struct fastcall * __switch_t load_TLS(next, cpu); /* - * Restore %fs and %gs if needed. + * Restore %fs if needed. * - * Glibc normally makes %fs be zero, and %gs is one of - * the TLS segments. + * Glibc normally makes %fs be zero. */ if (unlikely(prev->fs | next->fs)) loadsegment(fs, next->fs); - if (prev->gs | next->gs) - loadsegment(gs, next->gs); + write_pda(pcurrent, next_p); /* * Restore IOPL if needed. diff -r 1b8b053ad03f arch/i386/kernel/signal.c --- a/arch/i386/kernel/signal.c Mon Sep 25 23:31:01 2006 -0700 +++ b/arch/i386/kernel/signal.c Mon Sep 25 23:31:04 2006 -0700 @@ -128,7 +128,7 @@ restore_sigcontext(struct pt_regs *regs, X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \ X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF) - GET_SEG(gs); + COPY_SEG(gs); GET_SEG(fs); COPY_SEG(es); COPY_SEG(ds); @@ -244,9 +244,7 @@ setup_sigcontext(struct sigcontext __use { int tmp, err = 0; - tmp = 0; - savesegment(gs, tmp); - err |= __put_user(tmp, (unsigned int __user *)&sc->gs); + err |= __put_user(regs->xgs, (unsigned int __user *)&sc->gs); savesegment(fs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->fs); diff -r 1b8b053ad03f include/asm-i386/mmu_context.h --- a/include/asm-i386/mmu_context.h Mon Sep 25 23:31:01 2006 -0700 +++ b/include/asm-i386/mmu_context.h Mon Sep 25 23:31:04 2006 -0700 @@ -62,8 +62,8 @@ static inline void switch_mm(struct mm_s #endif } -#define deactivate_mm(tsk, mm) \ - asm("movl %0,%%fs ; movl %0,%%gs": :"r" (0)) +#define deactivate_mm(tsk, mm) \ + asm("movl %0,%%fs": :"r" (0)); #define activate_mm(prev, next) \ switch_mm((prev),(next),NULL) diff -r 1b8b053ad03f include/asm-i386/processor.h --- a/include/asm-i386/processor.h Mon Sep 25 23:31:01 2006 -0700 +++ b/include/asm-i386/processor.h Mon Sep 25 23:31:04 2006 -0700 @@ -477,6 +477,7 @@ struct thread_struct { .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ + .gs = __KERNEL_PDA, \ } /* @@ -504,7 +505,8 @@ static inline void load_esp0(struct tss_ } #define start_thread(regs, new_eip, new_esp) do { \ - __asm__("movl %0,%%fs ; movl %0,%%gs": :"r" (0)); \ + __asm__("movl %0,%%fs": :"r" (0)); \ + regs->xgs = 0; \ set_fs(USER_DS); \ regs->xds = __USER_DS; \ regs->xes = __USER_DS; \ diff -r 1b8b053ad03f include/asm-i386/ptrace.h --- a/include/asm-i386/ptrace.h Mon Sep 25 23:31:01 2006 -0700 +++ b/include/asm-i386/ptrace.h Mon Sep 25 23:31:04 2006 -0700 @@ -33,6 +33,8 @@ struct pt_regs { long eax; int xds; int xes; + /* int xfs; */ + int xgs; long orig_eax; long eip; int xcs; diff -r 1b8b053ad03f kernel/fork.c --- a/kernel/fork.c Mon Sep 25 23:31:01 2006 -0700 +++ b/kernel/fork.c Mon Sep 25 23:31:04 2006 -0700 @@ -1299,7 +1299,7 @@ fork_out: return ERR_PTR(retval); } -struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +noinline struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs) { memset(regs, 0, sizeof(struct pt_regs)); return regs;