Core Xen Implementation This patch is a rollup of all the core pieces of the Xen implementation, including booting, memory management, interrupts, time and so on. This adds a second entry point to head.S, which is jumped to when booted by Xen. This allows startup under Xen to be easily detected. Because Xen starts the kernel in a fairly sane state, very little setup is needed here; it just needs to jump into xen_start_kernel to init the paravirt_ops structure, and then jump into start_kernel proper. This also makes a few small adjustments to the gdt tables to make them properly suited to Xen. This patch also contains an initial set of paravirt ops for Xen, mostly ones which can just be implemented with the generic native_* version. At boot time, the global paravirt_ops structure is populated with the Xen pointers in xen_start_kernel, which then jumps to the standard start_kernel. Also implements Xen pagetable handling, including the machinery to implement direct pagetables. This means that pte update operations are intercepted so that pseudo-physical addresses can be converted to machine addresses. It also implements late pinning/early unpinning so that pagetables are initialized while they're normal read-write memory as much as possible, yet pinned to make cr3 reloads as efficient as possible. Xen implements interrupts in terms of event channels. This patch maps an event through an event channel to an irq, and then feeds it through the normal interrupt path, via a Xen irq_chip implementation. Context switches are optimised with the use of multicalls, so that the multiple hypercalls needed to perform the switch can be implemented with a single hypercall. In order to simplify the source changes, this is done via a generic multicall batching mechanism, which can be used to batch arbitrary series of hypercalls into a multicall. Time is implemented by using a clocksource which is driven from the hypervisor's nanosecond timebase. Xen implements time by extrapolating from known timestamps using the tsc; the hypervisor is responsible for making sure that the tsc is constant rate and synchronized between vcpus. Signed-off-by: Jeremy Fitzhardinge Cc: Ian Pratt Cc: Christian Limpach Cc: Adrian Bunk --- arch/i386/Makefile | 3 arch/i386/kernel/entry.S | 79 ++++ arch/i386/kernel/head.S | 5 arch/i386/kernel/paravirt.c | 3 arch/i386/kernel/vmlinux.lds.S | 1 arch/i386/xen/Kconfig | 2 arch/i386/xen/Makefile | 2 arch/i386/xen/enlighten.c | 727 ++++++++++++++++++++++++++++++++++++++ arch/i386/xen/events.c | 487 +++++++++++++++++++++++++ arch/i386/xen/features.c | 29 + arch/i386/xen/mmu.c | 377 +++++++++++++++++++ arch/i386/xen/mmu.h | 47 ++ arch/i386/xen/multicalls.c | 61 +++ arch/i386/xen/multicalls.h | 26 + arch/i386/xen/setup.c | 93 ++++ arch/i386/xen/time.c | 393 ++++++++++++++++++++ arch/i386/xen/xen-head.S | 34 + arch/i386/xen/xen-ops.h | 34 + include/asm-i386/irq.h | 1 include/asm-i386/xen/hypercall.h | 18 include/xen/events.h | 28 + include/xen/features.h | 26 + include/xen/page.h | 178 +++++++++ 23 files changed, 2650 insertions(+), 4 deletions(-) =================================================================== --- a/arch/i386/Makefile +++ b/arch/i386/Makefile @@ -93,6 +93,9 @@ mcore-$(CONFIG_X86_ES7000) := mach-defau mcore-$(CONFIG_X86_ES7000) := mach-default core-$(CONFIG_X86_ES7000) := arch/i386/mach-es7000/ +# Xen paravirtualization support +core-$(CONFIG_XEN) += arch/i386/xen/ + # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default =================================================================== --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -1023,6 +1023,85 @@ ENTRY(kernel_thread_helper) CFI_ENDPROC ENDPROC(kernel_thread_helper) +#ifdef CONFIG_XEN +/* Xen only supports sysenter/sysexit in ring0 guests, + and only if it the guest asks for it. So for now, + this should never be used. */ +ENTRY(xen_sti_sysexit) + CFI_STARTPROC + ud2 + CFI_ENDPROC +ENDPROC(xen_sti_sysexit) + +ENTRY(xen_hypervisor_callback) + CFI_STARTPROC + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + mov %esp, %eax + call xen_evtchn_do_upcall + jmp ret_from_intr + CFI_ENDPROC +ENDPROC(xen_hypervisor_callback) + +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we fix up by reattempting the load, and zeroing the segment +# register if the load fails. +# Category 2 we fix up by jumping to do_iret_error. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by maintaining a status value in EAX. +ENTRY(xen_failsafe_callback) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 + movl $1,%eax +1: mov 4(%esp),%ds +2: mov 8(%esp),%es +3: mov 12(%esp),%fs +4: mov 16(%esp),%gs + testl %eax,%eax + popl %eax + CFI_ADJUST_CFA_OFFSET -4 + lea 16(%esp),%esp + CFI_ADJUST_CFA_OFFSET -16 + jz 5f + addl $16,%esp + jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) +5: pushl $0 # EAX == 0 => Category 1 (Bad segment) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + jmp ret_from_exception + CFI_ENDPROC + +.section .fixup,"ax" +6: xorl %eax,%eax + movl %eax,4(%esp) + jmp 1b +7: xorl %eax,%eax + movl %eax,8(%esp) + jmp 2b +8: xorl %eax,%eax + movl %eax,12(%esp) + jmp 3b +9: xorl %eax,%eax + movl %eax,16(%esp) + jmp 4b +.previous +.section __ex_table,"a" + .align 4 + .long 1b,6b + .long 2b,7b + .long 3b,8b + .long 4b,9b +.previous +ENDPROC(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ + .section .rodata,"a" #include "syscall_table.S" =================================================================== --- a/arch/i386/kernel/head.S +++ b/arch/i386/kernel/head.S @@ -533,6 +533,8 @@ unhandled_paravirt: ud2 #endif +#include "../xen/xen-head.S" + /* * Real beginning of normal "text" segment */ @@ -542,7 +544,8 @@ ENTRY(_stext) /* * BSS section */ -.section ".bss.page_aligned","w" +.section ".bss.page_aligned","wa" + .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) .fill 1024,4,0 ENTRY(empty_zero_page) =================================================================== --- a/arch/i386/kernel/paravirt.c +++ b/arch/i386/kernel/paravirt.c @@ -292,6 +292,7 @@ struct paravirt_ops paravirt_ops = { .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, + .startup_ipi_hook = paravirt_nop, #endif .set_lazy_mode = paravirt_nop, @@ -342,8 +343,6 @@ struct paravirt_ops paravirt_ops = { .dup_mmap = paravirt_nop, .exit_mmap = paravirt_nop, .activate_mm = paravirt_nop, - - .startup_ipi_hook = paravirt_nop, }; /* =================================================================== --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -94,6 +94,7 @@ SECTIONS . = ALIGN(4096); .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) *(.data.idt) } =================================================================== --- a/arch/i386/xen/Kconfig +++ b/arch/i386/xen/Kconfig @@ -4,7 +4,7 @@ config XEN bool "Enable support for Xen hypervisor" - depends on PARAVIRT && HZ_100 && !PREEMPT && !NO_HZ + depends on PARAVIRT && !PREEMPT default y help This is the Linux Xen port. =================================================================== --- /dev/null +++ b/arch/i386/xen/Makefile @@ -0,0 +1,2 @@ +obj-y := enlighten.o setup.o events.o time.o \ + features.o mmu.o multicalls.o =================================================================== --- /dev/null +++ b/arch/i386/xen/enlighten.c @@ -0,0 +1,727 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xen-ops.h" +#include "mmu.h" +#include "multicalls.h" + +EXPORT_SYMBOL_GPL(hypercall_page); + +DEFINE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); +DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +DEFINE_PER_CPU(unsigned long, xen_cr3); + +/* Code defined in entry.S (not a function) */ +extern const char xen_sti_sysexit[]; + +struct start_info *xen_start_info; +EXPORT_SYMBOL_GPL(xen_start_info); + +static void xen_vcpu_setup(int cpu) +{ + per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; +} + +static void __init xen_banner(void) +{ + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", + paravirt_ops.name); + printk(KERN_INFO "Hypervisor signature: %s\n", xen_start_info->magic); +} + +static void xen_cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + unsigned maskedx = ~0; + if (*eax == 1) + maskedx = ~((1 << X86_FEATURE_APIC) | + (1 << X86_FEATURE_ACPI) | + (1 << X86_FEATURE_ACC)); + + asm(XEN_EMULATE_PREFIX "cpuid" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); + *edx &= maskedx; +} + +static void xen_set_debugreg(int reg, unsigned long val) +{ + HYPERVISOR_set_debugreg(reg, val); +} + +static unsigned long xen_get_debugreg(int reg) +{ + return HYPERVISOR_get_debugreg(reg); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + preempt_enable(); + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + preempt_disable(); + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + if (flags == 0) { + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + preempt_enable(); + } else + preempt_enable_no_resched(); +} + +static void xen_irq_disable(void) +{ + struct vcpu_info *vcpu; + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + force_evtchn_callback(); + preempt_enable(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). */ + if (HYPERVISOR_sched_op(SCHEDOP_block, 0) != 0) + BUG(); +} + +static void xen_halt(void) +{ +#if 0 + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); +#endif +} + +static void xen_set_lazy_mode(enum paravirt_lazy_mode mode) +{ + enum paravirt_lazy_mode *lazy = &get_cpu_var(xen_lazy_mode); + + xen_mc_flush(); + + *lazy = mode; + + put_cpu_var(xen_lazy_mode); +} + +static unsigned long xen_store_tr(void) +{ + return 0; +} + +static void xen_set_ldt(const void *addr, unsigned entries) +{ + unsigned long linear_addr = (unsigned long)addr; + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + + op = mcs.args; + op->cmd = MMUEXT_SET_LDT; + if (linear_addr) { + /* ldt my be vmalloced, use arbitrary_virt_to_machine */ + xmaddr_t maddr; + maddr = arbitrary_virt_to_machine((unsigned long)addr); + linear_addr = (unsigned long)maddr.maddr; + } + op->arg1.linear_addr = linear_addr; + op->arg2.nr_ents = entries; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(); +} + +static void xen_load_gdt(const struct Xgt_desc_struct *dtr) +{ + unsigned long *frames; + unsigned long va = dtr->address; + unsigned int size = dtr->size + 1; + int f; + struct multicall_space mcs; + + BUG_ON(size > 16*PAGE_SIZE); + BUG_ON(va & ~PAGE_MASK); + + mcs = xen_mc_entry(sizeof(*frames) * (size/PAGE_SIZE)); + frames = mcs.args; + + for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly((void *)va); + } + + MULTI_set_gdt(mcs.mc, frames, size/8); + + xen_mc_issue(); +} + +static void load_TLS_descriptor(struct thread_struct *t, + unsigned int cpu, unsigned int i) +{ + struct desc_struct *gdt = get_cpu_gdt_table(cpu); + xmaddr_t maddr = virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); + struct multicall_space mc = xen_mc_entry(0); + + MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]); +} + +static void xen_load_tls(struct thread_struct *t, unsigned int cpu) +{ + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(); +} + +static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, u32 low, u32 high) +{ + unsigned long lp = (unsigned long)&dt[entrynum]; + xmaddr_t mach_lp = virt_to_machine(lp); + u64 entry = (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry)) + BUG(); +} + +static int cvt_gate_to_trap(int vector, u32 low, u32 high, struct trap_info *info) +{ + u8 type, dpl; + + type = (high >> 8) & 0x1f; + dpl = (high >> 13) & 3; + + if (type != 0xf && type != 0xe) + return 0; + + info->vector = vector; + info->address = (high & 0xffff0000) | (low & 0x0000ffff); + info->cs = low >> 16; + info->flags = dpl; + /* interrupt gates clear IF */ + if (type == 0xe) + info->flags |= 4; + + return 1; +} + +/* Locations of each CPU's IDT */ +static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc); + +/* Set an IDT entry. If the entry is part of the current IDT, then + also update Xen. */ +static void xen_write_idt_entry(struct desc_struct *dt, int entrynum, u32 low, u32 high) +{ + + int cpu = smp_processor_id(); + unsigned long p = (unsigned long)&dt[entrynum]; + unsigned long start = per_cpu(idt_desc, cpu).address; + unsigned long end = start + per_cpu(idt_desc, cpu).size + 1; + + xen_mc_flush(); + + write_dt_entry(dt, entrynum, low, high); + + if (p >= start && (p + 8) <= end) { + struct trap_info info[2]; + + info[1].address = 0; + + if (cvt_gate_to_trap(entrynum, low, high, &info[0])) + if (HYPERVISOR_set_trap_table(info)) + BUG(); + } +} + +/* Load a new IDT into Xen. In principle this can be per-CPU, so we + hold a spinlock to protect the static traps[] array (static because + it avoids allocation, and saves stack space). */ +static void xen_load_idt(const struct Xgt_desc_struct *desc) +{ + static DEFINE_SPINLOCK(lock); + static struct trap_info traps[257]; + + int cpu = smp_processor_id(); + unsigned in, out, count; + + per_cpu(idt_desc, cpu) = *desc; + + count = desc->size / 8; + BUG_ON(count > 256); + + spin_lock(&lock); + for(in = out = 0; in < count; in++) { + const u32 *entry = (u32 *)(desc->address + in * 8); + + if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + out++; + } + traps[out].address = 0; + + xen_mc_flush(); + if (HYPERVISOR_set_trap_table(traps)) + BUG(); + + spin_unlock(&lock); +} + +/* Write a GDT descriptor entry. Ignore LDT descriptors, since + they're handled differently. */ +static void xen_write_gdt_entry(struct desc_struct *dt, int entry, u32 low, u32 high) +{ + switch ((high >> 8) & 0xff) { + case DESCTYPE_LDT: + case DESCTYPE_TSS: + /* ignore */ + break; + + default: { + xmaddr_t maddr = virt_to_machine(&dt[entry]); + u64 desc = (u64)high << 32 | low; + + xen_mc_flush(); + if (HYPERVISOR_update_descriptor(maddr.maddr, desc)) + BUG(); + } + + } +} + +static void xen_load_esp0(struct tss_struct *tss, + struct thread_struct *thread) +{ + if (xen_get_lazy_mode() != PARAVIRT_LAZY_CPU) { + if (HYPERVISOR_stack_switch(__KERNEL_DS, thread->esp0)) + BUG(); + } else { + struct multicall_space mcs = xen_mc_entry(0); + MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0); + } +} + +static void xen_set_iopl_mask(unsigned mask) +{ +#if 0 + struct physdev_set_iopl set_iopl; + + /* Force the change at ring 0. */ + set_iopl.iopl = (mask == 0) ? 1 : (mask >> 12) & 3; + HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); +#endif +} + +static void xen_io_delay(void) +{ +} + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long xen_apic_read(unsigned long reg) +{ + return 0; +} +#endif + +static void xen_flush_tlb(void) +{ + struct mmuext_op op; + + op.cmd = MMUEXT_TLB_FLUSH_LOCAL; + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +static void xen_flush_tlb_single(unsigned long addr) +{ + struct mmuext_op op; + + op.cmd = MMUEXT_INVLPG_LOCAL; + op.arg1.linear_addr = addr & PAGE_MASK; + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + +static unsigned long xen_read_cr2(void) +{ + return x86_read_percpu(xen_vcpu)->arch.cr2; +} + +static void xen_write_cr4(unsigned long cr4) +{ + /* never allow TSC to be disabled */ + native_write_cr4(cr4 & ~X86_CR4_TSD); +} + +/* + * Page-directory addresses above 4GB do not fit into architectural %cr3. + * When accessing %cr3, or equivalent field in vcpu_guest_context, guests + * must use the following accessor macros to pack/unpack valid MFNs. + */ +#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) +#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) + +static unsigned long xen_read_cr3(void) +{ + return x86_read_percpu(xen_cr3); +} + +static void xen_write_cr3(unsigned long cr3) +{ + if (cr3 == x86_read_percpu(xen_cr3)) { + /* just a simple tlb flush */ + xen_flush_tlb(); + return; + } + + x86_write_percpu(xen_cr3, cr3); + + + { + struct mmuext_op *op; + struct multicall_space mcs = xen_mc_entry(sizeof(*op)); + unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + + op = mcs.args; + op->cmd = MMUEXT_NEW_BASEPTR; + op->arg1.mfn = mfn; + + MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); + + xen_mc_issue(); + } +} + +static void xen_alloc_pt(u32 pfn) +{ + /* XXX pfn isn't necessarily a lowmem page */ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +static void xen_alloc_pd(u32 pfn) +{ + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); +} + +static void xen_release_pd(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void xen_release_pt(u32 pfn) +{ + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); +} + +static void xen_alloc_pd_clone(u32 pfn, u32 clonepfn, + u32 start, u32 count) +{ + xen_alloc_pd(pfn); +} + +static __init void xen_pagetable_setup_start(pgd_t *base) +{ + pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; + + init_mm.pgd = base; + /* + * copy top-level of Xen-supplied pagetable into place. For + * !PAE we can use this as-is, but for PAE it is a stand-in + * while we copy the pmd pages. + */ + memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); + + if (PTRS_PER_PMD > 1) { + int i; + /* + * For PAE, need to allocate new pmds, rather than + * share Xen's, since Xen doesn't like pmd's being + * shared between address spaces. + */ + for(i = 0; i < PTRS_PER_PGD; i++) { + if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { + pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + + memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), + PAGE_SIZE); + + xen_alloc_pd(PFN_DOWN(__pa(pmd))); + + set_pgd(&base[i], __pgd(1 + __pa(pmd))); + } else + pgd_clear(&base[i]); + } + } + + /* make sure zero_page is mapped RO so we can use it in pagetables */ + make_lowmem_page_readonly(empty_zero_page); + make_lowmem_page_readonly(base); + /* + * Switch to new pagetable. This is done before + * pagetable_init has done anything so that the new pages + * added to the table can be prepared properly for Xen. + */ + xen_write_cr3(__pa(base)); +} + +static __init void xen_pagetable_setup_done(pgd_t *base) +{ + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* + * Create a mapping for the shared info page. + * Should be set_fixmap(), but shared_info is a machine + * address with no corresponding pseudo-phys address. + */ + set_pte_mfn(fix_to_virt(FIX_PARAVIRT_BOOTMAP), + PFN_DOWN(xen_start_info->shared_info), + PAGE_KERNEL); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); + + } else + HYPERVISOR_shared_info = + (struct shared_info *)__va(xen_start_info->shared_info); + + xen_pgd_pin(base); + + xen_vcpu_setup(smp_processor_id()); +} + +static const struct paravirt_ops xen_paravirt_ops __initdata = { + .paravirt_enabled = 1, + .shared_kernel_pmd = 0, + + .name = "Xen", + .banner = xen_banner, + + .patch = paravirt_patch_default, + + .memory_setup = xen_memory_setup, + .arch_setup = xen_arch_setup, + .init_IRQ = xen_init_IRQ, + .time_init = xen_time_init, + + .cpuid = xen_cpuid, + + .set_debugreg = xen_set_debugreg, + .get_debugreg = xen_get_debugreg, + + .clts = native_clts, + + .read_cr0 = native_read_cr0, + .write_cr0 = native_write_cr0, + + .read_cr2 = xen_read_cr2, + .write_cr2 = native_write_cr2, + + .read_cr3 = xen_read_cr3, + .write_cr3 = xen_write_cr3, + + .read_cr4 = native_read_cr4, + .read_cr4_safe = native_read_cr4_safe, + .write_cr4 = xen_write_cr4, + + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, + .wbinvd = native_wbinvd, + + .read_msr = native_read_msr_safe, + .write_msr = native_write_msr_safe, + .read_tsc = native_read_tsc, + .read_pmc = native_read_pmc, + + .iret = (void *)&hypercall_page[__HYPERVISOR_iret], + .irq_enable_sysexit = (void *)xen_sti_sysexit, + + .load_tr_desc = paravirt_nop, + .set_ldt = xen_set_ldt, + .load_gdt = xen_load_gdt, + .load_idt = xen_load_idt, + .load_tls = xen_load_tls, + + .store_gdt = native_store_gdt, + .store_idt = native_store_idt, + .store_tr = xen_store_tr, + + .write_ldt_entry = xen_write_ldt_entry, + .write_gdt_entry = xen_write_gdt_entry, + .write_idt_entry = xen_write_idt_entry, + .load_esp0 = xen_load_esp0, + + .set_iopl_mask = xen_set_iopl_mask, + .io_delay = xen_io_delay, + .set_wallclock = xen_set_wallclock, + .get_wallclock = xen_get_wallclock, + .get_cpu_khz = xen_cpu_khz, + .sched_clock = xen_clocksource_read, + +#ifdef CONFIG_X86_LOCAL_APIC + .apic_write = paravirt_nop, + .apic_write_atomic = paravirt_nop, + .apic_read = xen_apic_read, + .setup_boot_clock = paravirt_nop, + .setup_secondary_clock = paravirt_nop, + .startup_ipi_hook = paravirt_nop, +#endif + + .flush_tlb_user = xen_flush_tlb, + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_single = xen_flush_tlb_single, + + .pte_update = paravirt_nop, + .pte_update_defer = paravirt_nop, + + .pagetable_setup_start = xen_pagetable_setup_start, + .pagetable_setup_done = xen_pagetable_setup_done, + .activate_mm = xen_activate_mm, + .dup_mmap = xen_dup_mmap, + .exit_mmap = xen_exit_mmap, + + .set_pte = xen_set_pte, + .set_pte_at = xen_set_pte_at, + .set_pmd = xen_set_pmd, + + .alloc_pt = xen_alloc_pt, + .alloc_pd = xen_alloc_pd, + .alloc_pd_clone = xen_alloc_pd_clone, + .release_pd = xen_release_pd, + .release_pt = xen_release_pt, + + .pte_val = xen_pte_val, + .pgd_val = xen_pgd_val, + + .make_pte = xen_make_pte, + .make_pgd = xen_make_pgd, + +#ifdef CONFIG_X86_PAE + .set_pte_atomic = xen_set_pte_atomic, + .set_pte_present = xen_set_pte_at, + .set_pud = xen_set_pud, + .pte_clear = xen_pte_clear, + .pmd_clear = xen_pmd_clear, + + .make_pmd = xen_make_pmd, + .pmd_val = xen_pmd_val, +#endif /* PAE */ + + .set_lazy_mode = xen_set_lazy_mode, +}; + +/* First C function to be called on Xen boot */ +static asmlinkage void __init xen_start_kernel(void) +{ + pgd_t *pgd; + + if (!xen_start_info) + return; + + BUG_ON(memcmp(xen_start_info->magic, "xen-3.0", 7) != 0); + + /* Install Xen paravirt ops */ + paravirt_ops = xen_paravirt_ops; + + xen_setup_features(); + + /* Get mfn list */ + if (!xen_feature(XENFEAT_auto_translated_physmap)) + phys_to_machine_mapping = (unsigned long *)xen_start_info->mfn_list; + + pgd = (pgd_t *)xen_start_info->pt_base; + + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + + init_mm.pgd = pgd; /* use the Xen pagetables to start */ + + /* keep using Xen gdt for now; no urgent need to change it */ + + x86_write_percpu(xen_cr3, __pa(pgd)); + xen_vcpu_setup(0); + + paravirt_ops.kernel_rpl = 1; + if (xen_feature(XENFEAT_supervisor_mode_kernel)) + paravirt_ops.kernel_rpl = 0; + + /* set the limit of our address space */ + reserve_top_address(-HYPERVISOR_VIRT_START + 2 * PAGE_SIZE); + + /* set up basic CPUID stuff */ + cpu_detect(&new_cpu_data); + new_cpu_data.hard_math = 1; + new_cpu_data.x86_capability[0] = cpuid_edx(1); + + /* Poke various useful things into boot_params */ + LOADER_TYPE = (9 << 4) | 0; + INITRD_START = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; + INITRD_SIZE = xen_start_info->mod_len; + + /* Start the world */ + start_kernel(); +} + +paravirt_probe(xen_start_kernel); =================================================================== --- /dev/null +++ b/arch/i386/xen/events.c @@ -0,0 +1,487 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +/* + * This lock protects updates to the following mapping and reference-count + * arrays. The lock does not need to be acquired to read the mapping tables. + */ +static DEFINE_SPINLOCK(irq_mapping_update_lock); + +/* IRQ <-> VIRQ mapping. */ +static DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; + +/* Packed IRQ information: binding type, sub-type index, and event channel. */ +struct packed_irq +{ + unsigned short evtchn; + unsigned char index; + unsigned char type; +}; + +static struct packed_irq irq_info[NR_IRQS]; + +/* Binding types. */ +enum { IRQT_UNBOUND, IRQT_PIRQ, IRQT_VIRQ, IRQT_IPI, IRQT_EVTCHN }; + +/* Convenient shorthand for packed representation of an unbound IRQ. */ +#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) + +static int evtchn_to_irq[NR_EVENT_CHANNELS] = { + [0 ... NR_EVENT_CHANNELS-1] = -1 +}; +static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; +static u8 cpu_evtchn[NR_EVENT_CHANNELS]; + +/* Reference counts for bindings to IRQs. */ +static int irq_bindcount[NR_IRQS]; + +/* Xen will never allocate port zero for any purpose. */ +#define VALID_EVTCHN(chn) ((chn) != 0) + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} +EXPORT_SYMBOL_GPL(force_evtchn_callback); + +static struct irq_chip xen_dynamic_chip; + +/* Constructor for packed IRQ information. */ +static inline struct packed_irq mk_irq_info(u32 type, u32 index, u32 evtchn) +{ + return (struct packed_irq) { evtchn, index, type }; +} + +/* + * Accessors for packed IRQ information. + */ +static inline unsigned int evtchn_from_irq(int irq) +{ + return irq_info[irq].evtchn; +} + +static inline unsigned int index_from_irq(int irq) +{ + return irq_info[irq].index; +} + +static inline unsigned int type_from_irq(int irq) +{ + return irq_info[irq].type; +} + +static inline unsigned long active_evtchns(unsigned int cpu, + struct shared_info *sh, + unsigned int idx) +{ + return (sh->evtchn_pending[idx] & + cpu_evtchn_mask[cpu][idx] & + ~sh->evtchn_mask[idx]); +} + +static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) +{ + int irq = evtchn_to_irq[chn]; + + BUG_ON(irq == -1); +#ifdef CONFIG_SMP + irq_desc[irq].affinity = cpumask_of_cpu(cpu); +#endif + + __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); + __set_bit(chn, cpu_evtchn_mask[cpu]); + + cpu_evtchn[chn] = cpu; +} + +static void init_evtchn_cpu_bindings(void) +{ +#ifdef CONFIG_SMP + int i; + /* By default all event channels notify CPU#0. */ + for (i = 0; i < NR_IRQS; i++) + irq_desc[i].affinity = cpumask_of_cpu(0); +#endif + + memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); + memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); +} + +static inline unsigned int cpu_from_evtchn(unsigned int evtchn) +{ + return cpu_evtchn[evtchn]; +} + +static inline void clear_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_clear_bit(port, &s->evtchn_pending[0]); +} + +static inline void set_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_pending[0]); +} + + +/** + * notify_remote_via_irq - send event to remote end of event channel via irq + * @irq: irq of event channel to send event to + * + * Unlike notify_remote_via_evtchn(), this is safe to use across + * save/restore. Notifications on a broken connection are silently + * dropped. + */ +void notify_remote_via_irq(int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + notify_remote_via_evtchn(evtchn); +} +EXPORT_SYMBOL_GPL(notify_remote_via_irq); + +static void mask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + sync_set_bit(port, &s->evtchn_mask[0]); +} + +static void unmask_evtchn(int port) +{ + struct shared_info *s = HYPERVISOR_shared_info; + unsigned int cpu = get_cpu(); + + BUG_ON(!irqs_disabled()); + + /* Slow path (hypercall) if this is a non-local port. */ + if (unlikely(cpu != cpu_from_evtchn(port))) { + struct evtchn_unmask unmask = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); + } else { + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + + sync_clear_bit(port, &s->evtchn_mask[0]); + + /* + * The following is basically the equivalent of + * 'hw_resend_irq'. Just like a real IO-APIC we 'lose + * the interrupt edge' if the channel is masked. + */ + if (sync_test_bit(port, &s->evtchn_pending[0]) && + !sync_test_and_set_bit(port / BITS_PER_LONG, + &vcpu_info->evtchn_pending_sel)) + vcpu_info->evtchn_upcall_pending = 1; + } + + put_cpu(); +} + +static int find_unbound_irq(void) +{ + int irq; + + /* Only allocate from dynirq range */ + for (irq = 0; irq < NR_IRQS; irq++) + if (irq_bindcount[irq] == 0) + break; + + if (irq == NR_IRQS) + panic("No available IRQ to bind to: increase NR_IRQS!\n"); + + return irq; +} + +static int bind_evtchn_to_irq(unsigned int evtchn) +{ + int irq; + + spin_lock(&irq_mapping_update_lock); + + irq = evtchn_to_irq[evtchn]; + + if (irq == -1) { + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "event"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_EVTCHN, 0, evtchn); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_bind_virq bind_virq; + int evtchn, irq; + + spin_lock(&irq_mapping_update_lock); + + irq = per_cpu(virq_to_irq, cpu)[virq]; + + if (irq == -1) { + bind_virq.virq = virq; + bind_virq.vcpu = cpu; + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq) != 0) + BUG(); + evtchn = bind_virq.port; + + irq = find_unbound_irq(); + + dynamic_irq_init(irq); + set_irq_chip_and_handler_name(irq, &xen_dynamic_chip, + handle_level_irq, "virq"); + + evtchn_to_irq[evtchn] = irq; + irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); + + per_cpu(virq_to_irq, cpu)[virq] = irq; + + bind_evtchn_to_cpu(evtchn, cpu); + } + + irq_bindcount[irq]++; + + spin_unlock(&irq_mapping_update_lock); + + return irq; +} + +static void unbind_from_irq(unsigned int irq) +{ + struct evtchn_close close; + int evtchn = evtchn_from_irq(irq); + + spin_lock(&irq_mapping_update_lock); + + if (VALID_EVTCHN(evtchn) && (--irq_bindcount[irq] == 0)) { + close.port = evtchn; + if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) + BUG(); + + switch (type_from_irq(irq)) { + case IRQT_VIRQ: + per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) + [index_from_irq(irq)] = -1; + break; + default: + break; + } + + /* Closed ports are implicitly re-bound to VCPU0. */ + bind_evtchn_to_cpu(evtchn, 0); + + evtchn_to_irq[evtchn] = -1; + irq_info[irq] = IRQ_UNBOUND; + + dynamic_irq_init(irq); + } + + spin_unlock(&irq_mapping_update_lock); +} + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_evtchn_to_irq(evtchn); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_evtchn_to_irqhandler); + +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id) +{ + unsigned int irq; + int retval; + + irq = bind_virq_to_irq(virq, cpu); + retval = request_irq(irq, handler, irqflags, devname, dev_id); + if (retval != 0) { + unbind_from_irq(irq); + return retval; + } + + return irq; +} +EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); + +void unbind_from_irqhandler(unsigned int irq, void *dev_id) +{ + free_irq(irq, dev_id); + unbind_from_irq(irq); +} +EXPORT_SYMBOL_GPL(unbind_from_irqhandler); + +/* + Search the CPUs pending events bitmasks. For each one found, map + the event number to an irq, and feed it into do_IRQ() for + handling. + + Xen uses a two-level bitmap to speed searching. The first level is + a bitset of words which contain pending event bits. The second + level is a bitset of pending events themselves. +*/ +fastcall void xen_evtchn_do_upcall(struct pt_regs *regs) +{ + int cpu = get_cpu(); + struct shared_info *s = HYPERVISOR_shared_info; + struct vcpu_info *vcpu_info = __get_cpu_var(xen_vcpu); + unsigned long pending_words; + + vcpu_info->evtchn_upcall_pending = 0; + + /* NB. No need for a barrier here -- XCHG is a barrier on x86. */ + pending_words = xchg(&vcpu_info->evtchn_pending_sel, 0); + while (pending_words != 0) { + unsigned long pending_bits; + int word_idx = __ffs(pending_words); + pending_words &= ~(1UL << word_idx); + + while ((pending_bits = active_evtchns(cpu, s, word_idx)) != 0) { + int bit_idx = __ffs(pending_bits); + int port = (word_idx * BITS_PER_LONG) + bit_idx; + int irq = evtchn_to_irq[port]; + + if (irq != -1) { + regs->orig_eax = ~irq; + do_IRQ(regs); + } + } + } + + put_cpu(); +} + +/* Rebind an evtchn so that it gets delivered to a specific cpu */ +static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) +{ + struct evtchn_bind_vcpu bind_vcpu; + int evtchn = evtchn_from_irq(irq); + + if (!VALID_EVTCHN(evtchn)) + return; + + /* Send future instances of this interrupt to other vcpu. */ + bind_vcpu.port = evtchn; + bind_vcpu.vcpu = tcpu; + + /* + * If this fails, it usually just indicates that we're dealing with a + * virq or IPI channel, which don't actually need to be rebound. Ignore + * it, but don't do the xenlinux-level rebind in that case. + */ + if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) + bind_evtchn_to_cpu(evtchn, tcpu); +} + + +static void set_affinity_irq(unsigned irq, cpumask_t dest) +{ + unsigned tcpu = first_cpu(dest); + rebind_irq_to_cpu(irq, tcpu); +} + +static void enable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + unmask_evtchn(evtchn); +} + +static void disable_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + if (VALID_EVTCHN(evtchn)) + mask_evtchn(evtchn); +} + +static void ack_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + + move_native_irq(irq); + + if (VALID_EVTCHN(evtchn)) + clear_evtchn(evtchn); +} + +static int retrigger_dynirq(unsigned int irq) +{ + int evtchn = evtchn_from_irq(irq); + int ret = 0; + + if (VALID_EVTCHN(evtchn)) { + set_evtchn(evtchn); + ret = 1; + } + + return ret; +} + +static struct irq_chip xen_dynamic_chip __read_mostly = { + .name = "xen-dyn", + .mask = disable_dynirq, + .unmask = enable_dynirq, + .ack = ack_dynirq, + .set_affinity = set_affinity_irq, + .retrigger = retrigger_dynirq, +}; + +void __init xen_init_IRQ(void) +{ + int i; + + init_evtchn_cpu_bindings(); + + /* No event channels are 'live' right now. */ + for (i = 0; i < NR_EVENT_CHANNELS; i++) + mask_evtchn(i); + + /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ + for (i = 0; i < NR_IRQS; i++) + irq_bindcount[i] = 0; + + irq_ctx_init(smp_processor_id()); +} =================================================================== --- /dev/null +++ b/arch/i386/xen/features.c @@ -0,0 +1,29 @@ +/****************************************************************************** + * features.c + * + * Xen feature flags. + * + * Copyright (c) 2006, Ian Campbell, XenSource Inc. + */ +#include +#include +#include +#include +#include + +u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; +EXPORT_SYMBOL_GPL(xen_features); + +void xen_setup_features(void) +{ + struct xen_feature_info fi; + int i, j; + + for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { + fi.submap_idx = i; + if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) + break; + for (j=0; j<32; j++) + xen_features[i*32+j] = !!(fi.submap & 1< + +#include +#include +#include + +#include +#include + +#include +#include + +#include "mmu.h" + +xmaddr_t arbitrary_virt_to_machine(unsigned long address) +{ + pte_t *pte = lookup_address(address); + unsigned offset = address & PAGE_MASK; + + BUG_ON(pte == NULL); + + return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); +} + +void make_lowmem_page_readonly(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_wrprotect(*pte); + + if(HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + +void make_lowmem_page_readwrite(void *vaddr) +{ + pte_t *pte, ptev; + unsigned long address = (unsigned long)vaddr; + + pte = lookup_address(address); + BUG_ON(pte == NULL); + + ptev = pte_mkwrite(*pte); + + if(HYPERVISOR_update_va_mapping(address, ptev, 0)) + BUG(); +} + + +void xen_set_pte(pte_t *ptep, pte_t pte) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptep).maddr; + u.val = pte_val_ma(pte); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +void xen_set_pmd(pmd_t *ptr, pmd_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pmd_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +#ifdef CONFIG_X86_PAE +void xen_set_pud(pud_t *ptr, pud_t val) +{ + struct mmu_update u; + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pud_val_ma(val); + if (HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF) < 0) + BUG(); +} +#endif + +/* + * Associate a virtual page frame with a given physical page frame + * and protection flags for that frame. + */ +void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = swapper_pg_dir + pgd_index(vaddr); + if (pgd_none(*pgd)) { + BUG(); + return; + } + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + BUG(); + return; + } + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + BUG(); + return; + } + pte = pte_offset_kernel(pmd, vaddr); + /* stored as-is, to permit clearing entries */ + xen_set_pte(pte, mfn_pte(mfn, flags)); + + /* + * It's enough to flush this one mapping. + * (PGE mappings get flushed as well) + */ + __flush_tlb_one(vaddr); +} + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + if ((mm != current->mm && mm != &init_mm) || + HYPERVISOR_update_va_mapping(addr, pteval, 0) != 0) + xen_set_pte(ptep, pteval); +} + +#ifdef CONFIG_X86_PAE +void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +{ + set_64bit((u64 *)ptep, pte_val_ma(pte)); +} + +void xen_pte_clear(struct mm_struct *mm, unsigned long addr,pte_t *ptep) +{ + ptep->pte_low = 0; + smp_wmb(); + ptep->pte_high = 0; +} + +void xen_pmd_clear(pmd_t *pmdp) +{ + xen_set_pmd(pmdp, __pmd(0)); +} + +unsigned long long xen_pte_val(pte_t pte) +{ + unsigned long long ret = 0; + + if (pte.pte_low) { + ret = ((unsigned long long)pte.pte_high << 32) | pte.pte_low; + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + } + + return ret; +} + +unsigned long long xen_pmd_val(pmd_t pmd) +{ + unsigned long long ret = pmd.pmd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +unsigned long long xen_pgd_val(pgd_t pgd) +{ + unsigned long long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long long pte) +{ + if (pte & 1) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte, pte >> 32 }; +} + +pmd_t xen_make_pmd(unsigned long long pmd) +{ + if (pmd & 1) + pmd = phys_to_machine(XPADDR(pmd)).maddr; + + return (pmd_t){ pmd }; +} + +pgd_t xen_make_pgd(unsigned long long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#else /* !PAE */ +unsigned long xen_pte_val(pte_t pte) +{ + unsigned long ret = pte.pte_low; + + if (ret & _PAGE_PRESENT) + ret = machine_to_phys(XMADDR(ret)).paddr; + + return ret; +} + +unsigned long xen_pmd_val(pmd_t pmd) +{ + BUG(); + return 0; +} + +unsigned long xen_pgd_val(pgd_t pgd) +{ + unsigned long ret = pgd.pgd; + if (ret) + ret = machine_to_phys(XMADDR(ret)).paddr | 1; + return ret; +} + +pte_t xen_make_pte(unsigned long pte) +{ + if (pte & _PAGE_PRESENT) + pte = phys_to_machine(XPADDR(pte)).maddr; + + return (pte_t){ pte }; +} + +pmd_t xen_make_pmd(unsigned long pmd) +{ + BUG(); + return __pmd(0); +} + +pgd_t xen_make_pgd(unsigned long pgd) +{ + if (pgd & _PAGE_PRESENT) + pgd = phys_to_machine(XPADDR(pgd)).maddr; + + return (pgd_t){ pgd }; +} +#endif /* CONFIG_X86_PAE */ + + + +static void pgd_walk_set_prot(void *pt, pgprot_t flags) +{ + unsigned long pfn = PFN_DOWN(__pa(pt)); + + if (HYPERVISOR_update_va_mapping((unsigned long)pt, + pfn_pte(pfn, flags), 0) < 0) + BUG(); +} + +static void pgd_walk(pgd_t *pgd_base, pgprot_t flags) +{ + pgd_t *pgd = pgd_base; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int g, u, m; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return; + + for (g = 0; g < USER_PTRS_PER_PGD; g++, pgd++) { + if (pgd_none(*pgd)) + continue; + pud = pud_offset(pgd, 0); + + if (PTRS_PER_PUD > 1) /* not folded */ + pgd_walk_set_prot(pud,flags); + + for (u = 0; u < PTRS_PER_PUD; u++, pud++) { + if (pud_none(*pud)) + continue; + pmd = pmd_offset(pud, 0); + + if (PTRS_PER_PMD > 1) /* not folded */ + pgd_walk_set_prot(pmd,flags); + + for (m = 0; m < PTRS_PER_PMD; m++, pmd++) { + if (pmd_none(*pmd)) + continue; + + /* This can get called before mem_map + is set up, so we assume nothing is + highmem at that point. */ + if (mem_map == NULL || + !PageHighMem(pmd_page(*pmd))) { + pte = pte_offset_kernel(pmd,0); + pgd_walk_set_prot(pte,flags); + } + } + } + } + + if (HYPERVISOR_update_va_mapping((unsigned long)pgd_base, + pfn_pte(PFN_DOWN(__pa(pgd_base)), + flags), + UVMF_TLB_FLUSH) < 0) + BUG(); +} + + +/* This is called just after a mm has been duplicated from its parent, + but it has not been used yet. We need to make sure that its + pagetable is all read-only, and can be pinned. */ +void xen_pgd_pin(pgd_t *pgd) +{ + struct mmuext_op op; + + pgd_walk(pgd, PAGE_KERNEL_RO); + +#if defined(CONFIG_X86_PAE) + op.cmd = MMUEXT_PIN_L3_TABLE; +#else + op.cmd = MMUEXT_PIN_L2_TABLE; +#endif + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + BUG(); +} + +/* Release a pagetables pages back as normal RW */ +void xen_pgd_unpin(pgd_t *pgd) +{ + struct mmuext_op op; + + op.cmd = MMUEXT_UNPIN_TABLE; + op.arg1.mfn = pfn_to_mfn(PFN_DOWN(__pa(pgd))); + + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0) + BUG(); + + pgd_walk(pgd, PAGE_KERNEL); +} + + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +{ + xen_pgd_pin(next->pgd); +} + +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +{ + xen_pgd_pin(mm->pgd); +} + +void xen_exit_mmap(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + + /* + * We aggressively remove defunct pgd from cr3. We execute unmap_vmas() + * *much* faster this way, as no tlb flushes means bigger wrpt batches. + */ + if (tsk->active_mm == mm) { + tsk->active_mm = &init_mm; + atomic_inc(&init_mm.mm_count); + + switch_mm(mm, &init_mm, tsk); + + atomic_dec(&mm->mm_count); + BUG_ON(atomic_read(&mm->mm_count) == 0); + } + + task_unlock(tsk); + + xen_pgd_unpin(mm->pgd); +} =================================================================== --- /dev/null +++ b/arch/i386/xen/mmu.h @@ -0,0 +1,47 @@ +#ifndef _XEN_MMU_H + +#include +#include + +void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +void xen_set_pte(pte_t *ptep, pte_t pteval); +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); + +void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); +void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); +void xen_exit_mmap(struct mm_struct *mm); + +void xen_pgd_pin(pgd_t *pgd); +void xen_pgd_unpin(pgd_t *pgd); + +#ifdef CONFIG_X86_PAE +unsigned long long xen_pte_val(pte_t); +unsigned long long xen_pmd_val(pmd_t); +unsigned long long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long long); +pmd_t xen_make_pmd(unsigned long long); +pgd_t xen_make_pgd(unsigned long long); + +void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval); +void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_set_pud(pud_t *ptr, pud_t val); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr,pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); + + +#else +unsigned long xen_pte_val(pte_t); +unsigned long xen_pmd_val(pmd_t); +unsigned long xen_pgd_val(pgd_t); + +pte_t xen_make_pte(unsigned long); +pmd_t xen_make_pmd(unsigned long); +pgd_t xen_make_pgd(unsigned long); +#endif + +#endif /* _XEN_MMU_H */ =================================================================== --- /dev/null +++ b/arch/i386/xen/multicalls.c @@ -0,0 +1,61 @@ +#include + +#include + +#include "multicalls.h" + +#define MC_BATCH 8 +#define MC_ARGS (MC_BATCH * 32 / sizeof(u64)) + +struct mc_buffer { + struct multicall_entry entries[MC_BATCH]; + u64 args[MC_ARGS]; + unsigned mcidx, argidx; +}; + +static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); + +void xen_mc_flush(void) +{ + struct mc_buffer *b = &get_cpu_var(mc_buffer); + int ret = 0; + + if (b->mcidx) { + int i; + + if (HYPERVISOR_multicall(b->entries, b->mcidx) != 0) + BUG(); + for(i = 0; i < b->mcidx; i++) + if (b->entries[i].result < 0) + ret++; + b->mcidx = 0; + b->argidx = 0; + } else + BUG_ON(b->argidx != 0); + + put_cpu_var(mc_buffer); + + BUG_ON(ret); +} + +struct multicall_space xen_mc_entry(size_t args) +{ + struct mc_buffer *b = &get_cpu_var(mc_buffer); + struct multicall_space ret; + unsigned argspace = (args + sizeof(u64) - 1) / sizeof(u64); + + BUG_ON(argspace > MC_ARGS); + + if (b->mcidx == MC_BATCH || + (b->argidx + argspace) > MC_ARGS) + xen_mc_flush(); + + ret.mc = &b->entries[b->mcidx]; + b->mcidx++; + ret.args = &b->args[b->argidx]; + b->argidx += argspace; + + put_cpu_var(mc_buffer); + + return ret; +} =================================================================== --- /dev/null +++ b/arch/i386/xen/multicalls.h @@ -0,0 +1,26 @@ +#ifndef _XEN_MULTICALLS_H +#define _XEN_MULTICALLS_H + +#include "xen-ops.h" + +/* Multicalls */ +struct multicall_space +{ + struct multicall_entry *mc; + void *args; +}; + +/* Allocate room for a multicall and its args */ +struct multicall_space xen_mc_entry(size_t args); + +/* Flush all pending multicalls */ +void xen_mc_flush(void); + +/* Issue a multicall if we're not in lazy mode */ +static inline void xen_mc_issue(void) +{ + if (xen_get_lazy_mode() == PARAVIRT_LAZY_NONE) + xen_mc_flush(); +} + +#endif /* _XEN_MULTICALLS_H */ =================================================================== --- /dev/null +++ b/arch/i386/xen/setup.c @@ -0,0 +1,93 @@ +/* + * Machine specific setup for xen + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include "xen-ops.h" + +/* These are code, but not functions. Defined in entry.S */ +extern const char xen_hypervisor_callback[]; +extern const char xen_failsafe_callback[]; + +static __initdata struct shared_info init_shared; + +/* + * Point at some empty memory to start with. We map the real shared_info + * page as soon as fixmap is up and running. + */ +struct shared_info *HYPERVISOR_shared_info = &init_shared; + +unsigned long *phys_to_machine_mapping; +EXPORT_SYMBOL(phys_to_machine_mapping); + +/** + * machine_specific_memory_setup - Hook for machine specific memory setup. + **/ + +char * __init xen_memory_setup(void) +{ + unsigned long max_pfn = xen_start_info->nr_pages; + + e820.nr_map = 0; + add_memory_region(0, PFN_PHYS(max_pfn), E820_RAM); + + return "Xen"; +} + +static void xen_idle(void) +{ + local_irq_disable(); + + if (need_resched()) + local_irq_enable(); + else { + current_thread_info()->status &= ~TS_POLLING; + smp_mb__after_clear_bit(); + safe_halt(); + current_thread_info()->status |= TS_POLLING; + } +} + +void __init xen_arch_setup(void) +{ + struct physdevop_set_iopl set_iopl; + int rc; + + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); + + if (!xen_feature(XENFEAT_auto_translated_physmap)) + HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); + + HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, + __KERNEL_CS, (unsigned long)xen_failsafe_callback); + + set_iopl.iopl = 1; + rc = HYPERVISOR_physdev_op(PHYSDEVOP_SET_IOPL, &set_iopl); + if (rc != 0) + printk(KERN_INFO "physdev_op failed %d\n", rc); + +#ifdef CONFIG_ACPI + if (!(xen_start_info->flags & SIF_INITDOMAIN)) { + printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); + disable_acpi(); + } +#endif + + memcpy(boot_command_line, xen_start_info->cmd_line, + MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? + COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); + + pm_idle = xen_idle; +} =================================================================== --- /dev/null +++ b/arch/i386/xen/time.c @@ -0,0 +1,393 @@ +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "xen-ops.h" + +#define XEN_SHIFT 22 +#define TIMER_SLOP 100000 /* Xen may fire a timer up to this many ns early */ + +/* These are perodically updated in shared_info, and then copied here. */ +struct shadow_time_info { + u64 tsc_timestamp; /* TSC at last update of time vals. */ + u64 system_timestamp; /* Time, in nanosecs, since boot. */ + u32 tsc_to_nsec_mul; + int tsc_shift; + u32 version; +}; + +static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); + +unsigned long xen_cpu_khz(void) +{ + u64 cpu_khz = 1000000ULL << 32; + const struct vcpu_time_info *info = + &HYPERVISOR_shared_info->vcpu_info[0].time; + + do_div(cpu_khz, info->tsc_to_system_mul); + if (info->tsc_shift < 0) + cpu_khz <<= -info->tsc_shift; + else + cpu_khz >>= info->tsc_shift; + + return cpu_khz; +} + +/* + * Reads a consistent set of time-base values from Xen, into a shadow data + * area. + */ +static void get_time_values_from_xen(void) +{ + struct vcpu_time_info *src; + struct shadow_time_info *dst; + + preempt_disable(); + + src = &__get_cpu_var(xen_vcpu)->time; + dst = &__get_cpu_var(shadow_time); + + do { + dst->version = src->version; + rmb(); + dst->tsc_timestamp = src->tsc_timestamp; + dst->system_timestamp = src->system_time; + dst->tsc_to_nsec_mul = src->tsc_to_system_mul; + dst->tsc_shift = src->tsc_shift; + rmb(); + } while ((src->version & 1) | (dst->version ^ src->version)); + + preempt_enable(); +} + +/* + * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, + * yielding a 64-bit result. + */ +static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) +{ + u64 product; +#ifdef __i386__ + u32 tmp1, tmp2; +#endif + + if (shift < 0) + delta >>= -shift; + else + delta <<= shift; + +#ifdef __i386__ + __asm__ ( + "mul %5 ; " + "mov %4,%%eax ; " + "mov %%edx,%4 ; " + "mul %5 ; " + "xor %5,%5 ; " + "add %4,%%eax ; " + "adc %5,%%edx ; " + : "=A" (product), "=r" (tmp1), "=r" (tmp2) + : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) ); +#elif __x86_64__ + __asm__ ( + "mul %%rdx ; shrd $32,%%rdx,%%rax" + : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) ); +#else +#error implement me! +#endif + + return product; +} + +static u64 get_nsec_offset(struct shadow_time_info *shadow) +{ + u64 now, delta; + rdtscll(now); + delta = now - shadow->tsc_timestamp; + return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); +} + +cycle_t xen_clocksource_read(void) +{ + struct shadow_time_info *shadow = &get_cpu_var(shadow_time); + cycle_t ret; + + get_time_values_from_xen(); + + ret = shadow->system_timestamp + get_nsec_offset(shadow); + + put_cpu_var(shadow_time); + + return ret; +} + +static void xen_read_wallclock(struct timespec *ts) +{ + const struct shared_info *s = HYPERVISOR_shared_info; + u32 version; + u64 delta; + struct timespec now; + + /* get wallclock at system boot */ + do { + version = s->wc_version; + rmb(); + now.tv_sec = s->wc_sec; + now.tv_nsec = s->wc_nsec; + rmb(); + } while ((s->wc_version & 1) | (version ^ s->wc_version)); + + delta = xen_clocksource_read(); /* time since system boot */ + delta += now.tv_sec * (u64)NSEC_PER_SEC + now.tv_nsec; + + now.tv_nsec = do_div(delta, NSEC_PER_SEC); + now.tv_sec = delta; + + set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); +} + +unsigned long xen_get_wallclock(void) +{ + struct timespec ts; + + xen_read_wallclock(&ts); + + return ts.tv_sec; +} + +int xen_set_wallclock(unsigned long now) +{ + /* do nothing for domU */ + return -1; +} + +static struct clocksource xen_clocksource __read_mostly = { + .name = "xen", + .rating = 400, + .read = xen_clocksource_read, + .mask = ~0, + .mult = 1<mode != CLOCK_EVT_MODE_ONESHOT); + + if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0) + BUG(); + + /* We may have missed the deadline, but there's no real way of + knowing for sure. If the event was in the past, then we'll + get an immediate interrupt. */ + + return 0; +} + +static const struct clock_event_device xen_timerop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_timerop_set_mode, + .set_next_event = xen_timerop_set_next_event, +}; + + + +static void xen_vcpuop_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + + switch(mode) { + case CLOCK_EVT_MODE_PERIODIC: + WARN_ON(1); /* unsupported */ + break; + + case CLOCK_EVT_MODE_ONESHOT: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, cpu, NULL) || + HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL)) + BUG(); + break; + } +} + +static int xen_vcpuop_set_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + int cpu = smp_processor_id(); + struct vcpu_set_singleshot_timer single; + int ret; + + WARN_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT); + + single.timeout_abs_ns = get_abs_timeout(delta); + single.flags = VCPU_SSHOTTMR_future; + + ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, cpu, &single); + + BUG_ON(ret != 0 && ret != -ETIME); + + return ret; +} + +static const struct clock_event_device xen_vcpuop_clockevent = { + .name = "xen", + .features = CLOCK_EVT_FEAT_ONESHOT, + + .max_delta_ns = 0xffffffff, + .min_delta_ns = TIMER_SLOP, + + .mult = 1, + .shift = 0, + .rating = 500, + + .set_mode = xen_vcpuop_set_mode, + .set_next_event = xen_vcpuop_set_next_event, +}; + +static const struct clock_event_device *xen_clockevent = + &xen_timerop_clockevent; +static DEFINE_PER_CPU(struct clock_event_device, xen_clock_events); + +static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) +{ + struct clock_event_device *evt = &__get_cpu_var(xen_clock_events); + irqreturn_t ret; + + ret = IRQ_NONE; + if (evt->event_handler) { + evt->event_handler(evt); + ret = IRQ_HANDLED; + } + + return ret; +} + +static void xen_setup_timer(int cpu) +{ + const char *name; + struct clock_event_device *evt; + int irq; + + printk("installing Xen timer for CPU %d\n", cpu); + + name = kasprintf(GFP_KERNEL, "timer%d", cpu); + if (!name) + name = ""; + + irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, NULL); + + evt = &get_cpu_var(xen_clock_events); + memcpy(evt, xen_clockevent, sizeof(*evt)); + + evt->cpumask = cpumask_of_cpu(cpu); + evt->irq = irq; + clockevents_register_device(evt); + + put_cpu_var(xen_clock_events); +} + +__init void xen_time_init(void) +{ + int cpu = smp_processor_id(); + + get_time_values_from_xen(); + + clocksource_register(&xen_clocksource); + + if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { + /* Successfully turned off 100hz tick, so we have the + vcpuop-based timer interface */ + printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); + xen_clockevent = &xen_vcpuop_clockevent; + } + + /* Set initial system time with full resolution */ + xen_read_wallclock(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + + tsc_disable = 0; + + xen_setup_timer(cpu); +} =================================================================== --- /dev/null +++ b/arch/i386/xen/xen-head.S @@ -0,0 +1,34 @@ +/* Xen-specific pieces of head.S, intended to be included in the right + place in head.S */ + +#ifdef CONFIG_XEN + +#include +#include +#include + +ENTRY(startup_xen) + movl %esi,xen_start_info + jmp startup_paravirt + +.pushsection ".bss.page_aligned" + .align PAGE_SIZE_asm +ENTRY(hypercall_page) + .skip 0x1000 +.popsection + + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz, "linux") + ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz, "2.6") + ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz, "xen-3.0") + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long, __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long, startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long, hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz, "!writable_page_tables|pae_pgdir_above_4gb") +#ifdef CONFIG_X86_PAE + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "yes") +#else + ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz, "no") +#endif + ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz, "generic") + +#endif /*CONFIG_XEN */ =================================================================== --- /dev/null +++ b/arch/i386/xen/xen-ops.h @@ -0,0 +1,34 @@ +#ifndef XEN_OPS_H +#define XEN_OPS_H + +#include +#include + +DECLARE_PER_CPU(struct vcpu_info *, xen_vcpu); +DECLARE_PER_CPU(unsigned long, xen_cr3); + +extern struct start_info *xen_start_info; +extern struct shared_info *HYPERVISOR_shared_info; + +char * __init xen_memory_setup(void); +void __init xen_arch_setup(void); +void __init xen_init_IRQ(void); + +unsigned long xen_cpu_khz(void); +void __init xen_time_init(void); +unsigned long xen_get_wallclock(void); +int xen_set_wallclock(unsigned long time); +cycle_t xen_clocksource_read(void); + +DECLARE_PER_CPU(enum paravirt_lazy_mode, xen_lazy_mode); + +static inline unsigned xen_get_lazy_mode(void) +{ + unsigned ret = get_cpu_var(xen_lazy_mode); + put_cpu_var(xen_lazy_mode); + + return ret; +} + + +#endif /* XEN_OPS_H */ =================================================================== --- a/include/asm-i386/irq.h +++ b/include/asm-i386/irq.h @@ -41,6 +41,7 @@ extern void fixup_irqs(cpumask_t map); extern void fixup_irqs(cpumask_t map); #endif +unsigned int do_IRQ(struct pt_regs *regs); void init_IRQ(void); void __init native_init_IRQ(void); =================================================================== --- a/include/asm-i386/xen/hypercall.h +++ b/include/asm-i386/xen/hypercall.h @@ -410,4 +410,22 @@ MULTI_mmuext_op(struct multicall_entry * mcl->args[2] = (unsigned long)success_count; mcl->args[3] = domid; } + +static inline void +MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries) +{ + mcl->op = __HYPERVISOR_set_gdt; + mcl->args[0] = (unsigned long)frames; + mcl->args[1] = entries; +} + +static inline void +MULTI_stack_switch(struct multicall_entry *mcl, + unsigned long ss, unsigned long esp) +{ + mcl->op = __HYPERVISOR_stack_switch; + mcl->args[0] = ss; + mcl->args[1] = esp; +} + #endif /* __HYPERCALL_H__ */ =================================================================== --- /dev/null +++ b/include/xen/events.h @@ -0,0 +1,28 @@ +#ifndef _XEN_EVENTS_H +#define _XEN_EVENTS_H + +#include + +int bind_evtchn_to_irqhandler(unsigned int evtchn, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, + void *dev_id); +int bind_virq_to_irqhandler(unsigned int virq, unsigned int cpu, + irqreturn_t (*handler)(int, void *), + unsigned long irqflags, const char *devname, void *dev_id); + +/* + * Common unbind function for all event sources. Takes IRQ to unbind from. + * Automatically closes the underlying event channel (even for bindings + * made with bind_evtchn_to_irqhandler()). + */ +void unbind_from_irqhandler(unsigned int irq, void *dev_id); + +static inline void notify_remote_via_evtchn(int port) +{ + struct evtchn_send send = { .port = port }; + (void)HYPERVISOR_event_channel_op(EVTCHNOP_send, &send); +} + +extern void notify_remote_via_irq(int irq); +#endif /* _XEN_EVENTS_H */ =================================================================== --- /dev/null +++ b/include/xen/features.h @@ -0,0 +1,26 @@ +/****************************************************************************** + * features.h + * + * Query the features reported by Xen. + * + * Copyright (c) 2006, Ian Campbell + */ + +#ifndef __XEN_FEATURES_H__ +#define __XEN_FEATURES_H__ + +#include + +void xen_setup_features(void); + +extern u8 xen_features[XENFEAT_NR_SUBMAPS * 32]; + +static inline int xen_feature(int flag) +{ + switch(flag) { + } + + return xen_features[flag]; +} + +#endif /* __ASM_XEN_FEATURES_H__ */ =================================================================== --- /dev/null +++ b/include/xen/page.h @@ -0,0 +1,178 @@ +#ifndef __XEN_PAGE_H +#define __XEN_PAGE_H + +#include + +#include + +#include + +#ifdef CONFIG_X86_PAE +/* Xen machine address */ +typedef struct xmaddr { + unsigned long long maddr; +} xmaddr_t; + +/* Xen pseudo-physical address */ +typedef struct xpaddr { + unsigned long long paddr; +} xpaddr_t; +#else +/* Xen machine address */ +typedef struct xmaddr { + unsigned long maddr; +} xmaddr_t; + +/* Xen pseudo-physical address */ +typedef struct xpaddr { + unsigned long paddr; +} xpaddr_t; +#endif + +#define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) +#define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) + +/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ +#define INVALID_P2M_ENTRY (~0UL) +#define FOREIGN_FRAME_BIT (1UL<<31) +#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) + +extern unsigned long *phys_to_machine_mapping; + +static inline unsigned long pfn_to_mfn(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return pfn; + + return phys_to_machine_mapping[(unsigned int)(pfn)] & + ~FOREIGN_FRAME_BIT; +} + +static inline int phys_to_machine_mapping_valid(unsigned long pfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) + return 1; + + return (phys_to_machine_mapping[pfn] != INVALID_P2M_ENTRY); +} + +static inline unsigned long mfn_to_pfn(unsigned long mfn) +{ + unsigned long pfn; + + if (xen_feature(XENFEAT_auto_translated_physmap)) + return mfn; + +#if 0 + if (unlikely((mfn >> machine_to_phys_order) != 0)) + return max_mapnr; +#endif + + pfn = 0; + /* + * The array access can fail (e.g., device space beyond end of RAM). + * In such cases it doesn't matter what we return (we return garbage), + * but we must handle the fault without crashing! + */ + __get_user(pfn, &machine_to_phys_mapping[mfn]); + + return pfn; +} + +static inline xmaddr_t phys_to_machine(xpaddr_t phys) +{ + unsigned offset = phys.paddr & ~PAGE_MASK; + return XMADDR(PFN_PHYS((u64)pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset); +} + +static inline xpaddr_t machine_to_phys(xmaddr_t machine) +{ + unsigned offset = machine.maddr & ~PAGE_MASK; + return XPADDR(PFN_PHYS((u64)mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset); +} + +/* + * We detect special mappings in one of two ways: + * 1. If the MFN is an I/O page then Xen will set the m2p entry + * to be outside our maximum possible pseudophys range. + * 2. If the MFN belongs to a different domain then we will certainly + * not have MFN in our p2m table. Conversely, if the page is ours, + * then we'll have p2m(m2p(MFN))==MFN. + * If we detect a special mapping then it doesn't have a 'struct page'. + * We force !pfn_valid() by returning an out-of-range pointer. + * + * NB. These checks require that, for any MFN that is not in our reservation, + * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if + * we are foreign-mapping the MFN, and the other domain as m2p(MFN) == PFN. + * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety. + * + * NB2. When deliberately mapping foreign pages into the p2m table, you *must* + * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we + * require. In all the cases we care about, the FOREIGN_FRAME bit is + * masked (e.g., pfn_to_mfn()) so behaviour there is correct. + */ +static inline unsigned long mfn_to_local_pfn(unsigned long mfn) +{ + extern unsigned long max_mapnr; + unsigned long pfn = mfn_to_pfn(mfn); + if ((pfn < max_mapnr) + && !xen_feature(XENFEAT_auto_translated_physmap) + && (phys_to_machine_mapping[pfn] != mfn)) + return max_mapnr; /* force !pfn_valid() */ + return pfn; +} + +static inline void set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + if (xen_feature(XENFEAT_auto_translated_physmap)) { + BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); + return; + } + phys_to_machine_mapping[pfn] = mfn; +} + +/* VIRT <-> MACHINE conversion */ +#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) +#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v)))) +#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) + +#ifdef CONFIG_X86_PAE +#define pte_mfn(_pte) (((_pte).pte_low >> PAGE_SHIFT) |\ + (((_pte).pte_high & 0xfff) << (32-PAGE_SHIFT))) + +static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot) +{ + pte_t pte; + + pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | (pgprot_val(pgprot) >> 32); + pte.pte_high &= (__supported_pte_mask >> 32); + pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)); + pte.pte_low &= __supported_pte_mask; + + return pte; +} + +static inline unsigned long long pte_val_ma(pte_t x) +{ + return ((unsigned long long)x.pte_high << 32) | x.pte_low; +} +#define pmd_val_ma(v) ((v).pmd) +#define pud_val_ma(v) ((v).pgd.pgd) +#define __pte_ma(x) ((pte_t) { .pte_low=(x), .pte_high=(x)>>32 } ) +#define __pmd_ma(x) ((pmd_t) { (x) } ) +#else /* !X86_PAE */ +#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT) +#define mfn_pte(pfn, prot) __pte_ma(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) +#define pte_val_ma(x) ((x).pte_low) +#define pmd_val_ma(v) ((v).pud.pgd.pgd) +#define __pte_ma(x) ((pte_t) { (x) } ) +#endif /* CONFIG_X86_PAE */ + +#define pgd_val_ma(x) ((x).pgd) + + +xmaddr_t arbitrary_virt_to_machine(unsigned long address); +void make_lowmem_page_readonly(void *vaddr); +void make_lowmem_page_readwrite(void *vaddr); + +#endif /* __XEN_PAGE_H */ -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/