[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1171018590.2718.60.camel@localhost.localdomain>
Date: Fri, 09 Feb 2007 21:56:30 +1100
From: Rusty Russell <rusty@...tcorp.com.au>
To: lkml - Kernel Mailing List <linux-kernel@...r.kernel.org>
Cc: Andrew Morton <akpm@...l.org>, Andi Kleen <ak@....de>,
virtualization <virtualization@...ts.osdl.org>
Subject: [PATCH 6b/10] lguest: the host code (lg.ko)
This is the host module (lg.ko) which supports lguest:
arch/i386/lguest/hypervisor.S:
The actual guest <-> host switching code. This is compiled into
a C array, which is mapped to 0xFFC01000 in host and guests.
arch/i386/lguest/core.c:
The core of the hypervisor, which calls into the assembler
code which does this actual switch. Also contains helper
routines.
arch/i386/lguest/hypercalls.c:
The entry point for the 19 hypercalls.
arch/i386/lguest/interrupts_and_traps.c:
Handling of interrupts and traps, except page faults.
arch/i386/lguest/io.c:
I/O from guest to host, and between guests.
arch/i386/lguest/lguest_user.c:
/dev/lguest interface for lguest program to launch/control guests.
arch/i386/lguest/page_tables.c:
Shadow Page table handling: generally we build up the shadow
page tables by converting from guest page tables when a fault occurs.
arch/i386/lguest/segments.c:
Segmentation (GDT) handling: we have to ensure they're trimmed
to avoid guest access to the switching code.
Signed-off-by: Rusty Russell <rusty@...tcorp.com.au>
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/core.c
@@ -0,0 +1,425 @@
+/* World's simplest hypervisor, to test paravirt_ops and show
+ * unbelievers that virtualization is the future. Plus, it's fun! */
+#include <linux/module.h>
+#include <linux/stringify.h>
+#include <linux/stddef.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <asm/lguest.h>
+#include <asm/paravirt.h>
+#include <asm/desc.h>
+#include <asm/pgtable.h>
+#include <asm/uaccess.h>
+#include <asm/poll.h>
+#include <asm/highmem.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+/* This is our hypervisor, compiled from hypervisor.S. */
+static char __initdata hypervisor_blob[] = {
+#include "hypervisor-blob.c"
+};
+
+#define MAX_LGUEST_GUESTS \
+ ((HYPERVISOR_SIZE-sizeof(hypervisor_blob))/sizeof(struct lguest_state))
+
+static struct vm_struct *hypervisor_vma;
+static int cpu_had_pge;
+static struct {
+ unsigned long offset;
+ unsigned short segment;
+} lguest_entry;
+struct page *hype_pages; /* Contiguous pages. */
+struct lguest lguests[MAX_LGUEST_GUESTS];
+DECLARE_MUTEX(lguest_lock);
+
+/* IDT entries are at start of hypervisor. */
+const unsigned long *__lguest_default_idt_entries(void)
+{
+ return (void *)HYPE_ADDR;
+}
+
+/* Next is switch_to_guest */
+static void *__lguest_switch_to_guest(void)
+{
+ return (void *)HYPE_ADDR + HYPE_DATA_SIZE;
+}
+
+/* Then we use everything else to hold guest state. */
+struct lguest_state *__lguest_states(void)
+{
+ return (void *)HYPE_ADDR + sizeof(hypervisor_blob);
+}
+
+static __init int map_hypervisor(void)
+{
+ unsigned int i;
+ int err;
+ struct page *pages[HYPERVISOR_PAGES], **pagep = pages;
+
+ hype_pages = alloc_pages(GFP_KERNEL|__GFP_ZERO,
+ get_order(HYPERVISOR_SIZE));
+ if (!hype_pages)
+ return -ENOMEM;
+
+ hypervisor_vma = __get_vm_area(HYPERVISOR_SIZE, VM_ALLOC,
+ HYPE_ADDR, VMALLOC_END);
+ if (!hypervisor_vma) {
+ err = -ENOMEM;
+ printk("lguest: could not map hypervisor pages high\n");
+ goto free_pages;
+ }
+
+ for (i = 0; i < HYPERVISOR_PAGES; i++)
+ pages[i] = hype_pages + i;
+
+ err = map_vm_area(hypervisor_vma, PAGE_KERNEL, &pagep);
+ if (err) {
+ printk("lguest: map_vm_area failed: %i\n", err);
+ goto free_vma;
+ }
+ memcpy(hypervisor_vma->addr, hypervisor_blob, sizeof(hypervisor_blob));
+
+ /* Setup LGUEST segments on all cpus */
+ for_each_possible_cpu(i) {
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+ }
+
+ /* Initialize entry point into hypervisor. */
+ lguest_entry.offset = (long)__lguest_switch_to_guest();
+ lguest_entry.segment = LGUEST_CS;
+
+ printk("lguest: mapped hypervisor at %p\n", hypervisor_vma->addr);
+ return 0;
+
+free_vma:
+ vunmap(hypervisor_vma->addr);
+free_pages:
+ __free_pages(hype_pages, get_order(HYPERVISOR_SIZE));
+ return err;
+}
+
+static __exit void unmap_hypervisor(void)
+{
+ vunmap(hypervisor_vma->addr);
+ __free_pages(hype_pages, get_order(HYPERVISOR_SIZE));
+}
+
+/* IN/OUT insns: enough to get us past boot-time probing. */
+static int emulate_insn(struct lguest *lg)
+{
+ u8 insn;
+ unsigned int insnlen = 0, in = 0, shift = 0;
+ unsigned long physaddr = guest_pa(lg, lg->state->regs.eip);
+
+ /* This only works for addresses in linear mapping... */
+ if (lg->state->regs.eip < lg->page_offset)
+ return 0;
+ lhread(lg, &insn, physaddr, 1);
+
+ /* Operand size prefix means it's actually for ax. */
+ if (insn == 0x66) {
+ shift = 16;
+ insnlen = 1;
+ lhread(lg, &insn, physaddr + insnlen, 1);
+ }
+
+ switch (insn & 0xFE) {
+ case 0xE4: /* in <next byte>,%al */
+ insnlen += 2;
+ in = 1;
+ break;
+ case 0xEC: /* in (%dx),%al */
+ insnlen += 1;
+ in = 1;
+ break;
+ case 0xE6: /* out %al,<next byte> */
+ insnlen += 2;
+ break;
+ case 0xEE: /* out %al,(%dx) */
+ insnlen += 1;
+ break;
+ default:
+ return 0;
+ }
+
+ if (in) {
+ /* Lower bit tells is whether it's a 16 or 32 bit access */
+ if (insn & 0x1)
+ lg->state->regs.eax = 0xFFFFFFFF;
+ else
+ lg->state->regs.eax |= (0xFFFF << shift);
+ }
+ lg->state->regs.eip += insnlen;
+ return 1;
+}
+
+int find_free_guest(void)
+{
+ unsigned int i;
+ for (i = 0; i < MAX_LGUEST_GUESTS; i++)
+ if (!lguests[i].state)
+ return i;
+ return -1;
+}
+
+int lguest_address_ok(const struct lguest *lg, unsigned long addr)
+{
+ return addr / PAGE_SIZE < lg->pfn_limit;
+}
+
+/* Just like get_user, but don't let guest access lguest binary. */
+u32 lhread_u32(struct lguest *lg, u32 addr)
+{
+ u32 val = 0;
+
+ /* Don't let them access lguest_add */
+ if (!lguest_address_ok(lg, addr)
+ || get_user(val, (u32 __user *)addr) != 0)
+ kill_guest(lg, "bad read address %u", addr);
+ return val;
+}
+
+void lhwrite_u32(struct lguest *lg, u32 addr, u32 val)
+{
+ if (!lguest_address_ok(lg, addr)
+ || put_user(val, (u32 __user *)addr) != 0)
+ kill_guest(lg, "bad write address %u", addr);
+}
+
+void lhread(struct lguest *lg, void *b, u32 addr, unsigned bytes)
+{
+ if (addr + bytes < addr || !lguest_address_ok(lg, addr+bytes)
+ || copy_from_user(b, (void __user *)addr, bytes) != 0) {
+ /* copy_from_user should do this, but as we rely on it... */
+ memset(b, 0, bytes);
+ kill_guest(lg, "bad read address %u len %u", addr, bytes);
+ }
+}
+
+void lhwrite(struct lguest *lg, u32 addr, const void *b, unsigned bytes)
+{
+ if (addr + bytes < addr
+ || !lguest_address_ok(lg, addr+bytes)
+ || copy_to_user((void __user *)addr, b, bytes) != 0)
+ kill_guest(lg, "bad write address %u len %u", addr, bytes);
+}
+
+/* Saves exporting idt_table from kernel */
+static struct desc_struct *get_idt_table(void)
+{
+ struct Xgt_desc_struct idt;
+
+ asm("sidt %0":"=m" (idt));
+ return (void *)idt.address;
+}
+
+extern asmlinkage void math_state_restore(void);
+
+static int usermode(struct lguest_regs *regs)
+{
+ return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
+}
+
+/* Trap page resets this when it reloads gs. */
+static int new_gfp_eip(struct lguest *lg, struct lguest_regs *regs)
+{
+ u32 eip;
+ get_user(eip, &lg->lguest_data->gs_gpf_eip);
+ if (eip == regs->eip)
+ return 0;
+ put_user(regs->eip, &lg->lguest_data->gs_gpf_eip);
+ return 1;
+}
+
+static void set_ts(unsigned int guest_ts)
+{
+ u32 cr0;
+ if (guest_ts) {
+ asm("movl %%cr0,%0":"=r" (cr0));
+ if (!(cr0 & 8))
+ asm("movl %0,%%cr0": :"r" (cr0|8));
+ }
+}
+
+static void run_guest_once(struct lguest *lg)
+{
+ unsigned int clobber;
+
+ /* Put eflags on stack, lcall does rest. */
+ asm volatile("pushf; lcall *lguest_entry"
+ : "=a"(clobber), "=d"(clobber)
+ : "0"(lg->state), "1"(get_idt_table())
+ : "memory");
+}
+
+int run_guest(struct lguest *lg, char *__user user)
+{
+ struct lguest_regs *regs = &lg->state->regs;
+
+ while (!lg->dead) {
+ unsigned int cr2 = 0; /* Damn gcc */
+
+ /* Hypercalls first: we might have been out to userspace */
+ if (do_async_hcalls(lg))
+ goto pending_dma;
+
+ if (regs->trapnum == LGUEST_TRAP_ENTRY) {
+ /* Only do hypercall once. */
+ regs->trapnum = 255;
+ if (hypercall(lg, regs))
+ goto pending_dma;
+ }
+
+ if (signal_pending(current))
+ return -EINTR;
+ maybe_do_interrupt(lg);
+
+ if (lg->dead)
+ break;
+
+ if (lg->halted) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(1);
+ continue;
+ }
+
+ /* Restore limits on TLS segments if in user mode. */
+ if (usermode(regs)) {
+ unsigned int i;
+ for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++)
+ lg->state->gdt_table[GDT_ENTRY_TLS_MIN+i].a
+ |= lg->tls_limits[i];
+ }
+
+ local_irq_disable();
+ map_trap_page(lg);
+
+ /* Host state to be restored after the guest returns. */
+ asm("sidt %0":"=m"(lg->state->host.idt));
+ lg->state->host.gdt = __get_cpu_var(cpu_gdt_descr);
+
+ /* Even if *we* don't want FPU trap, guest might... */
+ set_ts(lg->ts);
+
+ run_guest_once(lg);
+
+ /* Save cr2 now if we page-faulted. */
+ if (regs->trapnum == 14)
+ asm("movl %%cr2,%0" :"=r" (cr2));
+ else if (regs->trapnum == 7)
+ math_state_restore();
+ local_irq_enable();
+
+ switch (regs->trapnum) {
+ case 13: /* We've intercepted a GPF. */
+ if (regs->errcode == 0) {
+ if (emulate_insn(lg))
+ continue;
+
+ /* FIXME: If it's reloading %gs in a loop? */
+ if (usermode(regs) && new_gfp_eip(lg,regs))
+ continue;
+ }
+
+ if (reflect_trap(lg, &lg->gpf_trap, 1))
+ continue;
+ break;
+ case 14: /* We've intercepted a page fault. */
+ if (demand_page(lg, cr2, regs->errcode & 2))
+ continue;
+
+ /* If lguest_data is NULL, this won't hurt. */
+ put_user(cr2, &lg->lguest_data->cr2);
+ if (reflect_trap(lg, &lg->page_trap, 1))
+ continue;
+ kill_guest(lg, "unhandled page fault at %#x"
+ " (eip=%#x, errcode=%#x)",
+ cr2, regs->eip, regs->errcode);
+ break;
+ case 7: /* We've intercepted a Device Not Available fault. */
+ /* If they don't want to know, just absorb it. */
+ if (!lg->ts)
+ continue;
+ if (reflect_trap(lg, &lg->fpu_trap, 0))
+ continue;
+ kill_guest(lg, "unhandled FPU fault at %#x",
+ regs->eip);
+ break;
+ case 32 ... 255: /* Real interrupt, fall thru */
+ cond_resched();
+ case LGUEST_TRAP_ENTRY: /* Handled at top of loop */
+ continue;
+ case 6: /* Invalid opcode before they installed handler */
+ check_bug_kill(lg);
+ }
+ kill_guest(lg,"unhandled trap %i at %#x (err=%i)",
+ regs->trapnum, regs->eip, regs->errcode);
+ }
+ return -ENOENT;
+
+pending_dma:
+ put_user(lg->pending_dma, (unsigned long *)user);
+ put_user(lg->pending_addr, (unsigned long *)user+1);
+ return sizeof(unsigned long)*2;
+}
+
+#define STRUCT_LGUEST_ELEM_SIZE(elem) sizeof(((struct lguest_state *)0)->elem)
+
+static void adjust_pge(void *on)
+{
+ if (on)
+ write_cr4(read_cr4() | X86_CR4_PGE);
+ else
+ write_cr4(read_cr4() & ~X86_CR4_PGE);
+}
+
+static int __init init(void)
+{
+ int err;
+
+ if (paravirt_enabled())
+ return -EPERM;
+
+ err = map_hypervisor();
+ if (err)
+ return err;
+
+ err = init_pagetables(hype_pages);
+ if (err) {
+ unmap_hypervisor();
+ return err;
+ }
+ lguest_io_init();
+
+ err = lguest_device_init();
+ if (err) {
+ free_pagetables();
+ unmap_hypervisor();
+ return err;
+ }
+ if (cpu_has_pge) { /* We have a broader idea of "global". */
+ cpu_had_pge = 1;
+ on_each_cpu(adjust_pge, 0, 0, 1);
+ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ }
+ return 0;
+}
+
+static void __exit fini(void)
+{
+ lguest_device_remove();
+ free_pagetables();
+ unmap_hypervisor();
+ if (cpu_had_pge) {
+ set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
+ on_each_cpu(adjust_pge, (void *)1, 0, 1);
+ }
+}
+
+module_init(init);
+module_exit(fini);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Rusty Russell <rusty@...tcorp.com.au>");
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypercalls.c
@@ -0,0 +1,199 @@
+/* Actual hypercalls, which allow guests to actually do something.
+ Copyright (C) 2006 Rusty Russell IBM Corporation
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/mm.h>
+#include <linux/clocksource.h>
+#include <asm/lguest.h>
+#include <asm/page.h>
+#include <asm/pgtable.h>
+#include <irq_vectors.h>
+#include "lg.h"
+
+static void guest_set_stack(struct lguest *lg,
+ u32 seg, u32 esp, unsigned int pages)
+{
+ /* You cannot have a stack segment with priv level 0. */
+ if ((seg & 0x3) != GUEST_DPL)
+ kill_guest(lg, "bad stack segment %i", seg);
+ if (pages > 2)
+ kill_guest(lg, "bad stack pages %u", pages);
+ lg->state->tss.ss1 = seg;
+ lg->state->tss.esp1 = esp;
+ lg->stack_pages = pages;
+ pin_stack_pages(lg);
+}
+
+/* Return true if DMA to host userspace now pending. */
+static int do_hcall(struct lguest *lg, struct lguest_regs *regs)
+{
+ switch (regs->eax) {
+ case LHCALL_FLUSH_ASYNC:
+ break;
+ case LHCALL_LGUEST_INIT:
+ kill_guest(lg, "already have lguest_data");
+ break;
+ case LHCALL_CRASH: {
+ char msg[128];
+ lhread(lg, msg, regs->edx, sizeof(msg));
+ msg[sizeof(msg)-1] = '\0';
+ kill_guest(lg, "CRASH: %s", msg);
+ break;
+ }
+ case LHCALL_LOAD_GDT:
+ load_guest_gdt(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_NEW_PGTABLE:
+ guest_new_pagetable(lg, regs->edx);
+ break;
+ case LHCALL_FLUSH_TLB:
+ if (regs->edx)
+ guest_pagetable_clear_all(lg);
+ else
+ guest_pagetable_flush_user(lg);
+ break;
+ case LHCALL_LOAD_IDT_ENTRY:
+ load_guest_idt_entry(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_SET_STACK:
+ guest_set_stack(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_TS:
+ lg->ts = regs->edx;
+ break;
+ case LHCALL_TIMER_READ: {
+ u32 now = jiffies;
+ mb();
+ regs->eax = now - lg->last_timer;
+ lg->last_timer = now;
+ break;
+ }
+ case LHCALL_TIMER_START:
+ lg->timer_on = 1;
+ if (regs->edx != HZ)
+ kill_guest(lg, "Bad clock speed %i", regs->edx);
+ lg->last_timer = jiffies;
+ break;
+ case LHCALL_HALT:
+ lg->halted = 1;
+ break;
+ case LHCALL_GET_WALLCLOCK: {
+ struct timeval tv;
+ do_gettimeofday(&tv);
+ regs->eax = tv.tv_sec;
+ break;
+ }
+ case LHCALL_BIND_DMA:
+ regs->eax = bind_dma(lg, regs->edx, regs->ebx,
+ regs->ecx >> 8, regs->ecx & 0xFF);
+ break;
+ case LHCALL_SEND_DMA:
+ return send_dma(lg, regs->edx, regs->ebx);
+ case LHCALL_SET_PTE:
+ guest_set_pte(lg, regs->edx, regs->ebx, regs->ecx);
+ break;
+ case LHCALL_SET_UNKNOWN_PTE:
+ guest_pagetable_clear_all(lg);
+ break;
+ case LHCALL_SET_PUD:
+ guest_set_pud(lg, regs->edx, regs->ebx);
+ break;
+ case LHCALL_LOAD_TLS:
+ guest_load_tls(lg, (struct desc_struct __user*)regs->edx);
+ break;
+ default:
+ kill_guest(lg, "Bad hypercall %i\n", regs->eax);
+ }
+ return 0;
+}
+
+#define log(...) \
+ do { \
+ mm_segment_t oldfs = get_fs(); \
+ char buf[100]; \
+ sprintf(buf, "lguest:" __VA_ARGS__); \
+ set_fs(KERNEL_DS); \
+ sys_write(1, buf, strlen(buf)); \
+ set_fs(oldfs); \
+ } while(0)
+
+/* We always do queued calls before actual hypercall. */
+int do_async_hcalls(struct lguest *lg)
+{
+ unsigned int i, pending;
+ u8 st[LHCALL_RING_SIZE];
+
+ if (!lg->lguest_data)
+ return 0;
+
+ copy_from_user(&st, &lg->lguest_data->hcall_status, sizeof(st));
+ for (i = 0; i < ARRAY_SIZE(st); i++) {
+ struct lguest_regs regs;
+ unsigned int n = lg->next_hcall;
+
+ if (st[n] == 0xFF)
+ break;
+
+ if (++lg->next_hcall == LHCALL_RING_SIZE)
+ lg->next_hcall = 0;
+
+ get_user(regs.eax, &lg->lguest_data->hcalls[n].eax);
+ get_user(regs.edx, &lg->lguest_data->hcalls[n].edx);
+ get_user(regs.ecx, &lg->lguest_data->hcalls[n].ecx);
+ get_user(regs.ebx, &lg->lguest_data->hcalls[n].ebx);
+ pending = do_hcall(lg, ®s);
+ put_user(0xFF, &lg->lguest_data->hcall_status[n]);
+ if (pending)
+ return 1;
+ }
+
+ set_wakeup_process(lg, NULL);
+ return 0;
+}
+
+int hypercall(struct lguest *lg, struct lguest_regs *regs)
+{
+ int pending;
+
+ if (!lg->lguest_data) {
+ if (regs->eax != LHCALL_LGUEST_INIT) {
+ kill_guest(lg, "hypercall %i before LGUEST_INIT",
+ regs->eax);
+ return 0;
+ }
+
+ lg->lguest_data = (struct lguest_data __user *)regs->edx;
+ /* We check here so we can simply copy_to_user/from_user */
+ if (!lguest_address_ok(lg, (long)lg->lguest_data)
+ || !lguest_address_ok(lg, (long)(lg->lguest_data+1))){
+ kill_guest(lg, "bad guest page %p", lg->lguest_data);
+ return 0;
+ }
+ get_user(lg->noirq_start, &lg->lguest_data->noirq_start);
+ get_user(lg->noirq_end, &lg->lguest_data->noirq_end);
+ /* We reserve the top pgd entry. */
+ put_user(4U*1024*1024, &lg->lguest_data->reserve_mem);
+ put_user(lg->guestid, &lg->lguest_data->guestid);
+ put_user(clocksource_khz2mult(tsc_khz, 22),
+ &lg->lguest_data->clock_mult);
+ return 0;
+ }
+ pending = do_hcall(lg, regs);
+ set_wakeup_process(lg, NULL);
+ return pending;
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/hypervisor.S
@@ -0,0 +1,170 @@
+/* This code sits at 0xFFFF1000 to do the low-level guest<->host switch.
+ Layout is: default_idt_entries (1k), then switch_to_guest entry point. */
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include "lg.h"
+
+#define SAVE_REGS \
+ /* Save old guest/host state */ \
+ pushl %es; \
+ pushl %ds; \
+ pushl %fs; \
+ pushl %eax; \
+ pushl %gs; \
+ pushl %ebp; \
+ pushl %edi; \
+ pushl %esi; \
+ pushl %edx; \
+ pushl %ecx; \
+ pushl %ebx; \
+
+.text
+ENTRY(_start) /* ld complains unless _start is defined. */
+/* %eax contains ptr to target guest state, %edx contains host idt. */
+switch_to_guest:
+ pushl %ss
+ SAVE_REGS
+ /* Save old stack, switch to guest's stack. */
+ movl %esp, LGUEST_STATE_host_stackptr(%eax)
+ movl %eax, %esp
+ /* Guest registers will be at: %esp-$LGUEST_STATE_regs */
+ addl $LGUEST_STATE_regs, %esp
+ /* Switch to guest's GDT, IDT. */
+ lgdt LGUEST_STATE_gdt(%eax)
+ lidt LGUEST_STATE_idt(%eax)
+ /* Save page table top. */
+ movl %cr3, %ebx
+ movl %ebx, LGUEST_STATE_host_pgdir(%eax)
+ /* Set host's TSS to available (clear byte 5 bit 2). */
+ movl (LGUEST_STATE_host_gdt+2)(%eax), %ebx
+ andb $0xFD, (GDT_ENTRY_TSS*8 + 5)(%ebx)
+ /* Switch to guest page tables */
+ popl %ebx
+ movl %ebx, %cr3
+ /* Switch to guest's TSS. */
+ movl $(GDT_ENTRY_TSS*8), %ebx
+ ltr %bx
+ /* Restore guest regs */
+ popl %ebx
+ popl %ecx
+ popl %edx
+ popl %esi
+ popl %edi
+ popl %ebp
+ popl %gs
+ /* Now we've loaded gs, neuter the TLS entries down to 1 byte/page */
+ addl $(LGUEST_STATE_gdt_table+GDT_ENTRY_TLS_MIN*8), %eax
+ movw $0,(%eax)
+ movw $0,8(%eax)
+ movw $0,16(%eax)
+ popl %eax
+ popl %fs
+ popl %ds
+ popl %es
+ /* Skip error code and trap number */
+ addl $8, %esp
+ iret
+
+#define SWITCH_TO_HOST \
+ SAVE_REGS; \
+ /* Save old pgdir */ \
+ movl %cr3, %eax; \
+ pushl %eax; \
+ /* Load lguest ds segment for convenience. */ \
+ movl $(LGUEST_DS), %eax; \
+ movl %eax, %ds; \
+ /* Now figure out who we are */ \
+ movl %esp, %eax; \
+ subl $LGUEST_STATE_regs, %eax; \
+ /* Switch to host page tables (GDT, IDT and stack are in host \
+ mem, so need this first) */ \
+ movl LGUEST_STATE_host_pgdir(%eax), %ebx; \
+ movl %ebx, %cr3; \
+ /* Set guest's TSS to available (clear byte 5 bit 2). */ \
+ andb $0xFD, (LGUEST_STATE_gdt_table+GDT_ENTRY_TSS*8+5)(%eax);\
+ /* Switch to host's GDT & IDT. */ \
+ lgdt LGUEST_STATE_host_gdt(%eax); \
+ lidt LGUEST_STATE_host_idt(%eax); \
+ /* Switch to host's stack. */ \
+ movl LGUEST_STATE_host_stackptr(%eax), %esp; \
+ /* Switch to host's TSS */ \
+ movl $(GDT_ENTRY_TSS*8), %eax; \
+ ltr %ax; \
+ /* Restore host regs */ \
+ popl %ebx; \
+ popl %ecx; \
+ popl %edx; \
+ popl %esi; \
+ popl %edi; \
+ popl %ebp; \
+ popl %gs; \
+ popl %eax; \
+ popl %fs; \
+ popl %ds; \
+ popl %es; \
+ popl %ss
+
+/* Return to run_guest_once. */
+return_to_host:
+ SWITCH_TO_HOST
+ iret
+
+deliver_to_host:
+ SWITCH_TO_HOST
+decode_idt_and_jmp:
+ /* Decode IDT and jump to hosts' irq handler. When that does iret, it
+ * will return to run_guest_once. This is a feature. */
+ /* We told gcc we'd clobber edx and eax... */
+ movl LGUEST_STATE_trapnum(%eax), %eax
+ leal (%edx,%eax,8), %eax
+ movzwl (%eax),%edx
+ movl 4(%eax), %eax
+ xorw %ax, %ax
+ orl %eax, %edx
+ jmp *%edx
+
+deliver_to_host_with_errcode:
+ SWITCH_TO_HOST
+ pushl LGUEST_STATE_errcode(%eax)
+ jmp decode_idt_and_jmp
+
+/* Real hardware interrupts are delivered straight to the host. Others
+ cause us to return to run_guest_once so it can decide what to do. Note
+ that some of these are overridden by the guest to deliver directly, and
+ never enter here (see load_guest_idt_entry). */
+.macro IRQ_STUB N TARGET
+ .data; .long 1f; .text; 1:
+ /* Make an error number for most traps, which don't have one. */
+ .if (\N <> 2) && (\N <> 8) && (\N < 10 || \N > 14) && (\N <> 17)
+ pushl $0
+ .endif
+ pushl $\N
+ jmp \TARGET
+ ALIGN
+.endm
+
+.macro IRQ_STUBS FIRST LAST TARGET
+ irq=\FIRST
+ .rept \LAST-\FIRST+1
+ IRQ_STUB irq \TARGET
+ irq=irq+1
+ .endr
+.endm
+
+/* We intercept every interrupt, because we may need to switch back to
+ * host. Unfortunately we can't tell them apart except by entry
+ * point, so we need 256 entry points.
+ */
+irq_stubs:
+.data
+default_idt_entries:
+.text
+ IRQ_STUBS 0 1 return_to_host /* First two traps */
+ IRQ_STUB 2 deliver_to_host_with_errcode /* NMI */
+ IRQ_STUBS 3 31 return_to_host /* Rest of traps */
+ IRQ_STUBS 32 127 deliver_to_host /* Real interrupts */
+ IRQ_STUB 128 return_to_host /* System call (overridden) */
+ IRQ_STUBS 129 255 deliver_to_host /* Other real interrupts */
+
+/* Everything after this is used for the lguest_state structs. */
+ALIGN
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/interrupts_and_traps.c
@@ -0,0 +1,221 @@
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static void push_guest_stack(struct lguest *lg, u32 __user **gstack, u32 val)
+{
+ lhwrite_u32(lg, (u32)--(*gstack), val);
+}
+
+int reflect_trap(struct lguest *lg, const struct host_trap *trap, int has_err)
+{
+ u32 __user *gstack;
+ u32 eflags, ss, irq_enable;
+ struct lguest_regs *regs = &lg->state->regs;
+
+ if (!trap->addr)
+ return 0;
+
+ /* If they want a ring change, we use new stack and push old ss/esp */
+ if ((regs->ss&0x3) != GUEST_DPL) {
+ gstack = (u32 __user *)guest_pa(lg, lg->state->tss.esp1);
+ ss = lg->state->tss.ss1;
+ push_guest_stack(lg, &gstack, regs->ss);
+ push_guest_stack(lg, &gstack, regs->esp);
+ } else {
+ gstack = (u32 __user *)guest_pa(lg, regs->esp);
+ ss = regs->ss;
+ }
+
+ /* We use IF bit in eflags to indicate whether irqs were disabled
+ (it's always 0, since irqs are enabled when guest is running). */
+ eflags = regs->eflags;
+ get_user(irq_enable, &lg->lguest_data->irq_enabled);
+ eflags |= (irq_enable & 512);
+
+ push_guest_stack(lg, &gstack, eflags);
+ push_guest_stack(lg, &gstack, regs->cs);
+ push_guest_stack(lg, &gstack, regs->eip);
+
+ if (has_err)
+ push_guest_stack(lg, &gstack, regs->errcode);
+
+ /* Change the real stack so hypervisor returns to trap handler */
+ regs->ss = ss;
+ regs->esp = (u32)gstack + lg->page_offset;
+ regs->cs = (__KERNEL_CS|GUEST_DPL);
+ regs->eip = trap->addr;
+
+ /* GS will be neutered on way back to guest. */
+ put_user(0, &lg->lguest_data->gs_gpf_eip);
+
+ /* Disable interrupts for an interrupt gate. */
+ if (trap->disable_interrupts)
+ put_user(0, &lg->lguest_data->irq_enabled);
+ return 1;
+}
+
+void maybe_do_interrupt(struct lguest *lg)
+{
+ unsigned int irq;
+ DECLARE_BITMAP(irqs, LGUEST_IRQS);
+
+ if (!lg->lguest_data)
+ return;
+
+ /* If timer has changed, set timer interrupt. */
+ if (lg->timer_on && jiffies != lg->last_timer)
+ set_bit(0, lg->irqs_pending);
+
+ /* Mask out any interrupts they have blocked. */
+ copy_from_user(&irqs, lg->lguest_data->interrupts, sizeof(irqs));
+ bitmap_andnot(irqs, lg->irqs_pending, irqs, LGUEST_IRQS);
+
+ irq = find_first_bit(irqs, LGUEST_IRQS);
+ if (irq >= LGUEST_IRQS)
+ return;
+
+ /* If they're halted, we re-enable interrupts. */
+ if (lg->halted) {
+ /* Re-enable interrupts. */
+ put_user(512, &lg->lguest_data->irq_enabled);
+ lg->halted = 0;
+ } else {
+ /* Maybe they have interrupts disabled? */
+ u32 irq_enabled;
+ get_user(irq_enabled, &lg->lguest_data->irq_enabled);
+ if (!irq_enabled)
+ return;
+ }
+
+ if (lg->interrupt[irq].addr != 0) {
+ clear_bit(irq, lg->irqs_pending);
+ reflect_trap(lg, &lg->interrupt[irq], 0);
+ }
+}
+
+void check_bug_kill(struct lguest *lg)
+{
+#ifdef CONFIG_BUG
+ u32 eip = lg->state->regs.eip - PAGE_OFFSET;
+ u16 insn;
+
+ /* This only works for addresses in linear mapping... */
+ if (lg->state->regs.eip < PAGE_OFFSET)
+ return;
+ lhread(lg, &insn, eip, sizeof(insn));
+ if (insn == 0x0b0f) {
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+ u16 l;
+ u32 f;
+ char file[128];
+ lhread(lg, &l, eip+sizeof(insn), sizeof(l));
+ lhread(lg, &f, eip+sizeof(insn)+sizeof(l), sizeof(f));
+ lhread(lg, file, f - PAGE_OFFSET, sizeof(file));
+ file[sizeof(file)-1] = 0;
+ kill_guest(lg, "BUG() at %#x %s:%u", eip, file, l);
+#else
+ kill_guest(lg, "BUG() at %#x", eip);
+#endif /* CONFIG_DEBUG_BUGVERBOSE */
+ }
+#endif /* CONFIG_BUG */
+}
+
+static void copy_trap(struct lguest *lg,
+ struct host_trap *trap,
+ const struct desc_struct *desc)
+{
+ u8 type = ((desc->b >> 8) & 0xF);
+
+ /* Not present? */
+ if (!(desc->b & 0x8000)) {
+ trap->addr = 0;
+ return;
+ }
+ if (type != 0xE && type != 0xF)
+ kill_guest(lg, "bad IDT type %i", type);
+ trap->disable_interrupts = (type == 0xE);
+ trap->addr = ((desc->a & 0x0000FFFF) | (desc->b & 0xFFFF0000));
+}
+
+/* FIXME: Put this in hypervisor.S and do something clever with relocs? */
+static u8 tramp[]
+= { 0x0f, 0xa8, 0x0f, 0xa9, /* push %gs; pop %gs */
+ 0x36, 0xc7, 0x05, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
+ /* movl 0, %ss:lguest_data.gs_gpf_eip */
+ 0xe9, 0x55, 0x55, 0x55, 0x55 /* jmp dstaddr */
+};
+#define TRAMP_MOVL_TARGET_OFF 7
+#define TRAMP_JMP_TARGET_OFF 16
+
+static u32 setup_trampoline(struct lguest *lg, unsigned int i, u32 dstaddr)
+{
+ u32 addr, off;
+
+ off = sizeof(tramp)*i;
+ memcpy(lg->trap_page + off, tramp, sizeof(tramp));
+
+ /* 0 is to be placed in lguest_data.gs_gpf_eip. */
+ addr = (u32)&lg->lguest_data->gs_gpf_eip + lg->page_offset;
+ memcpy(lg->trap_page + off + TRAMP_MOVL_TARGET_OFF, &addr, 4);
+
+ /* Address is relative to where end of jmp will be. */
+ addr = dstaddr - ((-4*1024*1024) + off + sizeof(tramp));
+ memcpy(lg->trap_page + off + TRAMP_JMP_TARGET_OFF, &addr, 4);
+ return (-4*1024*1024) + off;
+}
+
+/* We bounce through the trap page, for two reasons: firstly, we need
+ the interrupt destination always mapped, to avoid double faults,
+ secondly we want to reload %gs to make it innocuous on entering kernel.
+ */
+static void setup_idt(struct lguest *lg,
+ unsigned int i,
+ const struct desc_struct *desc)
+{
+ u8 type = ((desc->b >> 8) & 0xF);
+ u32 taddr;
+
+ /* Not present? */
+ if (!(desc->b & 0x8000)) {
+ /* FIXME: When we need this, we'll know... */
+ if (lg->state->idt_table[i].a & 0x8000)
+ kill_guest(lg, "removing interrupts not supported");
+ return;
+ }
+
+ /* We could reflect and disable interrupts, but guest can do itself. */
+ if (type != 0xF)
+ kill_guest(lg, "bad direct IDT %i type %i", i, type);
+
+ taddr = setup_trampoline(lg, i, (desc->a&0xFFFF)|(desc->b&0xFFFF0000));
+
+ lg->state->idt_table[i].a = (((__KERNEL_CS|GUEST_DPL)<<16)
+ | (taddr & 0x0000FFFF));
+ lg->state->idt_table[i].b = (desc->b&0xEF00)|(taddr&0xFFFF0000);
+}
+
+void load_guest_idt_entry(struct lguest *lg, unsigned int i, u32 low, u32 high)
+{
+ struct desc_struct d = { low, high };
+
+ /* Ignore NMI, doublefault, hypercall, spurious interrupt. */
+ if (i == 2 || i == 8 || i == 15 || i == LGUEST_TRAP_ENTRY)
+ return;
+ /* FIXME: We should handle debug and int3 */
+ else if (i == 1 || i == 3)
+ return;
+ /* We intercept page fault, general protection fault and fpu missing */
+ else if (i == 13)
+ copy_trap(lg, &lg->gpf_trap, &d);
+ else if (i == 14)
+ copy_trap(lg, &lg->page_trap, &d);
+ else if (i == 7)
+ copy_trap(lg, &lg->fpu_trap, &d);
+ /* Other traps go straight to guest. */
+ else if (i < FIRST_EXTERNAL_VECTOR || i == SYSCALL_VECTOR)
+ setup_idt(lg, i, &d);
+ /* A virtual interrupt */
+ else if (i < FIRST_EXTERNAL_VECTOR + LGUEST_IRQS)
+ copy_trap(lg, &lg->interrupt[i-FIRST_EXTERNAL_VECTOR], &d);
+}
+
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/io.c
@@ -0,0 +1,413 @@
+/* Simple I/O model for guests, based on shared memory.
+ * Copyright (C) 2006 Rusty Russell IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include <linux/types.h>
+#include <linux/futex.h>
+#include <linux/jhash.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/uaccess.h>
+#include "lg.h"
+
+static struct list_head dma_hash[64];
+
+/* FIXME: allow multi-page lengths. */
+static int check_dma_list(struct lguest *lg, const struct lguest_dma *dma)
+{
+ unsigned int i;
+
+ for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+ if (!dma->len[i])
+ return 1;
+ if (!lguest_address_ok(lg, dma->addr[i]))
+ goto kill;
+ if (dma->len[i] > PAGE_SIZE)
+ goto kill;
+ /* We could do over a page, but is it worth it? */
+ if ((dma->addr[i] % PAGE_SIZE) + dma->len[i] > PAGE_SIZE)
+ goto kill;
+ }
+ return 1;
+
+kill:
+ kill_guest(lg, "bad DMA entry: %u@%#x", dma->len[i], dma->addr[i]);
+ return 0;
+}
+
+static unsigned int hash(const union futex_key *key)
+{
+ return jhash2((u32*)&key->both.word,
+ (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
+ key->both.offset)
+ % ARRAY_SIZE(dma_hash);
+}
+
+/* Must hold read lock on dmainfo owner's current->mm->mmap_sem */
+static void unlink_dma(struct lguest_dma_info *dmainfo)
+{
+ BUG_ON(down_trylock(&lguest_lock) == 0);
+ dmainfo->interrupt = 0;
+ list_del(&dmainfo->list);
+ drop_futex_key_refs(&dmainfo->key);
+}
+
+static inline int key_eq(const union futex_key *a, const union futex_key *b)
+{
+ return (a->both.word == b->both.word
+ && a->both.ptr == b->both.ptr
+ && a->both.offset == b->both.offset);
+}
+
+static u32 unbind_dma(struct lguest *lg,
+ const union futex_key *key,
+ unsigned long dmas)
+{
+ int i, ret = 0;
+
+ for (i = 0; i < LGUEST_MAX_DMA; i++) {
+ if (key_eq(key, &lg->dma[i].key) && dmas == lg->dma[i].dmas) {
+ unlink_dma(&lg->dma[i]);
+ ret = 1;
+ break;
+ }
+ }
+ return ret;
+}
+
+u32 bind_dma(struct lguest *lg,
+ unsigned long addr, unsigned long dmas, u16 numdmas, u8 interrupt)
+{
+ unsigned int i;
+ u32 ret = 0;
+ union futex_key key;
+
+ if (interrupt >= LGUEST_IRQS)
+ return 0;
+
+ down(&lguest_lock);
+ down_read(¤t->mm->mmap_sem);
+ if (get_futex_key((u32 __user *)addr, &key) != 0) {
+ kill_guest(lg, "bad dma address %#lx", addr);
+ goto unlock;
+ }
+ get_futex_key_refs(&key);
+
+ if (interrupt == 0)
+ ret = unbind_dma(lg, &key, dmas);
+ else {
+ for (i = 0; i < LGUEST_MAX_DMA; i++) {
+ if (lg->dma[i].interrupt == 0) {
+ lg->dma[i].dmas = dmas;
+ lg->dma[i].num_dmas = numdmas;
+ lg->dma[i].next_dma = 0;
+ lg->dma[i].key = key;
+ lg->dma[i].guestid = lg->guestid;
+ lg->dma[i].interrupt = interrupt;
+ list_add(&lg->dma[i].list,
+ &dma_hash[hash(&key)]);
+ ret = 1;
+ goto unlock;
+ }
+ }
+ }
+ drop_futex_key_refs(&key);
+unlock:
+ up_read(¤t->mm->mmap_sem);
+ up(&lguest_lock);
+ return ret;
+}
+
+/* lhread from another guest */
+static int lhread_other(struct lguest *lg,
+ void *buf, u32 addr, unsigned bytes)
+{
+ if (addr + bytes < addr
+ || !lguest_address_ok(lg, addr+bytes)
+ || access_process_vm(lg->tsk, addr, buf, bytes, 0) != bytes) {
+ memset(buf, 0, bytes);
+ kill_guest(lg, "bad address in registered DMA struct");
+ return 0;
+ }
+ return 1;
+}
+
+/* lhwrite to another guest */
+static int lhwrite_other(struct lguest *lg, u32 addr,
+ const void *buf, unsigned bytes)
+{
+ if (addr + bytes < addr
+ || !lguest_address_ok(lg, addr+bytes)
+ || (access_process_vm(lg->tsk, addr, (void *)buf, bytes, 1)
+ != bytes)) {
+ kill_guest(lg, "bad address writing to registered DMA");
+ return 0;
+ }
+ return 1;
+}
+
+static u32 copy_data(const struct lguest_dma *src,
+ const struct lguest_dma *dst,
+ struct page *pages[])
+{
+ unsigned int totlen, si, di, srcoff, dstoff;
+ void *maddr = NULL;
+
+ totlen = 0;
+ si = di = 0;
+ srcoff = dstoff = 0;
+ while (si < LGUEST_MAX_DMA_SECTIONS && src->len[si]
+ && di < LGUEST_MAX_DMA_SECTIONS && dst->len[di]) {
+ u32 len = min(src->len[si] - srcoff, dst->len[di] - dstoff);
+
+ if (!maddr)
+ maddr = kmap(pages[di]);
+
+ /* FIXME: This is not completely portable, since
+ archs do different things for copy_to_user_page. */
+ if (copy_from_user(maddr + (dst->addr[di] + dstoff)%PAGE_SIZE,
+ (void *__user)src->addr[si], len) != 0) {
+ totlen = 0;
+ break;
+ }
+
+ totlen += len;
+ srcoff += len;
+ dstoff += len;
+ if (srcoff == src->len[si]) {
+ si++;
+ srcoff = 0;
+ }
+ if (dstoff == dst->len[di]) {
+ kunmap(pages[di]);
+ maddr = NULL;
+ di++;
+ dstoff = 0;
+ }
+ }
+
+ if (maddr)
+ kunmap(pages[di]);
+
+ return totlen;
+}
+
+/* Src is us, ie. current. */
+static u32 do_dma(struct lguest *srclg, const struct lguest_dma *src,
+ struct lguest *dstlg, const struct lguest_dma *dst)
+{
+ int i;
+ u32 ret;
+ struct page *pages[LGUEST_MAX_DMA_SECTIONS];
+
+ if (!check_dma_list(dstlg, dst) || !check_dma_list(srclg, src))
+ return 0;
+
+ /* First get the destination pages */
+ for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+ if (dst->len[i] == 0)
+ break;
+ if (get_user_pages(dstlg->tsk, dstlg->mm,
+ dst->addr[i], 1, 1, 1, pages+i, NULL)
+ != 1) {
+ ret = 0;
+ goto drop_pages;
+ }
+ }
+
+ /* Now copy until we run out of src or dst. */
+ ret = copy_data(src, dst, pages);
+
+drop_pages:
+ while (--i >= 0)
+ put_page(pages[i]);
+ return ret;
+}
+
+/* We cache one process to wakeup: helps for batching & wakes outside locks. */
+void set_wakeup_process(struct lguest *lg, struct task_struct *p)
+{
+ if (p == lg->wake)
+ return;
+
+ if (lg->wake) {
+ wake_up_process(lg->wake);
+ put_task_struct(lg->wake);
+ }
+ lg->wake = p;
+ if (lg->wake)
+ get_task_struct(lg->wake);
+}
+
+static int dma_transfer(struct lguest *srclg,
+ unsigned long udma,
+ struct lguest_dma_info *dst)
+{
+ struct lguest_dma dst_dma, src_dma;
+ struct lguest *dstlg;
+ u32 i, dma = 0;
+
+ dstlg = &lguests[dst->guestid];
+ /* Get our dma list. */
+ lhread(srclg, &src_dma, udma, sizeof(src_dma));
+
+ /* We can't deadlock against them dmaing to us, because this
+ * is all under the lguest_lock. */
+ down_read(&dstlg->mm->mmap_sem);
+
+ for (i = 0; i < dst->num_dmas; i++) {
+ dma = (dst->next_dma + i) % dst->num_dmas;
+ if (!lhread_other(dstlg, &dst_dma,
+ dst->dmas + dma * sizeof(struct lguest_dma),
+ sizeof(dst_dma))) {
+ goto fail;
+ }
+ if (!dst_dma.used_len)
+ break;
+ }
+ if (i != dst->num_dmas) {
+ unsigned long used_lenp;
+ unsigned int ret;
+
+ ret = do_dma(srclg, &src_dma, dstlg, &dst_dma);
+ /* Put used length in src. */
+ lhwrite_u32(srclg,
+ udma+offsetof(struct lguest_dma, used_len), ret);
+ if (ret == 0 && src_dma.len[0] != 0)
+ goto fail;
+
+ /* Make sure destination sees contents before length. */
+ mb();
+ used_lenp = dst->dmas
+ + dma * sizeof(struct lguest_dma)
+ + offsetof(struct lguest_dma, used_len);
+ lhwrite_other(dstlg, used_lenp, &ret, sizeof(ret));
+ dst->next_dma++;
+ }
+ up_read(&dstlg->mm->mmap_sem);
+
+ /* Do this last so dst doesn't simply sleep on lock. */
+ set_bit(dst->interrupt, dstlg->irqs_pending);
+ set_wakeup_process(srclg, dstlg->tsk);
+ return i == dst->num_dmas;
+
+fail:
+ up_read(&dstlg->mm->mmap_sem);
+ return 0;
+}
+
+int send_dma(struct lguest *lg, unsigned long addr, unsigned long udma)
+{
+ union futex_key key;
+ int pending = 0, empty = 0;
+
+again:
+ down(&lguest_lock);
+ down_read(¤t->mm->mmap_sem);
+ if (get_futex_key((u32 __user *)addr, &key) != 0) {
+ kill_guest(lg, "bad sending DMA address");
+ goto unlock;
+ }
+ /* Shared mapping? Look for other guests... */
+ if (key.shared.offset & 1) {
+ struct lguest_dma_info *i, *n;
+ list_for_each_entry_safe(i, n, &dma_hash[hash(&key)], list) {
+ if (i->guestid == lg->guestid)
+ continue;
+ if (!key_eq(&key, &i->key))
+ continue;
+
+ empty += dma_transfer(lg, udma, i);
+ break;
+ }
+ if (empty == 1) {
+ /* Give any recipients one chance to restock. */
+ up_read(¤t->mm->mmap_sem);
+ up(&lguest_lock);
+ yield();
+ empty++;
+ goto again;
+ }
+ pending = 0;
+ } else {
+ /* Private mapping: tell our userspace. */
+ lg->dma_is_pending = 1;
+ lg->pending_dma = udma;
+ lg->pending_addr = addr;
+ pending = 1;
+ }
+unlock:
+ up_read(¤t->mm->mmap_sem);
+ up(&lguest_lock);
+ return pending;
+}
+
+void release_all_dma(struct lguest *lg)
+{
+ unsigned int i;
+
+ BUG_ON(down_trylock(&lguest_lock) == 0);
+
+ down_read(&lg->mm->mmap_sem);
+ for (i = 0; i < LGUEST_MAX_DMA; i++) {
+ if (lg->dma[i].interrupt)
+ unlink_dma(&lg->dma[i]);
+ }
+ up_read(&lg->mm->mmap_sem);
+}
+
+/* Userspace wants a dma buffer from this guest. */
+unsigned long get_dma_buffer(struct lguest *lg,
+ unsigned long addr, unsigned long *interrupt)
+{
+ unsigned long ret = 0;
+ union futex_key key;
+ struct lguest_dma_info *i;
+
+ down(&lguest_lock);
+ down_read(¤t->mm->mmap_sem);
+ if (get_futex_key((u32 __user *)addr, &key) != 0) {
+ kill_guest(lg, "bad registered DMA buffer");
+ goto unlock;
+ }
+ list_for_each_entry(i, &dma_hash[hash(&key)], list) {
+ if (key_eq(&key, &i->key) && i->guestid == lg->guestid) {
+ unsigned int j;
+ for (j = 0; j < i->num_dmas; j++) {
+ struct lguest_dma dma;
+
+ ret = i->dmas + j * sizeof(struct lguest_dma);
+ lhread(lg, &dma, ret, sizeof(dma));
+ if (dma.used_len == 0)
+ break;
+ }
+ *interrupt = i->interrupt;
+ break;
+ }
+ }
+unlock:
+ up_read(¤t->mm->mmap_sem);
+ up(&lguest_lock);
+ return ret;
+}
+
+void lguest_io_init(void)
+{
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(dma_hash); i++)
+ INIT_LIST_HEAD(&dma_hash[i]);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/lguest_user.c
@@ -0,0 +1,242 @@
+/* Userspace control of the guest, via /dev/lguest. */
+#include <linux/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/fs.h>
+#include "lg.h"
+
+static struct lguest_state *setup_guest_state(unsigned int num, void *pgdir,
+ unsigned long start)
+{
+ struct lguest_state *guest = &__lguest_states()[num];
+ unsigned int i;
+ const long *def = __lguest_default_idt_entries();
+ struct lguest_regs *regs;
+
+ guest->gdt_table[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
+ guest->gdt_table[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
+ guest->gdt.size = GDT_ENTRIES*8-1;
+ guest->gdt.address = (unsigned long)&guest->gdt_table;
+
+ /* Other guest's IDTs are initialized from default. */
+ guest->idt.size = 8 * IDT_ENTRIES;
+ guest->idt.address = (long)guest->idt_table;
+ for (i = 0; i < IDT_ENTRIES; i++) {
+ u32 flags = 0x8e00;
+
+ /* They can't "int" into any of them except hypercall. */
+ if (i == LGUEST_TRAP_ENTRY)
+ flags |= (GUEST_DPL << 13);
+
+ guest->idt_table[i].a = (LGUEST_CS<<16) | (def[i]&0x0000FFFF);
+ guest->idt_table[i].b = (def[i]&0xFFFF0000) | flags;
+ }
+
+ memset(&guest->tss, 0, sizeof(guest->tss));
+ guest->tss.ss0 = LGUEST_DS;
+ guest->tss.esp0 = (unsigned long)(guest+1);
+ guest->tss.io_bitmap_base = sizeof(guest->tss); /* No I/O for you! */
+
+ /* Write out stack in format lguest expects, so we can switch to it. */
+ regs = &guest->regs;
+ regs->cr3 = __pa(pgdir);
+ regs->eax = regs->ebx = regs->ecx = regs->edx = regs->esp = 0;
+ regs->edi = LGUEST_MAGIC_EDI;
+ regs->ebp = LGUEST_MAGIC_EBP;
+ regs->esi = LGUEST_MAGIC_ESI;
+ regs->gs = regs->fs = 0;
+ regs->ds = regs->es = __KERNEL_DS|GUEST_DPL;
+ regs->trapnum = regs->errcode = 0;
+ regs->eip = start;
+ regs->cs = __KERNEL_CS|GUEST_DPL;
+ regs->eflags = 0x202; /* Interrupts enabled. */
+ regs->ss = __KERNEL_DS|GUEST_DPL;
+
+ if (!fixup_gdt_table(guest->gdt_table, ARRAY_SIZE(guest->gdt_table),
+ &guest->regs, &guest->tss))
+ return NULL;
+
+ return guest;
+}
+
+/* + addr */
+static long user_get_dma(struct lguest *lg, const u32 __user *input)
+{
+ unsigned long addr, udma, irq;
+
+ if (get_user(addr, input) != 0)
+ return -EFAULT;
+ udma = get_dma_buffer(lg, addr, &irq);
+ if (!udma)
+ return -ENOENT;
+
+ /* We put irq number in udma->used_len. */
+ lhwrite_u32(lg, udma + offsetof(struct lguest_dma, used_len), irq);
+ return udma;
+}
+
+/* + irq */
+static int user_send_irq(struct lguest *lg, const u32 __user *input)
+{
+ u32 irq;
+
+ if (get_user(irq, input) != 0)
+ return -EFAULT;
+ if (irq >= LGUEST_IRQS)
+ return -EINVAL;
+ set_bit(irq, lg->irqs_pending);
+ return 0;
+}
+
+static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
+{
+ struct lguest *lg = file->private_data;
+
+ if (!lg)
+ return -EINVAL;
+
+ if (lg->dead) {
+ size_t len;
+
+ if (lg->dead == (void *)-1)
+ return -ENOMEM;
+
+ len = min(size, strlen(lg->dead)+1);
+ if (copy_to_user(user, lg->dead, len) != 0)
+ return -EFAULT;
+ return len;
+ }
+
+ if (lg->dma_is_pending)
+ lg->dma_is_pending = 0;
+
+ return run_guest(lg, user);
+}
+
+/* Take: pfnlimit, pgdir, start, pageoffset. */
+static int initialize(struct file *file, const u32 __user *input)
+{
+ struct lguest *lg;
+ int err, i;
+ u32 args[4];
+
+ if (file->private_data)
+ return -EBUSY;
+
+ if (copy_from_user(args, input, sizeof(args)) != 0)
+ return -EFAULT;
+
+ if (args[1] <= PAGE_SIZE)
+ return -EINVAL;
+
+ down(&lguest_lock);
+ i = find_free_guest();
+ if (i < 0) {
+ err = -ENOSPC;
+ goto unlock;
+ }
+ lg = &lguests[i];
+ lg->guestid = i;
+ lg->pfn_limit = args[0];
+ lg->page_offset = args[3];
+
+ lg->trap_page = (u32 *)get_zeroed_page(GFP_KERNEL);
+ if (!lg->trap_page) {
+ err = -ENOMEM;
+ goto release_guest;
+ }
+
+ err = init_guest_pagetable(lg, args[1]);
+ if (err)
+ goto free_trap_page;
+
+ lg->state = setup_guest_state(i, lg->pgdirs[lg->pgdidx].pgdir,args[2]);
+ if (!lg->state) {
+ err = -ENOEXEC;
+ goto release_pgtable;
+ }
+ up(&lguest_lock);
+
+ lg->tsk = current;
+ lg->mm = get_task_mm(current);
+ file->private_data = lg;
+ return sizeof(args);
+
+release_pgtable:
+ free_guest_pagetable(lg);
+free_trap_page:
+ free_page((long)lg->trap_page);
+release_guest:
+ memset(lg, 0, sizeof(*lg));
+unlock:
+ up(&lguest_lock);
+ return err;
+}
+
+static ssize_t write(struct file *file, const char __user *input,
+ size_t size, loff_t *off)
+{
+ struct lguest *lg = file->private_data;
+ u32 req;
+
+ if (get_user(req, input) != 0)
+ return -EFAULT;
+ input += sizeof(req);
+
+ if (req != LHREQ_INITIALIZE && !lg)
+ return -EINVAL;
+ if (lg && lg->dead)
+ return -ENOENT;
+
+ switch (req) {
+ case LHREQ_INITIALIZE:
+ return initialize(file, (const u32 __user *)input);
+ case LHREQ_GETDMA:
+ return user_get_dma(lg, (const u32 __user *)input);
+ case LHREQ_IRQ:
+ return user_send_irq(lg, (const u32 __user *)input);
+ default:
+ return -EINVAL;
+ }
+}
+
+static int close(struct inode *inode, struct file *file)
+{
+ struct lguest *lg = file->private_data;
+
+ if (!lg)
+ return 0;
+
+ down(&lguest_lock);
+ release_all_dma(lg);
+ free_page((long)lg->trap_page);
+ free_guest_pagetable(lg);
+ mmput(lg->mm);
+ if (lg->dead != (void *)1)
+ kfree(lg->dead);
+ memset(lg->state, 0, sizeof(*lg->state));
+ memset(lg, 0, sizeof(*lg));
+ up(&lguest_lock);
+ return 0;
+}
+
+static struct file_operations lguest_fops = {
+ .owner = THIS_MODULE,
+ .release = close,
+ .write = write,
+ .read = read,
+};
+static struct miscdevice lguest_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "lguest",
+ .fops = &lguest_fops,
+};
+
+int __init lguest_device_init(void)
+{
+ return misc_register(&lguest_dev);
+}
+
+void __exit lguest_device_remove(void)
+{
+ misc_deregister(&lguest_dev);
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/page_tables.c
@@ -0,0 +1,374 @@
+/* Shadow page table operations.
+ * Copyright (C) Rusty Russell IBm Corporation 2006.
+ * GPL v2 and any later version */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/random.h>
+#include <linux/percpu.h>
+#include <asm/tlbflush.h>
+#include "lg.h"
+
+#define PTES_PER_PAGE_SHIFT 10
+#define PTES_PER_PAGE (1 << PTES_PER_PAGE_SHIFT)
+#define HYPERVISOR_PGD_ENTRY (PTES_PER_PAGE - 1)
+
+static DEFINE_PER_CPU(u32 *, hypervisor_pte_pages) = { NULL };
+#define hypervisor_pte_page(cpu) per_cpu(hypervisor_pte_pages, cpu)
+
+static unsigned vaddr_to_pgd(unsigned long vaddr)
+{
+ return vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+}
+
+/* These access the real versions. */
+static u32 *toplev(struct lguest *lg, u32 i, unsigned long vaddr)
+{
+ unsigned int index = vaddr_to_pgd(vaddr);
+
+ if (index >= HYPERVISOR_PGD_ENTRY) {
+ kill_guest(lg, "attempt to access hypervisor pages");
+ index = 0;
+ }
+ return &lg->pgdirs[i].pgdir[index];
+}
+
+static u32 *pteof(struct lguest *lg, u32 top, unsigned long vaddr)
+{
+ u32 *page = __va(top&PAGE_MASK);
+ BUG_ON(!(top & _PAGE_PRESENT));
+ return &page[(vaddr >> PAGE_SHIFT) % PTES_PER_PAGE];
+}
+
+/* These access the guest versions. */
+static u32 gtoplev(struct lguest *lg, unsigned long vaddr)
+{
+ unsigned int index = vaddr >> (PAGE_SHIFT + PTES_PER_PAGE_SHIFT);
+ return lg->pgdirs[lg->pgdidx].cr3 + index * sizeof(u32);
+}
+
+static u32 gpteof(struct lguest *lg, u32 gtop, unsigned long vaddr)
+{
+ u32 gpage = (gtop&PAGE_MASK);
+ BUG_ON(!(gtop & _PAGE_PRESENT));
+ return gpage + ((vaddr >> PAGE_SHIFT) % PTES_PER_PAGE) * sizeof(u32);
+}
+
+static void release_pte(u32 pte)
+{
+ if (pte & _PAGE_PRESENT)
+ put_page(pfn_to_page(pte >> PAGE_SHIFT));
+}
+
+/* Do a virtual -> physical mapping on a user page. */
+static unsigned long get_pfn(unsigned long virtpfn, int write)
+{
+ struct vm_area_struct *vma;
+ struct page *page;
+ unsigned long ret = -1UL;
+
+ down_read(¤t->mm->mmap_sem);
+ if (get_user_pages(current, current->mm, virtpfn << PAGE_SHIFT,
+ 1, write, 1, &page, &vma) == 1)
+ ret = page_to_pfn(page);
+ up_read(¤t->mm->mmap_sem);
+ return ret;
+}
+
+static u32 check_pgtable_entry(struct lguest *lg, u32 entry)
+{
+ if ((entry & (_PAGE_PWT|_PAGE_PSE))
+ || (entry >> PAGE_SHIFT) >= lg->pfn_limit)
+ kill_guest(lg, "bad page table entry");
+ return entry & ~_PAGE_GLOBAL;
+}
+
+static u32 get_pte(struct lguest *lg, u32 entry, int write)
+{
+ u32 pfn;
+
+ pfn = get_pfn(entry >> PAGE_SHIFT, write);
+ if (pfn == -1UL) {
+ kill_guest(lg, "failed to get page %u", entry>>PAGE_SHIFT);
+ return 0;
+ }
+ return ((pfn << PAGE_SHIFT) | (entry & (PAGE_SIZE-1)));
+}
+
+/* FIXME: We hold reference to pages, which prevents them from being
+ swapped. It'd be nice to have a callback when Linux wants to swap out. */
+
+/* We fault pages in, which allows us to update accessed/dirty bits.
+ * Return NULL or the pte page. */
+static int page_in(struct lguest *lg, u32 vaddr, unsigned flags)
+{
+ u32 gtop, gpte;
+ u32 *top, *pte, *ptepage;
+ u32 val;
+
+ gtop = gtoplev(lg, vaddr);
+ val = lhread_u32(lg, gtop);
+ if (!(val & _PAGE_PRESENT))
+ return 0;
+
+ top = toplev(lg, lg->pgdidx, vaddr);
+ if (!(*top & _PAGE_PRESENT)) {
+ /* Get a PTE page for them. */
+ ptepage = (void *)get_zeroed_page(GFP_KERNEL);
+ /* FIXME: Steal from self in this case? */
+ if (!ptepage) {
+ kill_guest(lg, "out of memory allocating pte page");
+ return 0;
+ }
+ val = check_pgtable_entry(lg, val);
+ *top = (__pa(ptepage) | (val & (PAGE_SIZE-1)));
+ } else
+ ptepage = __va(*top & PAGE_MASK);
+
+ gpte = gpteof(lg, val, vaddr);
+ val = lhread_u32(lg, gpte);
+
+ /* No page, or write to readonly page? */
+ if (!(val&_PAGE_PRESENT) || ((flags&_PAGE_DIRTY) && !(val&_PAGE_RW)))
+ return 0;
+
+ pte = pteof(lg, *top, vaddr);
+ val = check_pgtable_entry(lg, val) | flags;
+
+ /* We're done with the old pte. */
+ release_pte(*pte);
+
+ /* We don't make it writable if this isn't a write: later
+ * write will fault so we can set dirty bit in guest. */
+ if (val & _PAGE_DIRTY)
+ *pte = get_pte(lg, val, 1);
+ else
+ *pte = get_pte(lg, val & ~_PAGE_RW, 0);
+
+ /* Now we update dirty/accessed on guest. */
+ lhwrite_u32(lg, gpte, val);
+ return 1;
+}
+
+int demand_page(struct lguest *lg, u32 vaddr, int write)
+{
+ return page_in(lg, vaddr, (write ? _PAGE_DIRTY : 0)|_PAGE_ACCESSED);
+}
+
+void pin_stack_pages(struct lguest *lg)
+{
+ unsigned int i;
+ u32 stack = lg->state->tss.esp1;
+
+ for (i = 0; i < lg->stack_pages; i++)
+ if (!demand_page(lg, stack - i*PAGE_SIZE, 1))
+ kill_guest(lg, "bad stack page %i@%#x", i, stack);
+}
+
+static unsigned int find_pgdir(struct lguest *lg, u32 pgtable)
+{
+ unsigned int i;
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ if (lg->pgdirs[i].cr3 == pgtable)
+ break;
+ return i;
+}
+
+static void release_pgd(struct lguest *lg, u32 *pgd)
+{
+ if (*pgd & _PAGE_PRESENT) {
+ unsigned int i;
+ u32 *ptepage = __va(*pgd & ~(PAGE_SIZE-1));
+ for (i = 0; i < PTES_PER_PAGE; i++)
+ release_pte(ptepage[i]);
+ free_page((long)ptepage);
+ *pgd = 0;
+ }
+}
+
+static void flush_user_mappings(struct lguest *lg, int idx)
+{
+ unsigned int i;
+ for (i = 0; i < vaddr_to_pgd(lg->page_offset); i++)
+ release_pgd(lg, lg->pgdirs[idx].pgdir + i);
+}
+
+void guest_pagetable_flush_user(struct lguest *lg)
+{
+ flush_user_mappings(lg, lg->pgdidx);
+}
+
+static unsigned int new_pgdir(struct lguest *lg, u32 cr3)
+{
+ unsigned int next;
+
+ next = (lg->pgdidx + random32()) % ARRAY_SIZE(lg->pgdirs);
+ if (!lg->pgdirs[next].pgdir) {
+ lg->pgdirs[next].pgdir = (u32 *)get_zeroed_page(GFP_KERNEL);
+ if (!lg->pgdirs[next].pgdir)
+ next = lg->pgdidx;
+ }
+ lg->pgdirs[next].cr3 = cr3;
+ /* Release all the non-kernel mappings. */
+ flush_user_mappings(lg, next);
+
+ return next;
+}
+
+void guest_new_pagetable(struct lguest *lg, u32 pgtable)
+{
+ int newpgdir;
+
+ newpgdir = find_pgdir(lg, pgtable);
+ if (newpgdir == ARRAY_SIZE(lg->pgdirs))
+ newpgdir = new_pgdir(lg, pgtable);
+ lg->pgdidx = newpgdir;
+ lg->state->regs.cr3 = __pa(lg->pgdirs[lg->pgdidx].pgdir);
+ pin_stack_pages(lg);
+}
+
+static void release_all_pagetables(struct lguest *lg)
+{
+ unsigned int i, j;
+
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ if (lg->pgdirs[i].pgdir)
+ for (j = 0; j < HYPERVISOR_PGD_ENTRY; j++)
+ release_pgd(lg, lg->pgdirs[i].pgdir + j);
+}
+
+void guest_pagetable_clear_all(struct lguest *lg)
+{
+ release_all_pagetables(lg);
+ pin_stack_pages(lg);
+}
+
+static void do_set_pte(struct lguest *lg, int idx,
+ unsigned long vaddr, u32 val)
+{
+ u32 *top = toplev(lg, idx, vaddr);
+ if (*top & _PAGE_PRESENT) {
+ u32 *pte = pteof(lg, *top, vaddr);
+ release_pte(*pte);
+ if (val & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+ val = check_pgtable_entry(lg, val);
+ *pte = get_pte(lg, val, val & _PAGE_DIRTY);
+ } else
+ *pte = 0;
+ }
+}
+
+void guest_set_pte(struct lguest *lg,
+ unsigned long cr3, unsigned long vaddr, u32 val)
+{
+ /* Kernel mappings must be changed on all top levels. */
+ if (vaddr >= lg->page_offset) {
+ unsigned int i;
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ if (lg->pgdirs[i].pgdir)
+ do_set_pte(lg, i, vaddr, val);
+ } else {
+ int pgdir = find_pgdir(lg, cr3);
+ if (pgdir != ARRAY_SIZE(lg->pgdirs))
+ do_set_pte(lg, pgdir, vaddr, val);
+ }
+}
+
+void guest_set_pud(struct lguest *lg, unsigned long cr3, u32 idx)
+{
+ int pgdir;
+
+ if (idx >= HYPERVISOR_PGD_ENTRY)
+ return;
+
+ pgdir = find_pgdir(lg, cr3);
+ if (pgdir < ARRAY_SIZE(lg->pgdirs))
+ release_pgd(lg, lg->pgdirs[pgdir].pgdir + idx);
+}
+
+int init_guest_pagetable(struct lguest *lg, u32 pgtable)
+{
+ /* We assume this in flush_user_mappings, so check now */
+ if (vaddr_to_pgd(lg->page_offset) >= HYPERVISOR_PGD_ENTRY)
+ return -EINVAL;
+ lg->pgdidx = 0;
+ lg->pgdirs[lg->pgdidx].cr3 = pgtable;
+ lg->pgdirs[lg->pgdidx].pgdir = (u32*)get_zeroed_page(GFP_KERNEL);
+ if (!lg->pgdirs[lg->pgdidx].pgdir)
+ return -ENOMEM;
+ return 0;
+}
+
+void free_guest_pagetable(struct lguest *lg)
+{
+ unsigned int i;
+
+ release_all_pagetables(lg);
+ for (i = 0; i < ARRAY_SIZE(lg->pgdirs); i++)
+ free_page((long)lg->pgdirs[i].pgdir);
+}
+
+/* Caller must be preempt-safe */
+void map_trap_page(struct lguest *lg)
+{
+ int cpu = smp_processor_id();
+
+ hypervisor_pte_page(cpu)[0] = (__pa(lg->trap_page)|_PAGE_PRESENT);
+
+ /* Since hypervisor less that 4MB, we simply mug top pte page. */
+ lg->pgdirs[lg->pgdidx].pgdir[HYPERVISOR_PGD_ENTRY] =
+ (__pa(hypervisor_pte_page(cpu))| _PAGE_KERNEL);
+}
+
+static void free_hypervisor_pte_pages(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ free_page((long)hypervisor_pte_page(i));
+}
+
+static __init int alloc_hypervisor_pte_pages(void)
+{
+ int i;
+
+ for_each_possible_cpu(i) {
+ hypervisor_pte_page(i) = (u32 *)get_zeroed_page(GFP_KERNEL);
+ if (!hypervisor_pte_page(i)) {
+ free_hypervisor_pte_pages();
+ return -ENOMEM;
+ }
+ }
+ return 0;
+}
+
+static __init void populate_hypervisor_pte_page(int cpu)
+{
+ int i;
+ u32 *pte = hypervisor_pte_page(cpu);
+
+ for (i = 0; i < HYPERVISOR_PAGES; i++) {
+ /* First entry set dynamically in map_trap_page */
+ pte[i+1] = ((page_to_pfn(&hype_pages[i]) << PAGE_SHIFT)
+ | _PAGE_KERNEL_EXEC);
+ }
+}
+
+__init int init_pagetables(struct page hype_pages[])
+{
+ int ret;
+ unsigned int i;
+
+ ret = alloc_hypervisor_pte_pages();
+ if (ret)
+ return ret;
+
+ for_each_possible_cpu(i)
+ populate_hypervisor_pte_page(i);
+ return 0;
+}
+
+__exit void free_pagetables(void)
+{
+ free_hypervisor_pte_pages();
+}
===================================================================
--- /dev/null
+++ b/arch/i386/lguest/segments.c
@@ -0,0 +1,171 @@
+#include "lg.h"
+
+/* Dealing with GDT entries is such a horror, I convert to sanity and back */
+struct decoded_gdt_entry
+{
+ u32 base, limit;
+ union {
+ struct {
+ unsigned type:4;
+ unsigned dtype:1;
+ unsigned dpl:2;
+ unsigned present:1;
+ unsigned unused:4;
+ unsigned avl:1;
+ unsigned mbz:1;
+ unsigned def:1;
+ unsigned page_granularity:1;
+ };
+ u16 raw_attributes;
+ };
+};
+
+static struct decoded_gdt_entry decode_gdt_entry(const struct desc_struct *en)
+{
+ struct decoded_gdt_entry de;
+ de.base = ((en->a >> 16) | ((en->b & 0xff) << 16)
+ | (en->b & 0xFF000000));
+ de.limit = ((en->a & 0xFFFF) | (en->b & 0xF0000));
+ de.raw_attributes = (en->b >> 8);
+ return de;
+}
+
+static struct desc_struct encode_gdt_entry(const struct decoded_gdt_entry *de)
+{
+ struct desc_struct en;
+ en.a = ((de->limit & 0xFFFF) | (de->base << 16));
+ en.b = (((de->base >> 16) & 0xFF)
+ | ((((u32)de->raw_attributes) & 0xF0FF) << 8)
+ | (de->limit & 0xF0000)
+ | (de->base & 0xFF000000));
+ return en;
+}
+
+static int check_desc(const struct decoded_gdt_entry *dec)
+{
+ return (dec->mbz == 0 && dec->dtype == 1 && (dec->type & 4) == 0);
+}
+
+static void check_segment(const struct desc_struct *gdt, u32 *segreg)
+{
+ if (*segreg > 255 || !(gdt[*segreg >> 3].b & 0x8000))
+ *segreg = 0;
+}
+
+/* Ensure our manually-loaded segment regs don't fault in switch_to_guest. */
+static void check_live_segments(const struct desc_struct *gdt,
+ struct lguest_regs *regs)
+{
+ check_segment(gdt, ®s->es);
+ check_segment(gdt, ®s->ds);
+ check_segment(gdt, ®s->fs);
+ check_segment(gdt, ®s->gs);
+}
+
+int fixup_gdt_table(struct desc_struct *gdt, unsigned int num,
+ struct lguest_regs *regs, struct x86_tss *tss)
+{
+ unsigned int i;
+ struct decoded_gdt_entry dec;
+
+ for (i = 0; i < num; i++) {
+ unsigned long base, length;
+
+ /* We override these ones, so we don't care what they give. */
+ if (i == GDT_ENTRY_TSS
+ || i == GDT_ENTRY_LGUEST_CS
+ || i == GDT_ENTRY_LGUEST_DS
+ || i == GDT_ENTRY_DOUBLEFAULT_TSS)
+ continue;
+
+ dec = decode_gdt_entry(&gdt[i]);
+ if (!dec.present)
+ continue;
+
+ if (!check_desc(&dec))
+ return 0;
+
+ base = dec.base;
+ length = dec.limit + 1;
+ if (dec.page_granularity) {
+ base *= PAGE_SIZE;
+ length *= PAGE_SIZE;
+ }
+
+ /* Unacceptable base? */
+ if (base >= HYPE_ADDR)
+ return 0;
+
+ /* Wrap around or segment overlaps hypervisor mem? */
+ if (!length
+ || base + length < base
+ || base + length > HYPE_ADDR) {
+ /* Trim to edge of hypervisor. */
+ length = HYPE_ADDR - base;
+ if (dec.page_granularity)
+ dec.limit = (length / PAGE_SIZE) - 1;
+ else
+ dec.limit = length - 1;
+ }
+ if (dec.dpl == 0)
+ dec.dpl = GUEST_DPL;
+ gdt[i] = encode_gdt_entry(&dec);
+ }
+ check_live_segments(gdt, regs);
+
+ /* Now put in hypervisor data and code segments. */
+ gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
+ gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
+
+ /* Finally, TSS entry */
+ dec.base = (unsigned long)tss;
+ dec.limit = sizeof(*tss)-1;
+ dec.type = 0x9;
+ dec.dtype = 0;
+ dec.def = 0;
+ dec.present = 1;
+ dec.mbz = 0;
+ dec.page_granularity = 0;
+ gdt[GDT_ENTRY_TSS] = encode_gdt_entry(&dec);
+
+ return 1;
+}
+
+void load_guest_gdt(struct lguest *lg, u32 table, u32 num)
+{
+ if (num > GDT_ENTRIES)
+ kill_guest(lg, "too many gdt entries %i", num);
+
+ lhread(lg, lg->state->gdt_table, table,
+ num * sizeof(lg->state->gdt_table[0]));
+ if (!fixup_gdt_table(lg->state->gdt_table, num,
+ &lg->state->regs, &lg->state->tss))
+ kill_guest(lg, "bad gdt table");
+}
+
+/* We don't care about limit here, since we only let them use these in
+ * usermode (where lack of USER bit in pagetable protects hypervisor mem).
+ * However, we want to ensure it doesn't fault when loaded, since *we* are
+ * the ones who will load it in switch_to_guest.
+ */
+void guest_load_tls(struct lguest *lg, const struct desc_struct __user *gtls)
+{
+ unsigned int i;
+ struct desc_struct *tls = &lg->state->gdt_table[GDT_ENTRY_TLS_MIN];
+
+ lhread(lg, tls, (u32)gtls, sizeof(*tls)*GDT_ENTRY_TLS_ENTRIES);
+ for (i = 0; i < ARRAY_SIZE(lg->tls_limits); i++) {
+ struct decoded_gdt_entry dec = decode_gdt_entry(&tls[i]);
+
+ if (!dec.present)
+ continue;
+
+ /* We truncate to one byte/page (depending on G bit) to neuter
+ it, so ensure it's more than 1 page below trap page. */
+ tls[i].a &= 0xFFFF0000;
+ lg->tls_limits[i] = dec.limit;
+ if (!check_desc(&dec) || dec.base > HYPE_ADDR - PAGE_SIZE)
+ kill_guest(lg, "bad TLS descriptor %i", i);
+ }
+ check_live_segments(lg->state->gdt_table, &lg->state->regs);
+}
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists