Makefile | 9 +- i386/defines | 1 i386/lguest_defs.h | 9 ++ lguest.c | 177 +++++++++++++++++++++++++++++++++++++++------------ x86_64/defines | 1 x86_64/lguest_defs.h | 11 +++ 6 files changed, 165 insertions(+), 43 deletions(-) Index: linux-2.6-mm/Documentation/lguest/Makefile =================================================================== --- linux-2.6-mm.orig/Documentation/lguest/Makefile +++ linux-2.6-mm/Documentation/lguest/Makefile @@ -1,12 +1,13 @@ # This creates the demonstration utility "lguest" which runs a Linux guest. -# We rely on CONFIG_PAGE_OFFSET to know where to put lguest binary. -# Some shells (dash - ubunu) can't handle numbers that big so we cheat. +# We could uname -i, but it seems to return unknown on a bunch of locations +ARCH:=$(shell uname -m | sed s/i[3456]86/i386/) + include ../../.config -LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) +include $(ARCH)/defines CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 \ - -static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds + -static -DLGUEST_GUEST_TOP="$(LGUEST_GUEST_TOP)" -Wl,-T,lguest.lds -I$(ARCH) LDLIBS:=-lz all: lguest.lds lguest Index: linux-2.6-mm/Documentation/lguest/i386/defines =================================================================== --- /dev/null +++ linux-2.6-mm/Documentation/lguest/i386/defines @@ -0,0 +1 @@ +LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000) Index: linux-2.6-mm/Documentation/lguest/i386/lguest_defs.h =================================================================== --- /dev/null +++ linux-2.6-mm/Documentation/lguest/i386/lguest_defs.h @@ -0,0 +1,9 @@ +#ifndef _LGUEST_DEFS_H_ +#define _LGUEST_DEFS_H_ + +#include "../../../include/linux/lguest_launcher.h" +typedef Elf32_Ehdr Elf_Ehdr; +typedef Elf32_Phdr Elf_Phdr; +#define ELF_MACHINE EM_386 + +#endif Index: linux-2.6-mm/Documentation/lguest/lguest.c =================================================================== --- 
linux-2.6-mm.orig/Documentation/lguest/lguest.c +++ linux-2.6-mm/Documentation/lguest/lguest.c @@ -30,10 +30,11 @@ #include #include #include +typedef uint64_t u64; typedef uint32_t u32; typedef uint16_t u16; typedef uint8_t u8; -#include "../../include/linux/lguest_launcher.h" +#include #define PAGE_PRESENT 0x7 /* Present, RW, Execute */ #define NET_PEERNUM 1 @@ -64,8 +65,8 @@ struct device /* Watch DMA to this key if handle_input non-NULL. */ unsigned long watch_key; - u32 (*handle_output)(int fd, const struct iovec *iov, - unsigned int num, struct device *me); + unsigned long (*handle_output)(int fd, const struct iovec *iov, + unsigned int num, struct device *me); /* Device-specific data. */ void *priv; @@ -93,21 +94,31 @@ static void *map_zeroed_pages(unsigned l return (void *)addr; } +static inline bool is_vsyscall_segment(Elf_Phdr *phdr) +{ +#ifdef __x86_64__ + return phdr->p_vaddr == VSYSCALL_START; +#else + return 0; +#endif +} + /* Returns the entry point */ -static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr, +static unsigned long map_elf(int elf_fd, Elf_Ehdr *ehdr, unsigned long *page_offset) { + Elf_Phdr phdr[ehdr->e_phnum]; void *addr; - Elf32_Phdr phdr[ehdr->e_phnum]; unsigned int i; /* Sanity checks. 
*/ if (ehdr->e_type != ET_EXEC - || ehdr->e_machine != EM_386 - || ehdr->e_phentsize != sizeof(Elf32_Phdr) - || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) + || ehdr->e_machine != ELF_MACHINE + || ehdr->e_phentsize != sizeof(phdr[0]) + || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(phdr[0])) errx(1, "Malformed elf header"); + if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) err(1, "Seeking to program headers"); if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) @@ -120,13 +131,14 @@ static unsigned long map_elf(int elf_fd, if (phdr[i].p_type != PT_LOAD) continue; - verbose("Section %i: size %i addr %p\n", - i, phdr[i].p_memsz, (void *)phdr[i].p_paddr); + verbose("Section %i: size %lu addr %p\n", + i, (unsigned long)phdr[i].p_memsz, (void *)phdr[i].p_paddr); /* We expect linear address space. */ if (!*page_offset) *page_offset = phdr[i].p_vaddr - phdr[i].p_paddr; - else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) + else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) + && (!is_vsyscall_segment(&phdr[i]))) errx(1, "Page offset of section %i different", i); /* We map everything private, writable. */ @@ -210,15 +222,18 @@ static unsigned long load_bzimage(int fd errx(1, "Could not find kernel in bzImage"); } -static unsigned long load_kernel(int fd, unsigned long *page_offset) +/* In x86_64 systems, page table hierarchy does not start at a common location, + * but rather, is indicated by a symbol called boot_level4_pgt. 
Due to this, we + * need the elf header to survive this function, so we can look up the pgdir */ +static unsigned long load_kernel(int fd, unsigned long *page_offset, + Elf_Ehdr *hdr) { - Elf32_Ehdr hdr; - if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr)) + if (read(fd, hdr, sizeof(*hdr)) != sizeof(*hdr)) err(1, "Reading kernel"); - if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0) - return map_elf(fd, &hdr, page_offset); + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) == 0) + return map_elf(fd, hdr, page_offset); return load_bzimage(fd, page_offset); } @@ -250,6 +265,7 @@ static inline unsigned long page_align(u return ((addr + getpagesize()-1) & ~(getpagesize()-1)); } +#ifndef __x86_64__ static unsigned long setup_pagetables(unsigned long mem, unsigned long initrd_size, unsigned long page_offset) @@ -285,6 +301,74 @@ static unsigned long setup_pagetables(un return (unsigned long)pgdir; } +#else +static unsigned long find_pgt_symbol(int elf_fd, + Elf_Ehdr *ehdr, + unsigned long page_offset) +{ + unsigned int i; + Elf64_Shdr sec[ehdr->e_shnum]; + Elf64_Sym *syms; + char *strtab = NULL; + unsigned long pgdir_addr = 0; + unsigned long nsyms = 0; + + /* Now process sections searching for boot page tables + * Start by finding the symtab section */ + if (lseek(elf_fd, ehdr->e_shoff, SEEK_SET) < 0) + err(1, "Seeking to section headers"); + if (read(elf_fd, sec, sizeof(sec)) != sizeof(sec)) + err(1, "Reading section headers"); + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sec[i].sh_type == SHT_SYMTAB) { + int ret = 0; + syms = malloc(sec[i].sh_size); + if (!syms) + err(1,"Not enough memory for symbol table"); + ret = lseek(elf_fd, sec[i].sh_offset, SEEK_SET); + if (ret < 0) + err(1, "Seeking to symbol table"); + ret = read(elf_fd, syms, sec[i].sh_size); + if (ret != sec[i].sh_size) + err(1, "Reading symbol table"); + nsyms = sec[i].sh_size / sizeof(Elf64_Sym); + + + /* symtab links to strtab. 
We use it to find symbol + * names */ + strtab = malloc(sec[sec[i].sh_link].sh_size); + if (!strtab) + err(1,"Not enough memory for string table"); + ret = lseek(elf_fd, sec[sec[i].sh_link].sh_offset , SEEK_SET); + if (ret < 0) + err(1, "Seeking to string table"); + ret = read(elf_fd, strtab, sec[sec[i].sh_link].sh_size); + if (ret != sec[sec[i].sh_link].sh_size) + err(1, "Reading string table"); + break; + } + } + + /* We now have a pointer to the symtab, start searching for the symbol */ + for (i = 0; i < nsyms; i++) { + if ((syms[i].st_shndx == SHN_UNDEF) || !syms[i].st_name) + continue; + if (!strcmp("boot_level4_pgt", + (char *)((u64)syms[i].st_name + strtab))) { + pgdir_addr = syms[i].st_value - page_offset; + break; + } + } + + if (!pgdir_addr) { + errno = ESRCH; + err(1,"Unable to find boot pgdir"); + } + + return pgdir_addr; +} +#endif static void concat(char *dst, char *args[]) { @@ -299,9 +383,10 @@ static void concat(char *dst, char *args dst[len] = '\0'; } -static int tell_kernel(u32 pgdir, u32 start, u32 page_offset) +static int tell_kernel(unsigned long pgdir, unsigned long start, + unsigned long page_offset) { - u32 args[] = { LHREQ_INITIALIZE, + unsigned long args[] = { LHREQ_INITIALIZE, LGUEST_GUEST_TOP/getpagesize(), /* Just below us */ pgdir, start, page_offset }; int fd; @@ -379,7 +464,8 @@ static void *_check_pointer(unsigned lon #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) /* Returns pointer to dma->used_len */ -static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num) +static unsigned long *dma2iov(unsigned long dma, struct iovec iov[], + unsigned *num) { unsigned int i; struct lguest_dma *udma; @@ -396,12 +482,12 @@ static u32 *dma2iov(unsigned long dma, s return &udma->used_len; } -static u32 *get_dma_buffer(int fd, void *key, +static unsigned long *get_dma_buffer(int fd, void *key, struct iovec iov[], unsigned int *num, u32 *irq) { - u32 buf[] = { LHREQ_GETDMA, (u32)key }; + unsigned long buf[] = { 
LHREQ_GETDMA, (unsigned long)key }; unsigned long udma; - u32 *res; + unsigned long *res; udma = write(fd, buf, sizeof(buf)); if (udma == (unsigned long)-1) @@ -415,7 +501,7 @@ static u32 *get_dma_buffer(int fd, void static void trigger_irq(int fd, u32 irq) { - u32 buf[] = { LHREQ_IRQ, irq }; + unsigned long buf[] = { LHREQ_IRQ, irq }; if (write(fd, buf, sizeof(buf)) != 0) err(1, "Triggering irq %i", irq); } @@ -443,7 +529,8 @@ struct console_abort /* We DMA input to buffer bound at start of console page. */ static bool handle_console_input(int fd, struct device *dev) { - u32 irq = 0, *lenp; + u32 irq = 0; + unsigned long *lenp; int len; unsigned int num; struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; @@ -487,14 +574,14 @@ static bool handle_console_input(int fd, return true; } -static u32 handle_console_output(int fd, const struct iovec *iov, - unsigned num, struct device*dev) +static unsigned long handle_console_output(int fd, const struct iovec *iov, + unsigned num, struct device*dev) { return writev(STDOUT_FILENO, iov, num); } -static u32 handle_tun_output(int fd, const struct iovec *iov, - unsigned num, struct device *dev) +static unsigned long handle_tun_output(int fd, const struct iovec *iov, + unsigned num, struct device *dev) { /* Now we've seen output, we should warn if we can't get buffers. 
*/ *(bool *)dev->priv = true; @@ -508,7 +595,8 @@ static unsigned long peer_offset(unsigne static bool handle_tun_input(int fd, struct device *dev) { - u32 irq = 0, *lenp; + u32 irq = 0; + unsigned long *lenp; int len; unsigned num; struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; @@ -534,11 +622,12 @@ static bool handle_tun_input(int fd, str return true; } -static u32 handle_block_output(int fd, const struct iovec *iov, - unsigned num, struct device *dev) +static unsigned long handle_block_output(int fd, const struct iovec *iov, + unsigned num, struct device *dev) { struct lguest_block_page *p = dev->mem; - u32 irq, *lenp; + u32 irq; + unsigned long *lenp; unsigned int len, reply_num; struct iovec reply[LGUEST_MAX_DMA_SECTIONS]; off64_t device_len, off = (off64_t)p->sector * 512; @@ -546,11 +635,14 @@ static u32 handle_block_output(int fd, c device_len = *(off64_t *)dev->priv; if (off >= device_len) - err(1, "Bad offset %llu vs %llu", off, device_len); + err(1, "Bad offset %llu vs %llu", + (unsigned long long)off, + (unsigned long long)device_len); if (lseek64(dev->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %i", p->sector); - verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off); + verbose("Block: %s at offset %llu\n", p->type ? 
"WRITE" : "READ", + (unsigned long long)off); lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq); if (!lenp) @@ -560,7 +652,8 @@ static u32 handle_block_output(int fd, c len = writev(dev->fd, iov, num); if (off + len > device_len) { ftruncate(dev->fd, device_len); - errx(1, "Write past end %llu+%u", off, len); + errx(1, "Write past end %llu+%u", + (unsigned long long)off, len); } *lenp = 0; } else { @@ -577,7 +670,7 @@ static void handle_output(int fd, unsign struct device_list *devices) { struct device *i; - u32 *lenp; + unsigned long *lenp; struct iovec iov[LGUEST_MAX_DMA_SECTIONS]; unsigned num = 0; @@ -639,7 +732,7 @@ static struct device *new_device(struct int fd, bool (*handle_input)(int, struct device *), unsigned long watch_off, - u32 (*handle_output)(int, + unsigned long (*handle_output)(int, const struct iovec *, unsigned, struct device *)) @@ -916,9 +1009,11 @@ int main(int argc, char *argv[]) { unsigned long mem, pgdir, start, page_offset; int c, lguest_fd, waker_fd; + int elf_fd; struct device_list device_list; struct lguest_boot_info *boot = (void *)0; const char *initrd_name = NULL; + Elf_Ehdr ehdr; device_list.max_infd = -1; device_list.dev = NULL; @@ -958,8 +1053,8 @@ int main(int argc, char *argv[]) map_zeroed_pages(0, mem / getpagesize()); /* Now we load the kernel */ - start = load_kernel(open_or_die(argv[optind+1], O_RDONLY), - &page_offset); + elf_fd = open_or_die(argv[optind+1], O_RDONLY); + start = load_kernel(elf_fd, &page_offset, &ehdr); /* Write the device descriptors into memory. */ map_device_descriptors(&device_list, mem); @@ -969,7 +1064,11 @@ int main(int argc, char *argv[]) boot->initrd_size = load_initrd(initrd_name, mem); /* Set up the initial linar pagetables. */ +#ifndef __x86_64__ pgdir = setup_pagetables(mem, boot->initrd_size, page_offset); +#else + pgdir = find_pgt_symbol(elf_fd, &ehdr, page_offset); +#endif /* Give the guest the boot information it needs. 
*/ concat(boot->cmdline, argv+optind+2); Index: linux-2.6-mm/Documentation/lguest/x86_64/defines =================================================================== --- /dev/null +++ linux-2.6-mm/Documentation/lguest/x86_64/defines @@ -0,0 +1 @@ +LGUEST_GUEST_TOP := 0x7f000000 Index: linux-2.6-mm/Documentation/lguest/x86_64/lguest_defs.h =================================================================== --- /dev/null +++ linux-2.6-mm/Documentation/lguest/x86_64/lguest_defs.h @@ -0,0 +1,11 @@ +#ifndef _LGUEST_DEFS_H_ +#define _LGUEST_DEFS_H_ + +#include + +#include "../../../include/asm/lguest_user.h" +typedef Elf64_Ehdr Elf_Ehdr; +typedef Elf64_Phdr Elf_Phdr; +#define ELF_MACHINE EM_X86_64 + +#endif