--- i386/lguest.c	2007-04-02 16:19:27.000000000 -0300
+++ lguest.c	2007-04-02 16:19:28.000000000 -0300
@@ -29,11 +29,22 @@
 #include
 #include
 #include
+
+typedef uint64_t u64;
 typedef uint32_t u32;
 typedef uint16_t u16;
 typedef uint8_t u8;
-#include "../../../include/asm/lguest_user.h"
+#include "../../include/asm/lguest_user.h"
+#include
+
+unsigned long (*finish)(unsigned long mem, unsigned long *page_offset,
+			const char *initrd, unsigned long *ird_size);
+
+typedef unsigned long (*load_function)(int, void *, unsigned long,
+		unsigned long *, const char *, unsigned long *,
+		unsigned long *);
+
 #define PAGE_PRESENT 0x7 /* Present, RW, Execute */
 #define NET_PEERNUM 1
@@ -63,8 +74,8 @@ struct device
 	/* Watch DMA to this address if handle_input non-NULL. */
 	unsigned long watch_address;
-	u32 (*handle_output)(int fd, const struct iovec *iov,
-			     unsigned int num, struct device *me);
+	unsigned long (*handle_output)(int fd, const struct iovec *iov,
+				       unsigned int num, struct device *me);
 
 	/* Device-specific data. */
 	void *priv;
@@ -78,7 +89,7 @@ static int zero_fd;
    FIXME: vdso gets mapped just under it, and we need to protect that. */
 #define RESERVE_TOP LGUEST_GUEST_TOP - 1024*1024
 
-static u32 memparse(const char *ptr)
+static unsigned long memparse(const char *ptr)
 {
 	char *end;
 	unsigned long ret = strtoul(ptr, &end, 0);
@@ -142,8 +153,8 @@ static void map_memory(unsigned long mem
 		err(1, "Mmaping /dev/zero for %li bytes", mem);
 }
 
-static u32 finish(unsigned long mem, unsigned long *page_offset,
-		  const char *initrd, unsigned long *ird_size)
+static unsigned long finish32(unsigned long mem, unsigned long *page_offset,
+			      const char *initrd, unsigned long *ird_size)
 {
 	u32 *pgdir = NULL, *linear = NULL;
 	int i, pte_pages;
@@ -169,7 +180,7 @@ static u32 finish(unsigned long mem, uns
 	/* Now set up pgd so that this memory is at page_offset */
 	for (i = 0; i < mem / getpagesize(); i += getpagesize()/sizeof(u32)) {
 		pgdir[(i + *page_offset/getpagesize())/1024]
-			= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
+			= (((u32)(long)linear + i*sizeof(u32)) | PAGE_PRESENT);
 		verbose("Top level %lu = %#08x\n",
 			(i + *page_offset/getpagesize())/1024,
 			pgdir[(i + *page_offset/getpagesize())/1024]);
@@ -178,8 +189,14 @@ static u32 finish(unsigned long mem, uns
 	return (unsigned long)pgdir;
 }
 
+static unsigned long finish64(unsigned long mem, unsigned long *page_offset,
+			      const char *initrd, unsigned long *ird_size)
+{
+	return 0;
+}
+
 /* Returns the entry point */
-static u32 map_elf(int elf_fd, const Elf32_Ehdr *ehdr, unsigned long mem,
+static unsigned long map_elf32(int elf_fd, const Elf32_Ehdr *ehdr, unsigned long mem,
 		   unsigned long *pgdir_addr,
 		   const char *initrd, unsigned long *ird_size,
 		   unsigned long *page_offset)
@@ -210,7 +227,7 @@ static u32 map_elf(int elf_fd, const Elf
 			continue;
 
 		verbose("Section %i: size %i addr %p\n",
-			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+			i, phdr[i].p_memsz, (void *)(long)phdr[i].p_paddr);
 
 		/* We map everything private, writable. */
 		if (phdr[i].p_paddr + phdr[i].p_memsz > mem)
 			errx(1, "Segment %i overlaps end of memory", i);
@@ -227,6 +244,77 @@ static u32 map_elf(int elf_fd, const Elf
 			phdr[i].p_offset -= (phdr[i].p_paddr % getpagesize());
 			phdr[i].p_paddr -= (phdr[i].p_paddr % getpagesize());
 		}
+		addr = mmap((void *)(long)phdr[i].p_paddr,
+			    phdr[i].p_filesz,
+			    PROT_READ|PROT_WRITE|PROT_EXEC,
+			    MAP_FIXED|MAP_PRIVATE,
+			    elf_fd, phdr[i].p_offset);
+		if (addr != (void *)(long)phdr[i].p_paddr)
+			err(1, "Mmaping vmlinux segment %i returned %p not %p (%p)",
+			    i, addr, (void *)(long)phdr[i].p_paddr, &phdr[i].p_paddr);
+	}
+
+	*pgdir_addr = finish(mem, page_offset, initrd, ird_size);
+	/* Entry is physical address: convert to virtual */
+	return ehdr->e_entry + *page_offset;
+}
+
+/* Returns the entry point */
+static unsigned long map_elf64(int elf_fd, const Elf64_Ehdr *ehdr, unsigned long mem,
+			       unsigned long *pgdir_addr,
+			       const char *initrd, unsigned long *ird_size,
+			       unsigned long *page_offset)
+{
+#ifdef CONFIG_X86_64
+	void *addr;
+	Elf64_Phdr phdr[ehdr->e_phnum];
+	unsigned int i;
+	Elf64_Shdr sec[ehdr->e_shnum];
+	Elf64_Sym *syms;
+	char *strtab = NULL;
+	unsigned long nsyms = 0;
+
+	/* Sanity checks. */
+	if (ehdr->e_type != ET_EXEC
+	    || ehdr->e_machine != EM_X86_64
+	    || ehdr->e_phentsize != sizeof(Elf64_Phdr)
+	    || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf64_Phdr))
+		errx(1, "Malformed elf64 header");
+
+	if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
+		err(1, "Seeking to program headers");
+	if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
+		err(1, "Reading program headers");
+
+	map_memory(mem);
+
+	*page_offset = 0;
+	/* We map the loadable segments at virtual addresses corresponding
+	 * to their physical addresses (our virtual == guest physical). */
+	for (i = 0; i < ehdr->e_phnum; i++) {
+		if (phdr[i].p_type != PT_LOAD)
+			continue;
+
+		verbose("Section %i: size %li addr %p\n",
+			i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+
+		/* We map everything private, writable. */
+		if (phdr[i].p_paddr + phdr[i].p_memsz > mem)
+			errx(1, "Segment %i overlaps end of memory", i);
+
+		/* We expect a linear address space. */
+		if (!*page_offset)
+			*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
+		else if ((*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr) &&
+			 phdr[i].p_vaddr != VSYSCALL_START)
+			errx(1, "Page offset of section %i different (got %lx, expected %lx)",
+			     i, (phdr[i].p_vaddr - phdr[i].p_paddr), *page_offset);
+
+		/* Recent ld versions don't page align any more. */
+		if (phdr[i].p_paddr % getpagesize()) {
+			phdr[i].p_filesz += (phdr[i].p_paddr % getpagesize());
+			phdr[i].p_offset -= (phdr[i].p_paddr % getpagesize());
+			phdr[i].p_paddr -= (phdr[i].p_paddr % getpagesize());
+		}
 		addr = mmap((void *)phdr[i].p_paddr,
 			    phdr[i].p_filesz,
 			    PROT_READ|PROT_WRITE|PROT_EXEC,
@@ -237,9 +325,67 @@ static u32 map_elf(int elf_fd, const Elf
 			i, addr, (void *)phdr[i].p_paddr, &phdr[i].p_paddr);
 	}
 
-	*pgdir_addr = finish(mem, page_offset, initrd, ird_size);
+	/* Now process sections, searching for the boot page tables.
+	 * Start by finding the symtab section. */
+	if (lseek(elf_fd, ehdr->e_shoff, SEEK_SET) < 0)
+		err(1, "Seeking to section headers");
+	if (read(elf_fd, sec, sizeof(sec)) != sizeof(sec))
+		err(1, "Reading section headers");
+
+	for (i = 0; i < ehdr->e_shnum; i++) {
+		if (sec[i].sh_type == SHT_SYMTAB) {
+			int ret = 0;
+
+			syms = malloc(sec[i].sh_size);
+			if (!syms)
+				err(1, "Not enough memory for symbol table");
+			ret = lseek(elf_fd, sec[i].sh_offset, SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to symbol table");
+			ret = read(elf_fd, syms, sec[i].sh_size);
+			if (ret != sec[i].sh_size)
+				err(1, "Reading symbol table");
+			nsyms = sec[i].sh_size / sizeof(Elf64_Sym);
+
+			/* symtab links to strtab.  We use it to find
+			 * symbol names. */
+			strtab = malloc(sec[sec[i].sh_link].sh_size);
+			if (!strtab)
+				err(1, "Not enough memory for string table");
+			ret = lseek(elf_fd, sec[sec[i].sh_link].sh_offset, SEEK_SET);
+			if (ret < 0)
+				err(1, "Seeking to string table");
+			ret = read(elf_fd, strtab, sec[sec[i].sh_link].sh_size);
+			if (ret != sec[sec[i].sh_link].sh_size)
+				err(1, "Reading string table");
+			break;
+		}
+	}
+
+	/* We now have a pointer to the symtab; search for the symbol. */
+	for (i = 0; i < nsyms; i++) {
+		if ((syms[i].st_shndx == SHN_UNDEF) || !syms[i].st_name)
+			continue;
+		if (!strcmp("boot_level4_pgt", strtab + syms[i].st_name)) {
+			*pgdir_addr = syms[i].st_value - *page_offset;
+			break;
+		}
+	}
+
+	if (!*pgdir_addr)
+		errx(1, "Unable to find boot pgdir");
+
+	*ird_size = load_initrd(initrd, mem);
 	/* Entry is physical address: convert to virtual */
+	printf("entry=%lx page_offset=%lx entry+page_offset=%lx\n",
+	       ehdr->e_entry, *page_offset, ehdr->e_entry + *page_offset);
 	return ehdr->e_entry + *page_offset;
+#else
+	errno = EINVAL;
+	err(1, "Too many bits! i386 architecture cannot load 64 bit kernels");
+#endif
 }
 
 static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
@@ -254,9 +400,9 @@ static unsigned long intuit_page_offset(
 	errx(1, "could not determine page offset");
 }
 
-static u32 bzimage(int fd, unsigned long mem, unsigned long *pgdir_addr,
-		   const char *initrd, unsigned long *ird_size,
-		   unsigned long *page_offset)
+static unsigned long bzimage(int fd, unsigned long mem, unsigned long *pgdir_addr,
+			     const char *initrd, unsigned long *ird_size,
+			     unsigned long *page_offset)
 {
 	gzFile f;
 	int ret, len = 0;
@@ -277,13 +423,13 @@ static u32 bzimage(int fd, unsigned long
 
 	*pgdir_addr = finish(mem, page_offset, initrd, ird_size);
 	/* Entry is physical address: convert to virtual */
-	return (u32)img + *page_offset;
+	return (long)img + *page_offset;
 }
 
-static u32 load_bzimage(int bzimage_fd, const Elf32_Ehdr *ehdr,
-		  unsigned long mem, unsigned long *pgdir_addr,
-		  const char *initrd, unsigned long *ird_size,
-		  unsigned long *page_offset)
+static unsigned long load_bzimage(int bzimage_fd, const Elf32_Ehdr *ehdr,
+		  unsigned long mem, unsigned long *pgdir_addr,
+		  const char *initrd, unsigned long *ird_size,
+		  unsigned long *page_offset)
 {
 	unsigned char c;
 	int state = 0;
@@ -363,7 +509,7 @@ static struct device *new_device(struct
 				 int fd,
 				 int (*handle_input)(int, struct device *),
 				 unsigned long watch_off,
-				 u32 (*handle_output)(int,
+				 unsigned long (*handle_output)(int,
 						      const struct iovec *,
 						      unsigned,
 						      struct device *))
@@ -384,16 +530,16 @@ static struct device *new_device(struct
 	return dev;
 }
 
-static int tell_kernel(u32 pagelimit, u32 pgdir, u32 start, u32 page_offset)
+static int tell_kernel(long pagelimit, long pgdir, long start, long page_offset)
 {
-	u32 args[] = { LHREQ_INITIALIZE,
+	unsigned long args[] = { LHREQ_INITIALIZE,
 		       pagelimit, pgdir, start, page_offset };
 	int fd = open("/dev/lguest", O_RDWR);
 
 	if (fd < 0)
 		err(1, "Opening /dev/lguest");
 
-	verbose("Telling kernel limit %u, pgdir %i, e=%#08x page_off=0x%08x\n",
+	verbose("Telling kernel limit %lu, pgdir %li, e=%#08lx page_off=0x%08lx\n",
 		pagelimit, pgdir, start, page_offset);
 	if (write(fd, args, sizeof(args)) < 0)
 		err(1, "Writing to /dev/lguest");
@@ -423,7 +569,7 @@ static void *_check_pointer(unsigned lon
 #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
 
 /* Returns pointer to dma->used_len */
-static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
+static unsigned long *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
 {
 	unsigned int i;
 	struct lguest_dma *udma;
@@ -446,12 +592,12 @@ static u32 *dma2iov(unsigned long dma, s
 	return &udma->used_len;
 }
 
-static u32 *get_dma_buffer(int fd, void *addr,
+static unsigned long *get_dma_buffer(int fd, void *addr,
 			   struct iovec iov[], unsigned *num, u32 *irq)
 {
-	u32 buf[] = { LHREQ_GETDMA, (u32)addr };
+	unsigned long buf[] = { LHREQ_GETDMA, (unsigned long)addr };
 	unsigned long udma;
-	u32 *res;
+	unsigned long *res;
 
 	udma = write(fd, buf, sizeof(buf));
 	if (udma == (unsigned long)-1)
@@ -466,7 +612,7 @@ static u32 *get_dma_buffer(int fd, void
 
 static void trigger_irq(int fd, u32 irq)
 {
-	u32 buf[] = { LHREQ_IRQ, irq };
+	unsigned long buf[] = { LHREQ_IRQ, irq };
 	if (write(fd, buf, sizeof(buf)) != 0)
 		err(1, "Triggering irq %i", irq);
 }
@@ -486,7 +632,8 @@ struct console_abort
 /* We DMA input to buffer bound at start of console page. */
 static int handle_console_input(int fd, struct device *dev)
 {
-	u32 num, irq = 0, *lenp;
+	u32 num, irq = 0;
+	unsigned long *lenp;
 	int len;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
 	struct console_abort *abort = dev->priv;
@@ -535,19 +682,20 @@ static unsigned long peer_offset(unsigne
 	return 4 * peernum;
 }
 
-static u32 handle_tun_output(int fd, const struct iovec *iov,
-			     unsigned num, struct device *dev)
+static unsigned long handle_tun_output(int fd, const struct iovec *iov,
+				       unsigned num, struct device *dev)
 {
 	/* Now we've seen output, we should warn if we can't get buffers. */
 	*(bool *)dev->priv = true;
 	return writev(dev->fd, iov, num);
 }
 
-static u32 handle_block_output(int fd, const struct iovec *iov,
-			       unsigned num, struct device *dev)
+static unsigned long handle_block_output(int fd, const struct iovec *iov,
+					 unsigned num, struct device *dev)
 {
 	struct lguest_block_page *p = dev->mem;
-	u32 irq, reply_num, *lenp;
+	u32 irq, reply_num;
+	unsigned long *lenp;
 	int len;
 	struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
 	off64_t device_len, off = (off64_t)p->sector * 512;
@@ -555,11 +703,13 @@ static u32 handle_block_output(int fd, c
 	device_len = *(off64_t *)dev->priv;
 
 	if (off >= device_len)
-		err(1, "Bad offset %llu vs %llu", off, device_len);
+		err(1, "Bad offset %llu vs %llu", (unsigned long long)off,
+		    (unsigned long long)device_len);
 	if (lseek64(dev->fd, off, SEEK_SET) != off)
 		err(1, "Bad seek to sector %i", p->sector);
 
-	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+	verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ",
+		(unsigned long long)off);
 
 	lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
 	if (!lenp)
@@ -569,7 +719,8 @@ static u32 handle_block_output(int fd, c
 		len = writev(dev->fd, iov, num);
 		if (off + len > device_len) {
 			ftruncate(dev->fd, device_len);
-			errx(1, "Write past end %llu+%u", off, len);
+			errx(1, "Write past end %llu+%u",
+			     (unsigned long long)off, len);
 		}
 		*lenp = 0;
 	} else {
@@ -639,7 +790,8 @@ static void wakeup(int signo)
 
 static int handle_tun_input(int fd, struct device *dev)
 {
-	u32 irq = 0, num, *lenp;
+	u32 irq = 0, num;
+	unsigned long *lenp;
 	int len;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
 
@@ -836,8 +988,8 @@ static void setup_block_file(const char
 		(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
 }
 
-static u32 handle_console_output(int fd, const struct iovec *iov,
-				 unsigned num, struct device*dev)
+static unsigned long handle_console_output(int fd, const struct iovec *iov,
+					   unsigned num, struct device *dev)
 {
 	return writev(STDOUT_FILENO, iov, num);
 }
@@ -871,11 +1023,11 @@ static const char *get_arg(const char *a
 	return NULL;
 }
 
-static u32 handle_device(int fd, unsigned long dma, unsigned long addr,
+static long handle_device(int fd, unsigned long dma, unsigned long addr,
 			 struct devices *devices)
 {
 	struct device *i;
-	u32 *lenp;
+	unsigned long *lenp;
 	struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
 	unsigned num = 0;
@@ -916,20 +1068,45 @@ static void handle_input(int fd, int chi
 	}
 }
 
+static unsigned long load_elf_header(unsigned char *elf_nident)
+{
+	errno = 0;
+	switch (elf_nident[EI_CLASS]) {
+	case ELFCLASS32:
+		finish = finish32;
+		if (memcmp(elf_nident, ELFMAG, SELFMAG) == 0)
+			return (unsigned long)map_elf32;
+		else
+			return (unsigned long)load_bzimage;
+		break;
+	case ELFCLASS64:
+		finish = finish64;
+		if (memcmp(elf_nident, ELFMAG, SELFMAG) == 0)
+			return (unsigned long)map_elf64;
+		else
+			return (unsigned long)load_bzimage;
+		break;
+	default:
+		/* Unrecognized ELF class. */
+		errno = EINVAL;
+		return 0;
+	}
+
+}
+
 int main(int argc, char *argv[])
 {
 	unsigned long mem, pgdir, entry, initrd_size, page_offset;
 	int arg, kern_fd, fd, child, pipefd[2];
-	Elf32_Ehdr hdr;
+	/* Worst case: the Elf64 header is the larger of the two. */
+	Elf64_Ehdr hdr;
 	struct sigaction act;
 	sigset_t sigset;
 	struct lguest_device_desc *devdescs;
 	struct devices devices;
 	struct lguest_boot_info *boot = (void *)0;
 	const char *initrd_name = NULL;
-	u32 (*load)(int, const Elf32_Ehdr *ehdr, unsigned long,
-		    unsigned long *, const char *, unsigned long *,
-		    unsigned long *);
+	load_function load;
 
 	if (argv[1] && strcmp(argv[1], "--verbose") == 0) {
 		verbose = true;
@@ -954,10 +1131,10 @@ int main(int argc, char *argv[])
 	if (read(kern_fd, &hdr, sizeof(hdr)) != sizeof(hdr))
 		err(1, "Reading %s elf header", argv[2]);
 
-	if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
-		load = map_elf;
-	else
-		load = load_bzimage;
+	load = (load_function)load_elf_header(hdr.e_ident);
+
+	if (!load)
+		err(1, "Could not identify file class");
 
 	devices.max_infd = -1;
 	devices.dev = NULL;
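
A note on the dispatch this patch introduces: load_elf_header() peeks at
e_ident[EI_CLASS] to pick the 32-bit or 64-bit loader (and the matching
finish routine) through function pointers, falling back to load_bzimage
when the ELF magic is absent. Below is a minimal, self-contained sketch of
the same idea; every name in it is illustrative rather than taken from the
patch, and it tests the ELF magic before the class byte, since byte 4 of a
non-ELF image is arbitrary:

#include <elf.h>
#include <stdio.h>
#include <string.h>

typedef int (*loader_fn)(int fd);

static int load_elf32(int fd) { (void)fd; puts("32-bit ELF"); return 0; }
static int load_elf64(int fd) { (void)fd; puts("64-bit ELF"); return 0; }
static int load_bzimg(int fd) { (void)fd; puts("bzImage");    return 0; }

/* Choose a loader from the first EI_NIDENT bytes of the image. */
static loader_fn pick_loader(const unsigned char *e_ident)
{
	/* No ELF magic: treat the image as a bzImage. */
	if (memcmp(e_ident, ELFMAG, SELFMAG) != 0)
		return load_bzimg;

	switch (e_ident[EI_CLASS]) {
	case ELFCLASS32:
		return load_elf32;
	case ELFCLASS64:
		return load_elf64;
	default:
		return NULL;	/* unknown ELF class */
	}
}

The patch orders the tests the other way around (class first, then magic
inside each recognized case), which works for the classes it knows about
but sends an unrecognized first-bytes pattern to the EINVAL path rather
than to the bzImage fallback that the old main() code used.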