lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <1423200112-5354-7-git-send-email-rusty@rustcorp.com.au>
Date:	Fri,  6 Feb 2015 15:51:49 +1030
From:	Rusty Russell <rusty@...tcorp.com.au>
To:	"lkml" <linux-kernel@...r.kernel.org>
Cc:	Rusty Russell <rusty@...tcorp.com.au>
Subject: [PATCH 06/29] lguest: send trap 13 through to userspace.

We copy 7 bytes at eip for userspace's instruction decode; we have to
carefully handle the case where eip is at the end of a page.  We can't
leave this to userspace since kernel has all the page table decode
logic.

The decode logic moves to userspace, basically unchanged.

Signed-off-by: Rusty Russell <rusty@...tcorp.com.au>
---
 drivers/lguest/x86/core.c | 133 +++++++++++++----------------------------
 tools/lguest/lguest.c     | 149 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 192 insertions(+), 90 deletions(-)

diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index f7a16b4ea456..42e87bf14113 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -314,95 +314,52 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
  * usually attached to a PC.
  *
  * When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here.  We see if it's one of those troublesome
- * instructions and skip over it.  We return true if we did.
+ * Protection Fault) and come here.  We queue this to be sent out to the
+ * Launcher to handle.
  */
-static int emulate_insn(struct lg_cpu *cpu)
-{
-	u8 insn;
-	unsigned int insnlen = 0, in = 0, small_operand = 0;
-	/*
-	 * The eip contains the *virtual* address of the Guest's instruction:
-	 * walk the Guest's page tables to find the "physical" address.
-	 */
-	unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
-
-	/*
-	 * This must be the Guest kernel trying to do something, not userspace!
-	 * The bottom two bits of the CS segment register are the privilege
-	 * level.
-	 */
-	if ((cpu->regs->cs & 3) != GUEST_PL)
-		return 0;
 
-	/* Decoding x86 instructions is icky. */
-	insn = lgread(cpu, physaddr, u8);
-
-	/*
-	 * Around 2.6.33, the kernel started using an emulation for the
-	 * cmpxchg8b instruction in early boot on many configurations.  This
-	 * code isn't paravirtualized, and it tries to disable interrupts.
-	 * Ignore it, which will Mostly Work.
-	 */
-	if (insn == 0xfa) {
-		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
-		cpu->regs->eip++;
-		return 1;
+/*
+ * The eip contains the *virtual* address of the Guest's instruction:
+ * we copy the instruction here so the Launcher doesn't have to walk
+ * the page tables to decode it.  We handle the case (eg. in a kernel
+ * module) where the instruction is over two pages, and the pages are
+ * virtually but not physically contiguous.
+ *
+ * The longest possible x86 instruction is 15 bytes, but we don't handle
+ * anything that strange.
+ */
+static void copy_from_guest(struct lg_cpu *cpu,
+			    void *dst, unsigned long vaddr, size_t len)
+{
+	size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
+	unsigned long paddr;
+
+	BUG_ON(len > PAGE_SIZE);
+
+	/* If it goes over a page, copy in two parts. */
+	if (len > to_page_end) {
+		/* But make sure the next page is mapped! */
+		if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
+			copy_from_guest(cpu, dst + to_page_end,
+					vaddr + to_page_end,
+					len - to_page_end);
+		else
+			/* Otherwise fill with zeroes. */
+			memset(dst + to_page_end, 0, len - to_page_end);
+		len = to_page_end;
 	}
 
-	/*
-	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
-	 */
-	if (insn == 0x66) {
-		small_operand = 1;
-		/* The instruction is 1 byte so far, read the next byte. */
-		insnlen = 1;
-		insn = lgread(cpu, physaddr + insnlen, u8);
-	}
+	/* This will kill the guest if it isn't mapped, but that
+	 * shouldn't happen. */
+	__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
+}
 
-	/*
-	 * We can ignore the lower bit for the moment and decode the 4 opcodes
-	 * we need to emulate.
-	 */
-	switch (insn & 0xFE) {
-	case 0xE4: /* in     <next byte>,%al */
-		insnlen += 2;
-		in = 1;
-		break;
-	case 0xEC: /* in     (%dx),%al */
-		insnlen += 1;
-		in = 1;
-		break;
-	case 0xE6: /* out    %al,<next byte> */
-		insnlen += 2;
-		break;
-	case 0xEE: /* out    %al,(%dx) */
-		insnlen += 1;
-		break;
-	default:
-		/* OK, we don't know what this is, can't emulate. */
-		return 0;
-	}
 
-	/*
-	 * If it was an "IN" instruction, they expect the result to be read
-	 * into %eax, so we change %eax.  We always return all-ones, which
-	 * traditionally means "there's nothing there".
-	 */
-	if (in) {
-		/* Lower bit tells means it's a 32/16 bit access */
-		if (insn & 0x1) {
-			if (small_operand)
-				cpu->regs->eax |= 0xFFFF;
-			else
-				cpu->regs->eax = 0xFFFFFFFF;
-		} else
-			cpu->regs->eax |= 0xFF;
-	}
-	/* Finally, we've "done" the instruction, so move past it. */
-	cpu->regs->eip += insnlen;
-	/* Success! */
-	return 1;
+static void setup_emulate_insn(struct lg_cpu *cpu)
+{
+	cpu->pending.trap = 13;
+	copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
+			sizeof(cpu->pending.insn));
 }
 
 /*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
@@ -410,14 +367,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
 {
 	switch (cpu->regs->trapnum) {
 	case 13: /* We've intercepted a General Protection Fault. */
-		/*
-		 * Check if this was one of those annoying IN or OUT
-		 * instructions which we need to emulate.  If so, we just go
-		 * back into the Guest after we've done it.
-		 */
+		/* Hand to Launcher to emulate those pesky IN and OUT insns */
 		if (cpu->regs->errcode == 0) {
-			if (emulate_insn(cpu))
-				return;
+			setup_emulate_insn(cpu);
+			return;
 		}
 		break;
 	case 14: /* We've intercepted a Page Fault. */
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
index 0e754d04876d..b2217657f62c 100644
--- a/tools/lguest/lguest.c
+++ b/tools/lguest/lguest.c
@@ -41,6 +41,7 @@
 #include <signal.h>
 #include <pwd.h>
 #include <grp.h>
+#include <sys/user.h>
 
 #ifndef VIRTIO_F_ANY_LAYOUT
 #define VIRTIO_F_ANY_LAYOUT		27
@@ -1143,6 +1144,150 @@ static void handle_output(unsigned long addr)
 	      strnlen(from_guest_phys(addr), guest_limit - addr));
 }
 
+/*L:216
+ * This is where we emulate a handful of Guest instructions.  It's ugly
+ * and we used to do it in the kernel but it grew over time.
+ */
+
+/*
+ * We use the ptrace syscall's pt_regs struct to talk about registers
+ * to lguest: these macros convert the names to the offsets.
+ */
+#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
+#define setreg(name, val) \
+	setreg_off(offsetof(struct user_regs_struct, name), (val))
+
+static u32 getreg_off(size_t offset)
+{
+	u32 r;
+	unsigned long args[] = { LHREQ_GETREG, offset };
+
+	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
+		err(1, "Getting register %u", offset);
+	if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
+		err(1, "Reading register %u", offset);
+
+	return r;
+}
+
+static void setreg_off(size_t offset, u32 val)
+{
+	unsigned long args[] = { LHREQ_SETREG, offset, val };
+
+	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
+		err(1, "Setting register %u", offset);
+}
+
+static void emulate_insn(const u8 insn[])
+{
+	unsigned long args[] = { LHREQ_TRAP, 13 };
+	unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
+	unsigned int eax, port, mask;
+	/*
+	 * We always return all-ones on IO port reads, which traditionally
+	 * means "there's nothing there".
+	 */
+	u32 val = 0xFFFFFFFF;
+
+	/*
+	 * This must be the Guest kernel trying to do something, not userspace!
+	 * The bottom two bits of the CS segment register are the privilege
+	 * level.
+	 */
+	if ((getreg(xcs) & 3) != 0x1)
+		goto no_emulate;
+
+	/* Decoding x86 instructions is icky. */
+
+	/*
+	 * Around 2.6.33, the kernel started using an emulation for the
+	 * cmpxchg8b instruction in early boot on many configurations.  This
+	 * code isn't paravirtualized, and it tries to disable interrupts.
+	 * Ignore it, which will Mostly Work.
+	 */
+	if (insn[insnlen] == 0xfa) {
+		/* "cli", or Clear Interrupt Enable instruction.  Skip it. */
+		insnlen = 1;
+		goto skip_insn;
+	}
+
+	/*
+	 * 0x66 is an "operand prefix".  It means a 16, not 32 bit in/out.
+	 */
+	if (insn[insnlen] == 0x66) {
+		small_operand = 1;
+		/* The instruction is 1 byte so far, read the next byte. */
+		insnlen = 1;
+	}
+
+	/* If the lower bit isn't set, it's a single byte access */
+	byte_access = !(insn[insnlen] & 1);
+
+	/*
+	 * Now we can ignore the lower bit and decode the 4 opcodes
+	 * we need to emulate.
+	 */
+	switch (insn[insnlen] & 0xFE) {
+	case 0xE4: /* in     <next byte>,%al */
+		port = insn[insnlen+1];
+		insnlen += 2;
+		in = 1;
+		break;
+	case 0xEC: /* in     (%dx),%al */
+		port = getreg(edx) & 0xFFFF;
+		insnlen += 1;
+		in = 1;
+		break;
+	case 0xE6: /* out    %al,<next byte> */
+		port = insn[insnlen+1];
+		insnlen += 2;
+		break;
+	case 0xEE: /* out    %al,(%dx) */
+		port = getreg(edx) & 0xFFFF;
+		insnlen += 1;
+		break;
+	default:
+		/* OK, we don't know what this is, can't emulate. */
+		goto no_emulate;
+	}
+
+	/* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
+	if (byte_access)
+		mask = 0xFF;
+	else if (small_operand)
+		mask = 0xFFFF;
+	else
+		mask = 0xFFFFFFFF;
+
+	/*
+	 * If it was an "IN" instruction, they expect the result to be read
+	 * into %eax, so we change %eax.
+	 */
+	eax = getreg(eax);
+
+	if (in) {
+		/* Clear the bits we're about to read */
+		eax &= ~mask;
+		/* Copy bits in from val. */
+		eax |= val & mask;
+		/* Now update the register. */
+		setreg(eax, eax);
+	}
+
+	verbose("IO %s of %x to %u: %#08x\n",
+		in ? "IN" : "OUT", mask, port, eax);
+skip_insn:
+	/* Finally, we've "done" the instruction, so move past it. */
+	setreg(eip, getreg(eip) + insnlen);
+	return;
+
+no_emulate:
+	/* Inject trap into Guest. */
+	if (write(lguest_fd, args, sizeof(args)) < 0)
+		err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
+}
+
+
 /*L:190
  * Device Setup
  *
@@ -1832,6 +1977,10 @@ static void __attribute__((noreturn)) run_guest(void)
 				verbose("Notify on address %#08x\n",
 					notify.addr);
 				handle_output(notify.addr);
+			} else if (notify.trap == 13) {
+				verbose("Emulating instruction at %#x\n",
+					getreg(eip));
+				emulate_insn(notify.insn);
 			} else
 				errx(1, "Unknown trap %i addr %#08x\n",
 				     notify.trap, notify.addr);
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ