Message-ID: <201308121615.r7CGFstm024706@farm-0021.internal.tilera.com>
Date:	Mon, 12 Aug 2013 11:24:11 -0400
From:	Chris Metcalf <cmetcalf@...era.com>
To:	<linux-kernel@...r.kernel.org>, <kvm@...r.kernel.org>,
	Gleb Natapov <gleb@...hat.com>,
	Paolo Bonzini <pbonzini@...hat.com>
Subject: [PATCH] tile: support KVM for tilegx

This change provides the initial framework support for KVM on tilegx.
Basic virtual disk and networking are supported.
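
Guest kernels are built with CONFIG_KVM_GUEST and run at a lower
protection level (PL1) than the host kernel (PL2).  Guest I/O reaches
the host's virtio backends through a hypercall, hcall_virtio(), which
takes a KVM_VIRTIO_* operation code and a guest physical address; the
early console in arch/tile/kernel/early_printk.c uses this path.  As a
minimal sketch of the guest side (illustrative only; the helper below
is not part of this patch):

	#include <linux/kvm_para.h>
	#include <asm/kvm_virtio.h>

	static void notify_host_example(void *desc)
	{
		/* Ask the host to process the descriptor at this guest PA. */
		hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(desc));
	}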

Signed-off-by: Chris Metcalf <cmetcalf@...era.com>
---
 arch/tile/Kconfig                        |   19 +-
 arch/tile/Makefile                       |    1 +
 arch/tile/include/asm/io.h               |    2 +
 arch/tile/include/asm/kvm.h              |   29 +
 arch/tile/include/asm/kvm_host.h         |  119 +++
 arch/tile/include/asm/kvm_para.h         |   20 +
 arch/tile/include/asm/kvm_virtio.h       |   26 +
 arch/tile/include/asm/module.h           |    9 +-
 arch/tile/include/asm/page.h             |   56 +-
 arch/tile/include/asm/pgtable_32.h       |    2 +-
 arch/tile/include/asm/pgtable_64.h       |    3 +-
 arch/tile/include/asm/processor.h        |    6 +-
 arch/tile/include/asm/ptrace.h           |    2 +-
 arch/tile/include/asm/switch_to.h        |   25 +-
 arch/tile/include/asm/thread_info.h      |   17 +-
 arch/tile/include/asm/timex.h            |    8 +
 arch/tile/include/hv/hypervisor.h        |  183 +++-
 arch/tile/include/uapi/arch/sim.h        |   19 +
 arch/tile/include/uapi/arch/sim_def.h    |    8 +
 arch/tile/include/uapi/arch/spr_def_32.h |   15 +
 arch/tile/include/uapi/arch/spr_def_64.h |   25 +
 arch/tile/include/uapi/asm/Kbuild        |    2 +
 arch/tile/include/uapi/asm/kvm.h         |  249 +++++
 arch/tile/include/uapi/asm/kvm_virtio.h  |   60 ++
 arch/tile/kernel/Makefile                |    1 +
 arch/tile/kernel/asm-offsets.c           |    7 +
 arch/tile/kernel/early_printk.c          |   17 +
 arch/tile/kernel/head_32.S               |    4 +-
 arch/tile/kernel/head_64.S               |    6 +-
 arch/tile/kernel/hvglue.S                |    8 +-
 arch/tile/kernel/hvglue_trace.c          |   14 +
 arch/tile/kernel/intvec_32.S             |   18 +-
 arch/tile/kernel/intvec_64.S             |  226 +++--
 arch/tile/kernel/kvm_virtio.c            |  430 ++++++++
 arch/tile/kernel/process.c               |   40 +-
 arch/tile/kernel/relocate_kernel_64.S    |    9 +-
 arch/tile/kernel/setup.c                 |   21 +-
 arch/tile/kernel/smp.c                   |   28 +-
 arch/tile/kernel/stack.c                 |    2 +-
 arch/tile/kernel/sysfs.c                 |    4 +
 arch/tile/kernel/time.c                  |   14 +-
 arch/tile/kernel/traps.c                 |    2 +-
 arch/tile/kernel/vmlinux.lds.S           |   10 +-
 arch/tile/kvm/Kconfig                    |    3 -
 arch/tile/kvm/Makefile                   |   12 +
 arch/tile/kvm/entry.S                    |   91 ++
 arch/tile/kvm/kvm-tile.c                 | 1585 ++++++++++++++++++++++++++++++
 arch/tile/lib/exports.c                  |   20 +-
 arch/tile/mm/elf.c                       |    2 +
 arch/tile/mm/fault.c                     |    4 +-
 arch/tile/mm/init.c                      |    8 +-
 arch/tile/mm/pgtable.c                   |   35 +-
 include/uapi/linux/kvm.h                 |    3 +
 virt/kvm/kvm_main.c                      |    7 +-
 54 files changed, 3338 insertions(+), 198 deletions(-)
 create mode 100644 arch/tile/include/asm/kvm.h
 create mode 100644 arch/tile/include/asm/kvm_host.h
 create mode 100644 arch/tile/include/asm/kvm_para.h
 create mode 100644 arch/tile/include/asm/kvm_virtio.h
 create mode 100644 arch/tile/include/uapi/asm/kvm.h
 create mode 100644 arch/tile/include/uapi/asm/kvm_virtio.h
 create mode 100644 arch/tile/kernel/kvm_virtio.c
 create mode 100644 arch/tile/kvm/Makefile
 create mode 100644 arch/tile/kvm/entry.S
 create mode 100644 arch/tile/kvm/kvm-tile.c

diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig
index ecff467..bbb6d51 100644
--- a/arch/tile/Kconfig
+++ b/arch/tile/Kconfig
@@ -5,7 +5,6 @@ config TILE
 	def_bool y
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_API_DEBUG
-	select HAVE_KVM if !TILEGX
 	select GENERIC_FIND_FIRST_BIT
 	select SYSCTL_EXCEPTION_TRACE
 	select USE_GENERIC_SMP_HELPERS
@@ -113,6 +112,7 @@ config SMP
 	def_bool y
 
 config HVC_TILE
+	depends on !KVM_GUEST
 	depends on TTY
 	select HVC_DRIVER
 	select HVC_IRQ if TILEGX
@@ -127,6 +127,7 @@ config TILEGX
 	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_KPROBES
 	select HAVE_KRETPROBES
+	select HAVE_KVM if !KVM_GUEST
 
 config TILEPRO
 	def_bool !TILEGX
@@ -366,11 +367,23 @@ config HARDWALL
 	bool "Hardwall support to allow access to user dynamic network"
 	default y
 
+config KVM_GUEST
+	bool "Build kernel as guest for KVM"
+	default n
+	depends on TILEGX
+	select VIRTIO
+	select VIRTIO_RING
+	select VIRTIO_CONSOLE
+	---help---
+	  This will build a kernel that runs at a lower protection level
+	  than the default kernel and is suitable to run under KVM.
+
+# TILEPro kernels run at PL1; TILE-Gx runs at PL2 unless it's a KVM guest.
 config KERNEL_PL
 	int "Processor protection level for kernel"
 	range 1 2
-	default 2 if TILEGX
-	default 1 if !TILEGX
+	default 2 if TILEGX && !KVM_GUEST
+	default 1 if !TILEGX || KVM_GUEST
 	---help---
 	  Since MDE 4.2, the Tilera hypervisor runs the kernel
 	  at PL2 by default.  If running under an older hypervisor,
diff --git a/arch/tile/Makefile b/arch/tile/Makefile
index 3d15364..8e7f852 100644
--- a/arch/tile/Makefile
+++ b/arch/tile/Makefile
@@ -62,6 +62,7 @@ libs-y		+= $(LIBGCC_PATH)
 
 # See arch/tile/Kbuild for content of core part of the kernel
 core-y		+= arch/tile/
+core-$(CONFIG_KVM) += arch/tile/kvm/
 
 core-$(CONFIG_TILE_GXIO) += arch/tile/gxio/
 
diff --git a/arch/tile/include/asm/io.h b/arch/tile/include/asm/io.h
index 9fe4349..023659b 100644
--- a/arch/tile/include/asm/io.h
+++ b/arch/tile/include/asm/io.h
@@ -43,6 +43,8 @@
  * long before casting it to a pointer to avoid compiler warnings.
  */
 #if CHIP_HAS_MMIO()
+extern void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+	unsigned long flags, pgprot_t prot);
 extern void __iomem *ioremap(resource_size_t offset, unsigned long size);
 extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
 	pgprot_t pgprot);
diff --git a/arch/tile/include/asm/kvm.h b/arch/tile/include/asm/kvm.h
new file mode 100644
index 0000000..2ea6c41
--- /dev/null
+++ b/arch/tile/include/asm/kvm.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#ifndef _ASM_TILE_KVM_H
+#define _ASM_TILE_KVM_H
+
+#include <hv/hypervisor.h>
+#include <uapi/asm/kvm.h>
+
+#ifndef __ASSEMBLER__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+#define USER_EMULATE(name) [HV_SYS_##name] = kvm_deliver_to_user,
+#define NO_EMULATE(name) [HV_SYS_##name] = kvm_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = kvm_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = kvm_deliver_to_user,
+#endif
+#endif /* _ASM_TILE_KVM_H */
diff --git a/arch/tile/include/asm/kvm_host.h b/arch/tile/include/asm/kvm_host.h
new file mode 100644
index 0000000..8241f50
--- /dev/null
+++ b/arch/tile/include/asm/kvm_host.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _ASM_TILE_KVM_HOST_H
+#define _ASM_TILE_KVM_HOST_H
+
+#define KVM_MAX_VCPUS 64
+#define KVM_USER_MEM_SLOTS 32
+#define KVM_PRIVATE_MEM_SLOTS 4
+
+/* For now, claim we have no huge pages. */
+#define KVM_HPAGE_GFN_SHIFT(x)  0
+#define KVM_NR_PAGE_SIZES       1
+#define KVM_PAGES_PER_HPAGE(x)  1
+
+/* Max number of message tags for hv_send/receive_message() */
+#define MAX_MSG_TAG	(sizeof(unsigned long) * 8)
+
+/* Bits in pending_downcalls */
+#define DOWNCALL_MESSAGE_RCV     0x01  /**< Message receive */
+
+#ifndef __ASSEMBLY__
+
+#include <linux/types.h>
+#include <linux/ptrace.h>
+
+struct kvm_vcpu_stat {
+	/* None yet. */
+};
+
+struct kvm_vcpu_arch {
+	struct pt_regs regs;
+	unsigned long host_sp; /* Host "real" sp during vmresume. */
+	HV_Context guest_context;
+	unsigned long pending_msgs; /* Pending guest messages */
+	unsigned long ipi_events; /* Pending guest ipi events. */
+	unsigned long ipi_gpa; /* pa for hv_get_ipi_pte() */
+	pte_t ipi_gpte; /* pte for hv_get_ipi_pte() */
+	unsigned long fault_addr;  /* addr for VPGTABLE_MISS faults */
+	int suspended;  /* true for cores not yet started by host */
+	unsigned long timer_control;  /* AUX_TILE_TIMER_CONTROL value */
+	unsigned long vmexit_cycles;  /* cycle count of last vmexit */
+
+#define FOR_EACH_GUEST_SPR(f)			\
+	f(INTERRUPT_MASK_1);			\
+	f(INTERRUPT_VECTOR_BASE_1);		\
+	f(EX_CONTEXT_1_0);			\
+	f(EX_CONTEXT_1_1);			\
+	f(SYSTEM_SAVE_1_0);			\
+	f(SYSTEM_SAVE_1_1);			\
+	f(SYSTEM_SAVE_1_2);			\
+	f(SYSTEM_SAVE_1_3);			\
+	f(INTCTRL_1_STATUS);			\
+	f(IPI_MASK_1);				\
+	f(IPI_EVENT_1);				\
+	f(SINGLE_STEP_CONTROL_1);		\
+	f(SINGLE_STEP_EN_1_1);			\
+
+#define DECLARE_SPR(f) unsigned long f
+	FOR_EACH_GUEST_SPR(DECLARE_SPR)
+#undef DECLARE_SPR
+};
+
+struct kvm_vm_stat {
+	/*
+	 * FIXME - does this make sense for us?  It's used in common KVM
+	 * code.
+	 */
+	u32 remote_tlb_flush;
+};
+
+struct kvm_arch_memory_slot {
+};
+
+struct kvm_arch {
+	pgd_t *vpgd;
+	unsigned long resv_gpa_start; /* For special purpose. */
+	struct completion smp_start;
+};
+
+struct kvm_vcpu;
+
+extern void kvm_vmresume(struct pt_regs *guest,
+			 unsigned long *host_sp_ptr);
+extern void kvm_vmexit(unsigned long host_sp);
+extern void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason);
+extern void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num);
+extern void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+				 unsigned long, unsigned long);
+extern void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num);
+
+extern void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
+
+#define gpud_offset(kvm, pgd, address) pud_offset(pgd, address)
+
+#define gpud_page_vaddr(kvm, pud) gfn_to_hva(kvm, pud_pfn(pud))
+
+#define gpmd_offset(kvm, pud, address) \
+	((pmd_t *)gpud_page_vaddr(kvm, *(pud)) + pmd_index(address))
+
+#define gpmd_page_vaddr(kvm, pmd) gfn_to_hva(kvm, pmd_pfn(pmd))
+
+#define gpte_offset_kernel(kvm, pmd, address) \
+	((pte_t *) gpmd_page_vaddr(kvm, *(pmd)) + pte_index(address))
+
+#endif /* __ASSEMBLY__*/
+
+#endif /* _ASM_TILE_KVM_HOST_H */
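
The FOR_EACH_GUEST_SPR() list above is an X-macro: expanding it with
DECLARE_SPR adds one "unsigned long" field per guest SPR to struct
kvm_vcpu_arch, and the same list can be re-expanded with other helpers
to save or restore those registers without repeating the names.  A
minimal sketch of that second use, assuming an SPR_<name> register
number exists for each entry (the helpers below are illustrative, not
the ones added by this patch):

	#define SAVE_SPR(f)    (vcpu->arch.f = __insn_mfspr(SPR_##f))
	#define RESTORE_SPR(f) __insn_mtspr(SPR_##f, vcpu->arch.f)

	static void save_guest_sprs(struct kvm_vcpu *vcpu)
	{
		FOR_EACH_GUEST_SPR(SAVE_SPR);
	}

	static void restore_guest_sprs(struct kvm_vcpu *vcpu)
	{
		FOR_EACH_GUEST_SPR(RESTORE_SPR);
	}
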
diff --git a/arch/tile/include/asm/kvm_para.h b/arch/tile/include/asm/kvm_para.h
new file mode 100644
index 0000000..c8c31d5
--- /dev/null
+++ b/arch/tile/include/asm/kvm_para.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#ifndef _ASM_TILE_KVM_PARA_H
+#define _ASM_TILE_KVM_PARA_H
+
+#include <uapi/asm/kvm_para.h>
+
+int hcall_virtio(unsigned long instrument, unsigned long mem);
+#endif /* _ASM_TILE_KVM_PARA_H */
diff --git a/arch/tile/include/asm/kvm_virtio.h b/arch/tile/include/asm/kvm_virtio.h
new file mode 100644
index 0000000..8faa959
--- /dev/null
+++ b/arch/tile/include/asm/kvm_virtio.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+#ifndef _ASM_TILE_KVM_VIRTIO_H
+#define _ASM_TILE_KVM_VIRTIO_H
+
+#include <uapi/asm/kvm_virtio.h>
+
+
+struct kvm_device {
+	struct virtio_device vdev;
+	struct kvm_device_desc *desc;
+	unsigned long desc_pa;
+};
+
+#endif /* _ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/include/asm/module.h b/arch/tile/include/asm/module.h
index 44ed07c..927c97f 100644
--- a/arch/tile/include/asm/module.h
+++ b/arch/tile/include/asm/module.h
@@ -28,6 +28,13 @@
 # define MODULE_PGSZ ""
 #endif
 
+/* Tag guest Linux, since it uses different SPRs, etc. */
+#if CONFIG_KERNEL_PL == 2
+#define MODULE_PL ""
+#else
+#define MODULE_PL " guest"
+#endif
+
 /* We don't really support no-SMP so tag if someone tries. */
 #ifdef CONFIG_SMP
 #define MODULE_NOSMP ""
@@ -35,6 +42,6 @@
 #define MODULE_NOSMP " nosmp"
 #endif
 
-#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_NOSMP
+#define MODULE_ARCH_VERMAGIC CHIP_ARCH_NAME MODULE_PGSZ MODULE_PL MODULE_NOSMP
 
 #endif /* _ASM_TILE_MODULE_H */
diff --git a/arch/tile/include/asm/page.h b/arch/tile/include/asm/page.h
index b4f96c0..65ee752 100644
--- a/arch/tile/include/asm/page.h
+++ b/arch/tile/include/asm/page.h
@@ -148,8 +148,17 @@ static inline __attribute_const__ int get_order(unsigned long size)
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 #endif
 
+#ifdef CONFIG_KVM_GUEST
+/* Paravirtualized guests get half the VA, and thus half the PA. */
+#define MAX_PA_WIDTH (CHIP_PA_WIDTH() - 1)
+#define MAX_VA_WIDTH (CHIP_VA_WIDTH() - 1)
+#else
+#define MAX_PA_WIDTH CHIP_PA_WIDTH()
+#define MAX_VA_WIDTH CHIP_VA_WIDTH()
+#endif
+
 /* Each memory controller has PAs distinct in their high bits. */
-#define NR_PA_HIGHBIT_SHIFT (CHIP_PA_WIDTH() - CHIP_LOG_NUM_MSHIMS())
+#define NR_PA_HIGHBIT_SHIFT (MAX_PA_WIDTH - CHIP_LOG_NUM_MSHIMS())
 #define NR_PA_HIGHBIT_VALUES (1 << CHIP_LOG_NUM_MSHIMS())
 #define __pa_to_highbits(pa) ((phys_addr_t)(pa) >> NR_PA_HIGHBIT_SHIFT)
 #define __pfn_to_highbits(pfn) ((pfn) >> (NR_PA_HIGHBIT_SHIFT - PAGE_SHIFT))
@@ -160,7 +169,7 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * We reserve the lower half of memory for user-space programs, and the
  * upper half for system code.  We re-map all of physical memory in the
  * upper half, which takes a quarter of our VA space.  Then we have
- * the vmalloc regions.  The supervisor code lives at 0xfffffff700000000,
+ * the vmalloc regions.  The supervisor code lives at the highest address,
  * with the hypervisor above that.
  *
  * Loadable kernel modules are placed immediately after the static
@@ -172,26 +181,25 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * Similarly, for now we don't play any struct page mapping games.
  */
 
-#if CHIP_PA_WIDTH() + 2 > CHIP_VA_WIDTH()
+#if MAX_PA_WIDTH + 2 > MAX_VA_WIDTH
 # error Too much PA to map with the VA available!
 #endif
-#define HALF_VA_SPACE           (_AC(1, UL) << (CHIP_VA_WIDTH() - 1))
 
-#define MEM_LOW_END		(HALF_VA_SPACE - 1)         /* low half */
-#define MEM_HIGH_START		(-HALF_VA_SPACE)            /* high half */
-#define PAGE_OFFSET		MEM_HIGH_START
-#define FIXADDR_BASE		_AC(0xfffffff400000000, UL) /* 4 GB */
-#define FIXADDR_TOP		_AC(0xfffffff500000000, UL) /* 4 GB */
+#ifdef CONFIG_KVM_GUEST
+#define PAGE_OFFSET		(_AC(1, UL) << (MAX_VA_WIDTH - 1))
+#define KERNEL_HIGH_VADDR	(_AC(1, UL) << MAX_VA_WIDTH)
+#else
+#define PAGE_OFFSET		(-(_AC(1, UL) << (MAX_VA_WIDTH - 1)))
+#define KERNEL_HIGH_VADDR	_AC(0xfffffff800000000, UL)  /* high 32GB */
+#endif
+
+#define FIXADDR_BASE		(KERNEL_HIGH_VADDR - 0x400000000) /* 4 GB */
+#define FIXADDR_TOP		(KERNEL_HIGH_VADDR - 0x300000000) /* 4 GB */
 #define _VMALLOC_START		FIXADDR_TOP
-#define HUGE_VMAP_BASE		_AC(0xfffffff600000000, UL) /* 4 GB */
-#define MEM_SV_START		_AC(0xfffffff700000000, UL) /* 256 MB */
-#define MEM_SV_INTRPT		MEM_SV_START
-#define MEM_MODULE_START	_AC(0xfffffff710000000, UL) /* 256 MB */
+#define HUGE_VMAP_BASE		(KERNEL_HIGH_VADDR - 0x200000000) /* 4 GB */
+#define MEM_SV_START		(KERNEL_HIGH_VADDR - 0x100000000) /* 256 MB */
+#define MEM_MODULE_START	(MEM_SV_START + (256*1024*1024)) /* 256 MB */
 #define MEM_MODULE_END		(MEM_MODULE_START + (256*1024*1024))
-#define MEM_HV_START		_AC(0xfffffff800000000, UL) /* 32 GB */
-
-/* Highest DTLB address we will use */
-#define KERNEL_HIGH_VADDR	MEM_SV_START
 
 #else /* !__tilegx__ */
 
@@ -213,8 +221,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
  * values, and after that, we show "typical" values, since the actual
  * addresses depend on kernel #defines.
  *
- * MEM_HV_INTRPT                   0xfe000000
- * MEM_SV_INTRPT (kernel code)     0xfd000000
+ * MEM_HV_START                    0xfe000000
+ * MEM_SV_START  (kernel code)     0xfd000000
  * MEM_USER_INTRPT (user vector)   0xfc000000
  * FIX_KMAP_xxx                    0xf8000000 (via NR_CPUS * KM_TYPE_NR)
  * PKMAP_BASE                      0xf7000000 (via LAST_PKMAP)
@@ -224,14 +232,8 @@ static inline __attribute_const__ int get_order(unsigned long size)
  */
 
 #define MEM_USER_INTRPT		_AC(0xfc000000, UL)
-#if CONFIG_KERNEL_PL == 1
-#define MEM_SV_INTRPT		_AC(0xfd000000, UL)
-#define MEM_HV_INTRPT		_AC(0xfe000000, UL)
-#else
-#define MEM_GUEST_INTRPT	_AC(0xfd000000, UL)
-#define MEM_SV_INTRPT		_AC(0xfe000000, UL)
-#define MEM_HV_INTRPT		_AC(0xff000000, UL)
-#endif
+#define MEM_SV_START		_AC(0xfd000000, UL)
+#define MEM_HV_START		_AC(0xfe000000, UL)
 
 #define INTRPT_SIZE		0x4000
 
diff --git a/arch/tile/include/asm/pgtable_32.h b/arch/tile/include/asm/pgtable_32.h
index e5bdc0e..63142ab 100644
--- a/arch/tile/include/asm/pgtable_32.h
+++ b/arch/tile/include/asm/pgtable_32.h
@@ -89,7 +89,7 @@ static inline int pud_huge_page(pud_t pud)	{ return 0; }
 /* We don't define any pgds for these addresses. */
 static inline int pgd_addr_invalid(unsigned long addr)
 {
-	return addr >= MEM_HV_INTRPT;
+	return addr >= MEM_HV_START;
 }
 
 /*
diff --git a/arch/tile/include/asm/pgtable_64.h b/arch/tile/include/asm/pgtable_64.h
index 7cb8d35..3421177 100644
--- a/arch/tile/include/asm/pgtable_64.h
+++ b/arch/tile/include/asm/pgtable_64.h
@@ -140,8 +140,7 @@ static inline unsigned long pgd_addr_normalize(unsigned long addr)
 /* We don't define any pgds for these addresses. */
 static inline int pgd_addr_invalid(unsigned long addr)
 {
-	return addr >= MEM_HV_START ||
-		(addr > MEM_LOW_END && addr < MEM_HIGH_START);
+	return addr >= KERNEL_HIGH_VADDR || addr != pgd_addr_normalize(addr);
 }
 
 /*
diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h
index 230b830..5aa5431 100644
--- a/arch/tile/include/asm/processor.h
+++ b/arch/tile/include/asm/processor.h
@@ -15,6 +15,8 @@
 #ifndef _ASM_TILE_PROCESSOR_H
 #define _ASM_TILE_PROCESSOR_H
 
+#include <arch/chip.h>
+
 #ifndef __ASSEMBLY__
 
 /*
@@ -25,7 +27,6 @@
 #include <asm/ptrace.h>
 #include <asm/percpu.h>
 
-#include <arch/chip.h>
 #include <arch/spr_def.h>
 
 struct task_struct;
@@ -167,7 +168,7 @@ struct thread_struct {
 #ifndef __ASSEMBLY__
 
 #ifdef __tilegx__
-#define TASK_SIZE_MAX		(MEM_LOW_END + 1)
+#define TASK_SIZE_MAX		(_AC(1, UL) << (MAX_VA_WIDTH - 1))
 #else
 #define TASK_SIZE_MAX		PAGE_OFFSET
 #endif
@@ -347,7 +348,6 @@ extern int kdata_huge;
 
 /*
  * Provide symbolic constants for PLs.
- * Note that assembly code assumes that USER_PL is zero.
  */
 #define USER_PL 0
 #if CONFIG_KERNEL_PL == 2
diff --git a/arch/tile/include/asm/ptrace.h b/arch/tile/include/asm/ptrace.h
index 0d25c21..b9620c0 100644
--- a/arch/tile/include/asm/ptrace.h
+++ b/arch/tile/include/asm/ptrace.h
@@ -39,7 +39,7 @@ typedef unsigned long pt_reg_t;
 #define user_stack_pointer(regs) ((regs)->sp)
 
 /* Does the process account for user or for system time? */
-#define user_mode(regs) (EX1_PL((regs)->ex1) == USER_PL)
+#define user_mode(regs) (EX1_PL((regs)->ex1) < KERNEL_PL)
 
 /* Fill in a struct pt_regs with the current kernel registers. */
 struct pt_regs *get_pt_regs(struct pt_regs *);
diff --git a/arch/tile/include/asm/switch_to.h b/arch/tile/include/asm/switch_to.h
index b8f888c..8e9150f 100644
--- a/arch/tile/include/asm/switch_to.h
+++ b/arch/tile/include/asm/switch_to.h
@@ -50,16 +50,31 @@ extern struct task_struct *__switch_to(struct task_struct *prev,
 extern unsigned long get_switch_to_pc(void);
 
 /*
+ * Normally we notify the simulator whenever we change from one pid
+ * to another, so it can track symbol files appropriately on the fly.
+ * For now, we don't do this for the guest Linux, since we don't
+ * have a way to tell the simulator that we are entering a separate
+ * pid space when we are in the guest.
+ */
+#ifdef CONFIG_KVM_GUEST
+#define notify_sim_task_change(prev) do { } while (0)
+#else
+#define notify_sim_task_change(prev) do {				\
+	if (unlikely((prev)->state == TASK_DEAD))			\
+		__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT |	\
+			     ((prev)->pid << _SIM_CONTROL_OPERATOR_BITS)); \
+	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH |		\
+		     (current->pid << _SIM_CONTROL_OPERATOR_BITS));	\
+} while (0)
+#endif
+
+/*
  * Kernel threads can check to see if they need to migrate their
  * stack whenever they return from a context switch; for user
  * threads, we defer until they are returning to user-space.
  */
 #define finish_arch_switch(prev) do {                                     \
-	if (unlikely((prev)->state == TASK_DEAD))                         \
-		__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_EXIT |       \
-			((prev)->pid << _SIM_CONTROL_OPERATOR_BITS));     \
-	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_SWITCH |             \
-		(current->pid << _SIM_CONTROL_OPERATOR_BITS));            \
+	notify_sim_task_change(prev);                                     \
 	if (current->mm == NULL && !kstack_hash &&                        \
 	    current_thread_info()->homecache_cpu != smp_processor_id())   \
 		homecache_migrate_kthread();                              \
diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h
index b8aa6df..1c26cdf 100644
--- a/arch/tile/include/asm/thread_info.h
+++ b/arch/tile/include/asm/thread_info.h
@@ -18,7 +18,9 @@
 
 #include <asm/processor.h>
 #include <asm/page.h>
+
 #ifndef __ASSEMBLY__
+struct kvm_vcpu;
 
 /*
  * Low level task data that assembly code needs immediate access to.
@@ -44,6 +46,9 @@ struct thread_info {
 	unsigned long		unalign_jit_tmp[4]; /* temp r0..r3 storage */
 	void __user		*unalign_jit_base; /* unalign fixup JIT base */
 #endif
+#ifdef CONFIG_KVM
+	struct kvm_vcpu		*vcpu;		/* vcpu during vmresume */
+#endif
 };
 
 /*
@@ -117,8 +122,8 @@ extern void _cpu_idle(void);
 
 /*
  * Thread information flags that various assembly files may need to access.
- * Keep flags accessed frequently in low bits, particular since it makes
- * it easier to build constants in assembly.
+ * Keep flags accessed frequently in low bits, since it makes it
+ * easier to build constants in assembly.
  */
 #define TIF_SIGPENDING		0	/* signal pending */
 #define TIF_NEED_RESCHED	1	/* rescheduling necessary */
@@ -131,6 +136,7 @@ extern void _cpu_idle(void);
 #define TIF_MEMDIE		7	/* OOM killer at work */
 #define TIF_NOTIFY_RESUME	8	/* callback before returning to user */
 #define TIF_SYSCALL_TRACEPOINT	9	/* syscall tracepoint instrumentation */
+#define TIF_VIRT_EXIT		10	/* force exit of task in vmresume */
 
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
@@ -142,11 +148,12 @@ extern void _cpu_idle(void);
 #define _TIF_MEMDIE		(1<<TIF_MEMDIE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
 #define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
+#define _TIF_VIRT_EXIT		(1<<TIF_VIRT_EXIT)
 
 /* Work to do on any return to user space. */
-#define _TIF_ALLWORK_MASK \
-  (_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|\
-   _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME)
+#define _TIF_ALLWORK_MASK					\
+	(_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_SINGLESTEP|	\
+	 _TIF_ASYNC_TLB|_TIF_NOTIFY_RESUME|_TIF_VIRT_EXIT)
 
 /* Work to do at syscall entry. */
 #define _TIF_SYSCALL_ENTRY_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_TRACEPOINT)
diff --git a/arch/tile/include/asm/timex.h b/arch/tile/include/asm/timex.h
index edbd7e4..0417617 100644
--- a/arch/tile/include/asm/timex.h
+++ b/arch/tile/include/asm/timex.h
@@ -27,6 +27,14 @@
 
 typedef unsigned long long cycles_t;
 
+#ifdef CONFIG_KVM_GUEST
+#define INT_LINUX_TIMER INT_AUX_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_AUX_TILE_TIMER_CONTROL
+#else
+#define INT_LINUX_TIMER INT_TILE_TIMER
+#define SPR_LINUX_TIMER_CONTROL SPR_TILE_TIMER_CONTROL
+#endif
+
 #if CHIP_HAS_SPLIT_CYCLE()
 cycles_t get_cycles(void);
 #define get_cycles_low() __insn_mfspr(SPR_CYCLE_LOW)
diff --git a/arch/tile/include/hv/hypervisor.h b/arch/tile/include/hv/hypervisor.h
index f71b08e..71abe38 100644
--- a/arch/tile/include/hv/hypervisor.h
+++ b/arch/tile/include/hv/hypervisor.h
@@ -321,6 +321,18 @@
 /** hv_set_speed */
 #define HV_DISPATCH_SET_SPEED                     58
 
+/** hv_install_virt_context */
+#define HV_DISPATCH_INSTALL_VIRT_CONTEXT          59
+
+/** hv_inquire_virt_context */
+#define HV_DISPATCH_INQUIRE_VIRT_CONTEXT          60
+
+/** hv_install_guest_context */
+#define HV_DISPATCH_INSTALL_GUEST_CONTEXT         61
+
+/** hv_inquire_guest_context */
+#define HV_DISPATCH_INQUIRE_GUEST_CONTEXT         62
+
 /** hv_console_set_ipi */
 #define HV_DISPATCH_CONSOLE_SET_IPI               63
 
@@ -783,12 +795,15 @@ HV_SetSpeed hv_set_speed(unsigned long speed, __hv64 start_cycle,
  *  new page table does not need to contain any mapping for the
  *  hv_install_context address itself.
  *
- *  At most one HV_CTX_PG_SM_* flag may be specified in "flags";
+ *  At most one HV_CTX_PG_SM_* flag may be specified in the flags argument;
  *  if multiple flags are specified, HV_EINVAL is returned.
  *  Specifying none of the flags results in using the default page size.
  *  All cores participating in a given client must request the same
  *  page size, or the results are undefined.
  *
+ *  To disable an installed page table, install HV_CTX_NONE.  The access
+ *  and asid fields are ignored.
+ *
  * @param page_table Root of the page table.
  * @param access PTE providing info on how to read the page table.  This
  *   value must be consistent between multiple tiles sharing a page table,
@@ -804,16 +819,101 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
 
 #endif /* !__ASSEMBLER__ */
 
+#define HV_CTX_NONE         ((HV_PhysAddr)-1)  /**< Disable page table. */
+
 #define HV_CTX_DIRECTIO     0x1   /**< Direct I/O requests are accepted from
                                        PL0. */
 
+#define HV_CTX_GUEST_CACHE  0x4   /**< Let guest control caching flags (only
+                                       usable with hv_install_virt_context.) */
+
 #define HV_CTX_PG_SM_4K     0x10  /**< Use 4K small pages, if available. */
 #define HV_CTX_PG_SM_16K    0x20  /**< Use 16K small pages, if available. */
 #define HV_CTX_PG_SM_64K    0x40  /**< Use 64K small pages, if available. */
 #define HV_CTX_PG_SM_MASK   0xf0  /**< Mask of all possible small pages. */
 
+
 #ifndef __ASSEMBLER__
 
+/** Install a virtualization context.
+ *
+ * When a virtualization context is installed, all faults from PL0 or
+ * PL1 are handled via a "guest context" and then post-processed by
+ * the "virtualization context"; faults at PL2 are still handled by
+ * the normal context.  For guest faults, the "guest PAs" produced by
+ * the guest page table are passed through the virtualization page
+ * table as pseudo-VAs, generating the true CPA as a result.  See the
+ * individual HV_PTE_xxx bits for the effect the bits have when
+ * present in the virtualization page table.  The ASID is currently
+ * ignored in this syscall, but it might be used later, so the API
+ * includes it.  The HV_CTX_GUEST_CACHE flag indicates that all
+ * cache-related flags should be taken from the primary page table,
+ * not the virtualization page table.
+ *
+ * Once the virtualization context is installed, a guest context
+ * should also be installed; otherwise a VA-equals-PA context will be
+ * used for accesses at PL 0 or 1, i.e. VAs will be passed directly to
+ * the virtualization context to generate CPAs.
+ *
+ * When entering client PL after being at guest or user PL, the
+ * client is expected to call hv_flush_all() to clear any TLB mappings
+ * that might otherwise conflict.  Similarly, hv_flush_all() should
+ * be called before returning to guest or user PL with a virtualization
+ * context installed, so that any TLB mappings are cleared.  Future
+ * work may include adding a "vpid" or similar namespace so that
+ * the TLBs may be managed independently.
+ *
+ * Subsequent guest page table installations will have their root PA
+ * and PTE cached after translating through the virtualization
+ * context, so if entries in the virtualization page table are
+ * modified or removed, the guest context should be re-installed.
+ * This, in conjunction with flushing the TLB on return to the guest,
+ * will ensure that the new virtualization entries are honored.
+ *
+ * @param page_table Root of the page table.
+ * @param access PTE providing info on how to read the page table.  This
+ *   value must be consistent between multiple tiles sharing a page table,
+ *   and must also be consistent with any virtual mappings the client
+ *   may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for (currently ignored).
+ * @param flags Context flags, denoting attributes or privileges of the
+ *   current virtualization context (see below).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_virt_context(HV_PhysAddr page_table, HV_PTE access,
+                            HV_ASID asid, __hv32 flags);
+
+
+
+/** Install a guest context.
+ *
+ * The guest context is only consulted when a virtualization context
+ * is also installed, and for faults that occur below the client's PL.
+ * If no guest context is installed when such a fault occurs, a VA=PA
+ * context is used instead.
+ *
+ * The access PTE will only be honored if the virtualization table was
+ * installed with HV_CTX_GUEST_CACHE.
+ *
+ * A virtualization context must already be installed prior to
+ * installing the guest context.
+ *
+ * @param page_table Root of the page table; the value is the guest's
+ *   physical address (GPA), not a CPA.
+ * @param access PTE providing info on how to read the page table.  This
+ *   value must be consistent between multiple tiles sharing a page table,
+ *   and must also be consistent with any virtual mappings the client
+ *   may be using to access the page table.
+ * @param asid HV_ASID the page table is to be used for.
+ * @param flags Context flags, denoting attributes or privileges of the
+ *   current context (HV_CTX_xxx).
+ * @return Zero on success, or a hypervisor error code on failure.
+ */
+
+int hv_install_guest_context(HV_PhysAddr page_table, HV_PTE access,
+                             HV_ASID asid, __hv32 flags);
+
 
 /** Set the number of pages ganged together by HV_PTE_SUPER at a
  * particular level of the page table.
@@ -823,7 +923,7 @@ int hv_install_context(HV_PhysAddr page_table, HV_PTE access, HV_ASID asid,
  * "super" page size must be less than the span of the next level in
  * the page table.  The largest size that can be requested is 64GB.
  *
- * The shift value is initially "0" for all page table levels,
+ * The shift value is initially 0 for all page table levels,
  * indicating that the HV_PTE_SUPER bit is effectively ignored.
  *
  * If you change the count from one non-zero value to another, the
@@ -854,11 +954,26 @@ typedef struct
 } HV_Context;
 
 /** Retrieve information about the currently installed context.
- * @return The data passed to the last successful hv_install_context call.
+ * @return The data passed to the last successful call to
+ * hv_install_context().
  */
 HV_Context hv_inquire_context(void);
 
 
+/** Retrieve information about the currently installed virtualization context.
+ * @return The data passed to the last successful call to
+ * hv_install_virt_context().
+ */
+HV_Context hv_inquire_virt_context(void);
+
+
+/** Retrieve information about the currently installed guest context.
+ * @return The data passed to the last successful call to
+ * hv_install_guest_context().
+ */
+HV_Context hv_inquire_guest_context(void);
+
+
 /** Flushes all translations associated with the named address space
  *  identifier from the TLB and any other hypervisor data structures.
  *  Translations installed with the "global" bit are not flushed.
@@ -917,7 +1032,7 @@ int hv_flush_pages(HV_VirtAddr start, HV_PageSize page_size,
 /** Flushes all non-global translations (if preserve_global is true),
  *  or absolutely all translations (if preserve_global is false).
  *
- * @param preserve_global Non-zero if we want to preserve "global" mappings.
+ * @param preserve_global Non-zero if we want to preserve global mappings.
  * @return Zero on success, or a hypervisor error code on failure.
 */
 int hv_flush_all(int preserve_global);
@@ -991,7 +1106,11 @@ typedef enum {
   HV_INQ_TILES_HFH_CACHE       = 2,
 
   /** The set of tiles that can be legally used as a LOTAR for a PTE. */
-  HV_INQ_TILES_LOTAR           = 3
+  HV_INQ_TILES_LOTAR           = 3,
+
+  /** The set of "shared" driver tiles that the hypervisor may
+   *  periodically interrupt. */
+  HV_INQ_TILES_SHARED          = 4
 } HV_InqTileSet;
 
 /** Returns specific information about various sets of tiles within the
@@ -1271,14 +1390,21 @@ void hv_downcall_dispatch(void);
  */
 /** Message receive downcall interrupt vector */
 #define INT_MESSAGE_RCV_DWNCL    INT_BOOT_ACCESS
+/** Device interrupt downcall interrupt vector */
+#define INT_DEV_INTR_DWNCL       INT_WORLD_ACCESS
+#ifdef __tilegx__
+/** Virtualization page table miss downcall interrupt vector */
+#define INT_VPGTABLE_MISS_DWNCL  INT_I_ASID
+/** Virtualization guest illegal page table */
+#define INT_VGUEST_FATAL_DWNCL   INT_D_ASID
+#else
 /** DMA TLB miss downcall interrupt vector */
 #define INT_DMATLB_MISS_DWNCL    INT_DMA_ASID
-/** Static nework processor instruction TLB miss interrupt vector */
-#define INT_SNITLB_MISS_DWNCL    INT_SNI_ASID
 /** DMA TLB access violation downcall interrupt vector */
 #define INT_DMATLB_ACCESS_DWNCL  INT_DMA_CPL
-/** Device interrupt downcall interrupt vector */
-#define INT_DEV_INTR_DWNCL       INT_WORLD_ACCESS
+/** Static nework processor instruction TLB miss interrupt vector */
+#define INT_SNITLB_MISS_DWNCL    INT_SNI_ASID
+#endif
 
 #ifndef __ASSEMBLER__
 
@@ -2041,8 +2167,16 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
 #define HV_PTE_PTFN_BITS             29  /**< Number of bits in a PTFN */
 
 /*
- * Legal values for the PTE's mode field
+ * Legal values for the PTE's mode field.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
+ * Note that if HV_CTX_GUEST_CACHE is not set, guests will only be able
+ * to access MMIO resources via pseudo PAs that map to MMIO in the
+ * virtualization page table.
  */
+
 /** Data is not resident in any caches; loads and stores access memory
  *  directly.
  */
@@ -2161,6 +2295,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
  *
  * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the primary page table if a virtualization
+ * page table is installed.
  */
 #define HV_PTE_GLOBAL                (__HV_PTE_ONE << HV_PTE_INDEX_GLOBAL)
 
@@ -2174,6 +2310,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  * doing so may race with the hypervisor's update of ACCESSED and DIRTY bits.
  *
  * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in the virtualization page table.
  */
 #define HV_PTE_USER                  (__HV_PTE_ONE << HV_PTE_INDEX_USER)
 
@@ -2185,7 +2322,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  * has been cleared, subsequent references are not guaranteed to set
  * it again until the translation has been flushed from the TLB.
  *
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
  */
 #define HV_PTE_ACCESSED              (__HV_PTE_ONE << HV_PTE_INDEX_ACCESSED)
 
@@ -2197,7 +2334,7 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  * has been cleared, subsequent references are not guaranteed to set
  * it again until the translation has been flushed from the TLB.
  *
- * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * This bit is ignored in level-0 or level-1 PTEs unless the Page bit is set.
  */
 #define HV_PTE_DIRTY                 (__HV_PTE_ONE << HV_PTE_INDEX_DIRTY)
 
@@ -2239,6 +2376,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  *
  * In level-1 PTEs, if the Page bit is clear, this bit determines how the
  * level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
  */
 #define HV_PTE_NC                    (__HV_PTE_ONE << HV_PTE_INDEX_NC)
 
@@ -2252,6 +2393,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  *
  * In level-1 PTEs, if the Page bit is clear, this bit
  * determines how the level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
  */
 #define HV_PTE_NO_ALLOC_L1           (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L1)
 
@@ -2265,6 +2410,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  *
  * In level-1 PTEs, if the Page bit is clear, this bit determines how the
  * level-2 page table is accessed.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
  */
 #define HV_PTE_NO_ALLOC_L2           (__HV_PTE_ONE << HV_PTE_INDEX_NO_ALLOC_L2)
 
@@ -2284,6 +2433,10 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  * the page map directly to memory.
  *
  * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ *
+ * If a virtualization page table is installed, this field is only honored
+ * in the primary page table if HV_CTX_GUEST_CACHE was set when the page
+ * table was installed, otherwise only in the virtualization page table.
  */
 #define HV_PTE_CACHED_PRIORITY       (__HV_PTE_ONE << \
                                       HV_PTE_INDEX_CACHED_PRIORITY)
@@ -2297,6 +2450,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  * It is illegal for this bit to be clear if the Writable bit is set.
  *
  * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Readable status
+ * is the logical "and" of this bit in both page tables.
  */
 #define HV_PTE_READABLE              (__HV_PTE_ONE << HV_PTE_INDEX_READABLE)
 
@@ -2307,6 +2462,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  * PTE.
  *
  * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Writable status
+ * is the logical "and" of this bit in both page tables.
  */
 #define HV_PTE_WRITABLE              (__HV_PTE_ONE << HV_PTE_INDEX_WRITABLE)
 
@@ -2319,6 +2476,8 @@ int hv_flush_remote(HV_PhysAddr cache_pa, unsigned long cache_control,
  * than one.
  *
  * This bit is ignored in level-1 PTEs unless the Page bit is set.
+ * If a virtualization page table is present, the final Executable status
+ * is the logical "and" of this bit in both page tables.
  */
 #define HV_PTE_EXECUTABLE            (__HV_PTE_ONE << HV_PTE_INDEX_EXECUTABLE)
 
diff --git a/arch/tile/include/uapi/arch/sim.h b/arch/tile/include/uapi/arch/sim.h
index e54b7b0..36fb24c 100644
--- a/arch/tile/include/uapi/arch/sim.h
+++ b/arch/tile/include/uapi/arch/sim.h
@@ -611,6 +611,25 @@ sim_profiler_chip_clear(unsigned int mask)
   __insn_mtspr(SPR_SIM_CONTROL, SIM_PROFILER_CHIP_CLEAR_SPR_ARG(mask));
 }
 
+/**
+ * Set vCPU number for a given task.
+ * @param vcpu Virtual cpu to set.
+ */
+static __inline void
+sim_set_vcpu(int vcpu)
+{
+  __insn_mtspr(SPR_SIM_CONTROL,
+               SIM_CONTROL_VCPU | (vcpu << _SIM_CONTROL_OPERATOR_BITS));
+}
+
+/** Clear vCPU status for a given task. */
+static __inline void
+sim_clear_vcpu(void)
+{
+  __insn_mtspr(SPR_SIM_CONTROL,
+               SIM_CONTROL_VCPU | (-1 << _SIM_CONTROL_OPERATOR_BITS));
+}
+
 
 /*
  * Event support.
diff --git a/arch/tile/include/uapi/arch/sim_def.h b/arch/tile/include/uapi/arch/sim_def.h
index 4b44a2b..b9aad66 100644
--- a/arch/tile/include/uapi/arch/sim_def.h
+++ b/arch/tile/include/uapi/arch/sim_def.h
@@ -221,6 +221,14 @@
  */
 #define SIM_CONTROL_ENABLE_MPIPE_LINK_MAGIC_BYTE 36
 
+/**
+ * If written to SPR_SIM_CONTROL, combined with a signed virtual cpu
+ * number shifted by 8, will tag any identification of the cpu that
+ * task is running on with the given virtual cpu number.  If the
+ * virtual cpu number is -1, the tag is removed.
+ */
+#define SIM_CONTROL_VCPU 37
+
 
 /*
  * Syscall numbers for use with "sim_syscall()".
diff --git a/arch/tile/include/uapi/arch/spr_def_32.h b/arch/tile/include/uapi/arch/spr_def_32.h
index c689446..4644c8d 100644
--- a/arch/tile/include/uapi/arch/spr_def_32.h
+++ b/arch/tile/include/uapi/arch/spr_def_32.h
@@ -121,6 +121,9 @@
 #define SPR_MPL_DMA_NOTIFY_SET_0 0x3800
 #define SPR_MPL_DMA_NOTIFY_SET_1 0x3801
 #define SPR_MPL_DMA_NOTIFY_SET_2 0x3802
+#define SPR_MPL_GPV_SET_0 0x0600
+#define SPR_MPL_GPV_SET_1 0x0601
+#define SPR_MPL_GPV_SET_2 0x0602
 #define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
 #define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
 #define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -142,6 +145,9 @@
 #define SPR_MPL_IDN_TIMER_SET_0 0x3400
 #define SPR_MPL_IDN_TIMER_SET_1 0x3401
 #define SPR_MPL_IDN_TIMER_SET_2 0x3402
+#define SPR_MPL_ILL_SET_0 0x0400
+#define SPR_MPL_ILL_SET_1 0x0401
+#define SPR_MPL_ILL_SET_2 0x0402
 #define SPR_MPL_INTCTRL_0_SET_0 0x4a00
 #define SPR_MPL_INTCTRL_0_SET_1 0x4a01
 #define SPR_MPL_INTCTRL_0_SET_2 0x4a02
@@ -166,6 +172,12 @@
 #define SPR_MPL_SN_NOTIFY_SET_0 0x2a00
 #define SPR_MPL_SN_NOTIFY_SET_1 0x2a01
 #define SPR_MPL_SN_NOTIFY_SET_2 0x2a02
+#define SPR_MPL_SWINT_0_SET_0 0x1c00
+#define SPR_MPL_SWINT_0_SET_1 0x1c01
+#define SPR_MPL_SWINT_0_SET_2 0x1c02
+#define SPR_MPL_SWINT_1_SET_0 0x1a00
+#define SPR_MPL_SWINT_1_SET_1 0x1a01
+#define SPR_MPL_SWINT_1_SET_2 0x1a02
 #define SPR_MPL_UDN_ACCESS_SET_0 0x0c00
 #define SPR_MPL_UDN_ACCESS_SET_1 0x0c01
 #define SPR_MPL_UDN_ACCESS_SET_2 0x0c02
@@ -187,6 +199,9 @@
 #define SPR_MPL_UDN_TIMER_SET_0 0x3600
 #define SPR_MPL_UDN_TIMER_SET_1 0x3601
 #define SPR_MPL_UDN_TIMER_SET_2 0x3602
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1e00
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1e01
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1e02
 #define SPR_MPL_WORLD_ACCESS_SET_0 0x4e00
 #define SPR_MPL_WORLD_ACCESS_SET_1 0x4e01
 #define SPR_MPL_WORLD_ACCESS_SET_2 0x4e02
diff --git a/arch/tile/include/uapi/arch/spr_def_64.h b/arch/tile/include/uapi/arch/spr_def_64.h
index 67a6c17..727cda7 100644
--- a/arch/tile/include/uapi/arch/spr_def_64.h
+++ b/arch/tile/include/uapi/arch/spr_def_64.h
@@ -21,6 +21,10 @@
 #define SPR_AUX_PERF_COUNT_1 0x2106
 #define SPR_AUX_PERF_COUNT_CTL 0x2107
 #define SPR_AUX_PERF_COUNT_STS 0x2108
+#define SPR_AUX_TILE_TIMER_CONTROL 0x1705
+#define SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK  0xffffffff
+#define SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT 62
+#define SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT 63
 #define SPR_CMPEXCH_VALUE 0x2780
 #define SPR_CYCLE 0x2781
 #define SPR_DONE 0x2705
@@ -101,6 +105,9 @@
 #define SPR_MPL_AUX_TILE_TIMER_SET_0 0x1700
 #define SPR_MPL_AUX_TILE_TIMER_SET_1 0x1701
 #define SPR_MPL_AUX_TILE_TIMER_SET_2 0x1702
+#define SPR_MPL_GPV_SET_0 0x0900
+#define SPR_MPL_GPV_SET_1 0x0901
+#define SPR_MPL_GPV_SET_2 0x0902
 #define SPR_MPL_IDN_ACCESS_SET_0 0x0a00
 #define SPR_MPL_IDN_ACCESS_SET_1 0x0a01
 #define SPR_MPL_IDN_ACCESS_SET_2 0x0a02
@@ -116,6 +123,12 @@
 #define SPR_MPL_IDN_TIMER_SET_0 0x1800
 #define SPR_MPL_IDN_TIMER_SET_1 0x1801
 #define SPR_MPL_IDN_TIMER_SET_2 0x1802
+#define SPR_MPL_ILL_SET_0 0x0800
+#define SPR_MPL_ILL_SET_1 0x0801
+#define SPR_MPL_ILL_SET_2 0x0802
+#define SPR_MPL_ILL_TRANS_SET_0 0x1000
+#define SPR_MPL_ILL_TRANS_SET_1 0x1001
+#define SPR_MPL_ILL_TRANS_SET_2 0x1002
 #define SPR_MPL_INTCTRL_0_SET_0 0x2500
 #define SPR_MPL_INTCTRL_0_SET_1 0x2501
 #define SPR_MPL_INTCTRL_0_SET_2 0x2502
@@ -140,6 +153,15 @@
 #define SPR_MPL_PERF_COUNT_SET_0 0x2000
 #define SPR_MPL_PERF_COUNT_SET_1 0x2001
 #define SPR_MPL_PERF_COUNT_SET_2 0x2002
+#define SPR_MPL_SINGLE_STEP_1_SET_0 0x0300
+#define SPR_MPL_SINGLE_STEP_1_SET_1 0x0301
+#define SPR_MPL_SINGLE_STEP_1_SET_2 0x0302
+#define SPR_MPL_SWINT_0_SET_0 0x0f00
+#define SPR_MPL_SWINT_0_SET_1 0x0f01
+#define SPR_MPL_SWINT_0_SET_2 0x0f02
+#define SPR_MPL_SWINT_1_SET_0 0x0e00
+#define SPR_MPL_SWINT_1_SET_1 0x0e01
+#define SPR_MPL_SWINT_1_SET_2 0x0e02
 #define SPR_MPL_UDN_ACCESS_SET_0 0x0b00
 #define SPR_MPL_UDN_ACCESS_SET_1 0x0b01
 #define SPR_MPL_UDN_ACCESS_SET_2 0x0b02
@@ -155,6 +177,9 @@
 #define SPR_MPL_UDN_TIMER_SET_0 0x1900
 #define SPR_MPL_UDN_TIMER_SET_1 0x1901
 #define SPR_MPL_UDN_TIMER_SET_2 0x1902
+#define SPR_MPL_UNALIGN_DATA_SET_0 0x1100
+#define SPR_MPL_UNALIGN_DATA_SET_1 0x1101
+#define SPR_MPL_UNALIGN_DATA_SET_2 0x1102
 #define SPR_MPL_WORLD_ACCESS_SET_0 0x2700
 #define SPR_MPL_WORLD_ACCESS_SET_1 0x2701
 #define SPR_MPL_WORLD_ACCESS_SET_2 0x2702
diff --git a/arch/tile/include/uapi/asm/Kbuild b/arch/tile/include/uapi/asm/Kbuild
index c20db8e..f07cc24 100644
--- a/arch/tile/include/uapi/asm/Kbuild
+++ b/arch/tile/include/uapi/asm/Kbuild
@@ -6,7 +6,9 @@ header-y += bitsperlong.h
 header-y += byteorder.h
 header-y += cachectl.h
 header-y += hardwall.h
+header-y += kvm.h
 header-y += kvm_para.h
+header-y += kvm_virtio.h
 header-y += mman.h
 header-y += ptrace.h
 header-y += setup.h
diff --git a/arch/tile/include/uapi/asm/kvm.h b/arch/tile/include/uapi/asm/kvm.h
new file mode 100644
index 0000000..25ca8ce
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm.h
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_H
+#define _UAPI_ASM_TILE_KVM_H
+
+#ifndef __ASSEMBLER__
+#include <linux/ptrace.h>
+#endif
+
+#include <arch/abi.h>
+
+/*
+ * Hypervisor syscall numbers.  These come from the hypervisor's syscall.h,
+ * with one small modification: HV_SYS_fence_incoherent is removed.
+ */
+/* Syscall allowed from guest PL bit mask. */
+#define HV_SYS_GUEST_SHIFT                12
+#define HV_SYS_GUEST_MASK                 (1 << HV_SYS_GUEST_SHIFT)
+/* downcall_dispatch; this syscall number must be zero */
+#define HV_SYS_downcall_dispatch          0
+/* install_context */
+#define HV_SYS_install_context            1
+/* sysconf */
+#define HV_SYS_sysconf                    2
+/* get_rtc */
+#define HV_SYS_get_rtc                    3
+/* set_rtc */
+#define HV_SYS_set_rtc                    4
+/* flush_asid */
+#define HV_SYS_flush_asid                 5
+/* flush_page */
+#define HV_SYS_flush_page                 6
+/* flush_pages */
+#define HV_SYS_flush_pages                7
+/* restart */
+#define HV_SYS_restart                    8
+/* halt */
+#define HV_SYS_halt                       9
+/* power_off */
+#define HV_SYS_power_off                 10
+/* inquire_physical */
+#define HV_SYS_inquire_physical          11
+/* inquire_memory_controller */
+#define HV_SYS_inquire_memory_controller 12
+/* inquire_virtual */
+#define HV_SYS_inquire_virtual           13
+/* inquire_asid */
+#define HV_SYS_inquire_asid              14
+/* console_read_if_ready */
+#define HV_SYS_console_read_if_ready     15
+/* console_write */
+#define HV_SYS_console_write             16
+/* init */
+#define HV_SYS_init                      17
+/* inquire_topology */
+#define HV_SYS_inquire_topology          18
+/* fs_findfile */
+#define HV_SYS_fs_findfile               19
+/* fs_fstat */
+#define HV_SYS_fs_fstat                  20
+/* fs_pread */
+#define HV_SYS_fs_pread                  21
+/* physaddr_read64 */
+#define HV_SYS_physaddr_read64           22
+/* physaddr_write64 */
+#define HV_SYS_physaddr_write64          23
+/* get_command_line */
+#define HV_SYS_get_command_line          24
+/* set_caching */
+#define HV_SYS_set_caching               25
+/* bzero_page */
+#define HV_SYS_bzero_page                26
+/* register_message_state */
+#define HV_SYS_register_message_state    27
+/* send_message */
+#define HV_SYS_send_message              28
+/* receive_message */
+#define HV_SYS_receive_message           29
+/* inquire_context */
+#define HV_SYS_inquire_context           30
+/* start_all_tiles */
+#define HV_SYS_start_all_tiles           31
+/* dev_open */
+#define HV_SYS_dev_open                  32
+/* dev_close */
+#define HV_SYS_dev_close                 33
+/* dev_pread */
+#define HV_SYS_dev_pread                 34
+/* dev_pwrite */
+#define HV_SYS_dev_pwrite                35
+/* dev_poll */
+#define HV_SYS_dev_poll                  36
+/* dev_poll_cancel */
+#define HV_SYS_dev_poll_cancel           37
+/* dev_preada */
+#define HV_SYS_dev_preada                38
+/* dev_pwritea */
+#define HV_SYS_dev_pwritea               39
+/* flush_remote */
+#define HV_SYS_flush_remote              40
+/* console_putc */
+#define HV_SYS_console_putc              41
+/* inquire_tiles */
+#define HV_SYS_inquire_tiles             42
+/* confstr */
+#define HV_SYS_confstr                   43
+/* reexec */
+#define HV_SYS_reexec                    44
+/* set_command_line */
+#define HV_SYS_set_command_line          45
+
+/* store_mapping */
+#define HV_SYS_store_mapping             52
+/* inquire_realpa */
+#define HV_SYS_inquire_realpa            53
+/* flush_all */
+#define HV_SYS_flush_all                 54
+/* get_ipi_pte */
+#define HV_SYS_get_ipi_pte               55
+/* set_pte_super_shift */
+#define HV_SYS_set_pte_super_shift       56
+/* set_speed */
+#define HV_SYS_set_speed                 57
+/* install_virt_context */
+#define HV_SYS_install_virt_context      58
+/* inquire_virt_context */
+#define HV_SYS_inquire_virt_context      59
+/* install_guest_context */
+#define HV_SYS_install_guest_context     60
+/* inquire_guest_context */
+#define HV_SYS_inquire_guest_context     61
+
+/*
+ * Hypercall numbers (from guest OS to host OS) other than the hv_*()
+ * calls start here; the first 128 entries are left for the usual
+ * hv_*() calls as defined in hypervisor.h.
+ */
+#define KVM_OTHER_HCALL                  128
+
+/* Hypercall index for virtio. */
+#define KVM_HCALL_virtio                 128
+
+/* One greater than the maximum hypercall number. */
+#define KVM_NUM_HCALLS                   256
+
+#ifndef __ASSEMBLER__
+
+struct kvm_regs {
+	struct pt_regs regs;
+};
+
+struct kvm_sregs {
+};
+
+struct kvm_fpu {
+};
+
+struct kvm_debug_exit_arch {
+};
+
+struct kvm_guest_debug_arch {
+};
+
+/* definition of registers in kvm_run */
+struct kvm_sync_regs {
+};
+
+#ifndef __KERNEL__
+/* For hv_*() */
+#define KVM_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define USER_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+#define NO_EMULATE(name) [HV_SYS_##name] = qemu_emulate_illegal,
+#define BOTH_EMULATE(name) [HV_SYS_##name] = qemu_emulate_hv_##name,
+/* For others */
+#define USER_HCALL(name) [KVM_HCALL_##name] = qemu_handle_##name,
+#endif
+
+#define HCALL_DEFS \
+	/* For hv_*() */ \
+	KVM_EMULATE(init) \
+	NO_EMULATE(install_context) \
+	KVM_EMULATE(sysconf) \
+	KVM_EMULATE(get_rtc) \
+	KVM_EMULATE(set_rtc) \
+	NO_EMULATE(flush_asid) \
+	NO_EMULATE(flush_page) \
+	NO_EMULATE(flush_pages) \
+	USER_EMULATE(restart) \
+	USER_EMULATE(halt) \
+	USER_EMULATE(power_off) \
+	USER_EMULATE(inquire_physical) \
+	USER_EMULATE(inquire_memory_controller) \
+	KVM_EMULATE(inquire_virtual) \
+	KVM_EMULATE(inquire_asid) \
+	NO_EMULATE(console_read_if_ready) \
+	NO_EMULATE(console_write) \
+	NO_EMULATE(downcall_dispatch) \
+	KVM_EMULATE(inquire_topology) \
+	USER_EMULATE(fs_findfile) \
+	USER_EMULATE(fs_fstat) \
+	USER_EMULATE(fs_pread) \
+	KVM_EMULATE(physaddr_read64) \
+	KVM_EMULATE(physaddr_write64) \
+	USER_EMULATE(get_command_line) \
+	USER_EMULATE(set_caching) \
+	NO_EMULATE(bzero_page) \
+	KVM_EMULATE(register_message_state) \
+	KVM_EMULATE(send_message) \
+	KVM_EMULATE(receive_message) \
+	KVM_EMULATE(inquire_context) \
+	KVM_EMULATE(start_all_tiles) \
+	USER_EMULATE(dev_open) \
+	USER_EMULATE(dev_close) \
+	USER_EMULATE(dev_pread) \
+	USER_EMULATE(dev_pwrite) \
+	USER_EMULATE(dev_poll) \
+	USER_EMULATE(dev_poll_cancel) \
+	USER_EMULATE(dev_preada) \
+	USER_EMULATE(dev_pwritea) \
+	USER_EMULATE(flush_remote) \
+	NO_EMULATE(console_putc) \
+	KVM_EMULATE(inquire_tiles) \
+	KVM_EMULATE(confstr) \
+	USER_EMULATE(reexec) \
+	USER_EMULATE(set_command_line) \
+	USER_EMULATE(store_mapping) \
+	NO_EMULATE(inquire_realpa) \
+	NO_EMULATE(flush_all) \
+	KVM_EMULATE(get_ipi_pte) \
+	KVM_EMULATE(set_pte_super_shift) \
+	KVM_EMULATE(set_speed) \
+	/* For others */ \
+	USER_HCALL(virtio)
+
+#endif
+
+#endif /* _UAPI_ASM_TILE_KVM_H */
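
HCALL_DEFS is a single list that both the kernel and userspace expand
into a dispatch table by first defining KVM_EMULATE()/USER_EMULATE()/
NO_EMULATE()/USER_HCALL() appropriately; the kernel-side definitions are
in arch/tile/include/asm/kvm.h above, and the qemu-side ones sit just
before the list.  A minimal sketch of the kernel-side table, assuming a
handler signature of int (*)(struct kvm_vcpu *) (the real signature
lives in kvm-tile.c and is not shown in this hunk):

	static int (*const hcall_table[KVM_NUM_HCALLS])(struct kvm_vcpu *) = {
		HCALL_DEFS
	};

	static int dispatch_hcall_example(struct kvm_vcpu *vcpu,
					  unsigned long num)
	{
		if (num >= KVM_NUM_HCALLS || hcall_table[num] == NULL)
			return kvm_emulate_illegal(vcpu);
		return hcall_table[num](vcpu);
	}
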
diff --git a/arch/tile/include/uapi/asm/kvm_virtio.h b/arch/tile/include/uapi/asm/kvm_virtio.h
new file mode 100644
index 0000000..d94f535
--- /dev/null
+++ b/arch/tile/include/uapi/asm/kvm_virtio.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#ifndef _UAPI_ASM_TILE_KVM_VIRTIO_H
+#define _UAPI_ASM_TILE_KVM_VIRTIO_H
+
+#include <linux/types.h>
+
+#define KVM_VIRTIO_UNKNOWN	0
+#define KVM_VIRTIO_NOTIFY	1
+#define KVM_VIRTIO_RESET	2
+#define KVM_VIRTIO_SET_STATUS	3
+
+struct kvm_device_desc {
+	/* The device type: console, network, disk etc.  Type 0 terminates. */
+	__u8 type;
+	/* The number of virtqueues (first in config array) */
+	__u8 num_vq;
+	/*
+	 * The number of bytes of feature bits.  Multiply by 2: one for host
+	 * features and one for Guest acknowledgements.
+	 */
+	__u8 feature_len;
+	/* The number of bytes of the config array after virtqueues. */
+	__u8 config_len;
+	/* A status byte, written by the Guest. */
+	__u8 status;
+	__u64 config[0];
+};
+
+struct kvm_vqinfo {
+	/* Pointer to the information contained in the device config. */
+	struct kvm_vqconfig *config;
+	/* The address where we mapped the virtio ring, so we can unmap it. */
+	void *pages;
+};
+
+struct kvm_vqconfig {
+	/* The physical address of the virtio ring */
+	__u64 pa;
+	/* The number of entries in the virtio_ring */
+	__u64 num;
+	/* The interrupt we get when something happens. Set by the guest. */
+	__u32 irq;
+
+};
+
+
+#endif /* _UAPI_ASM_TILE_KVM_VIRTIO_H */
diff --git a/arch/tile/kernel/Makefile b/arch/tile/kernel/Makefile
index b7c8b5e..b638d3e 100644
--- a/arch/tile/kernel/Makefile
+++ b/arch/tile/kernel/Makefile
@@ -29,5 +29,6 @@ obj-$(CONFIG_TILE_USB)		+= usb.o
 obj-$(CONFIG_TILE_HVGLUE_TRACE)	+= hvglue_trace.o
 obj-$(CONFIG_FUNCTION_TRACER)	+= ftrace.o mcount_64.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
+obj-$(CONFIG_KVM_GUEST)		+= kvm_virtio.o
 
 obj-y				+= vdso/
diff --git a/arch/tile/kernel/asm-offsets.c b/arch/tile/kernel/asm-offsets.c
index 97ea6ac..0a04a16 100644
--- a/arch/tile/kernel/asm-offsets.c
+++ b/arch/tile/kernel/asm-offsets.c
@@ -20,6 +20,9 @@
 #include <linux/hardirq.h>
 #include <linux/ptrace.h>
 #include <hv/hypervisor.h>
+#ifdef CONFIG_KVM
+#include <linux/kvm_host.h>
+#endif
 
 /* Check for compatible compiler early in the build. */
 #ifdef CONFIG_TILEGX
@@ -68,6 +71,10 @@ void foo(void)
 	DEFINE(THREAD_INFO_UNALIGN_JIT_TMP_OFFSET,
 	       offsetof(struct thread_info, unalign_jit_tmp));
 #endif
+#ifdef CONFIG_KVM
+	DEFINE(THREAD_INFO_VCPU_OFFSET,
+	       offsetof(struct thread_info, vcpu));
+#endif
 
 	DEFINE(TASK_STRUCT_THREAD_KSP_OFFSET,
 	       offsetof(struct task_struct, thread.ksp));
diff --git a/arch/tile/kernel/early_printk.c b/arch/tile/kernel/early_printk.c
index b608e00..0393689 100644
--- a/arch/tile/kernel/early_printk.c
+++ b/arch/tile/kernel/early_printk.c
@@ -18,11 +18,27 @@
 #include <linux/string.h>
 #include <linux/irqflags.h>
 #include <linux/printk.h>
+#ifdef CONFIG_KVM_GUEST
+#include <linux/virtio_console.h>
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+#endif
 #include <asm/setup.h>
 #include <hv/hypervisor.h>
 
+
 static void early_hv_write(struct console *con, const char *s, unsigned n)
 {
+#ifdef CONFIG_KVM_GUEST
+	char buf[512];
+
+	if (n > sizeof(buf) - 1)
+		n = sizeof(buf) - 1;
+	memcpy(buf, s, n);
+	buf[n] = '\0';
+
+	hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(buf));
+#else
 	tile_console_write(s, n);
 
 	/*
@@ -32,6 +48,7 @@ static void early_hv_write(struct console *con, const char *s, unsigned n)
 	 */
 	if (n && s[n-1] == '\n')
 		tile_console_write("\r", 1);
+#endif
 }
 
 static struct console early_hv_console = {
diff --git a/arch/tile/kernel/head_32.S b/arch/tile/kernel/head_32.S
index f3f17b0..8d5b40f 100644
--- a/arch/tile/kernel/head_32.S
+++ b/arch/tile/kernel/head_32.S
@@ -162,8 +162,8 @@ ENTRY(swapper_pg_dir)
 	.set addr, addr + PGDIR_SIZE
 	.endr
 
-	/* The true text VAs are mapped as VA = PA + MEM_SV_INTRPT */
-	PTE MEM_SV_INTRPT, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
+	/* The true text VAs are mapped as VA = PA + MEM_SV_START */
+	PTE MEM_SV_START, 0, (1 << (HV_PTE_INDEX_READABLE - 32)) | \
 			      (1 << (HV_PTE_INDEX_EXECUTABLE - 32))
 	.org swapper_pg_dir + PGDIR_SIZE
 	END(swapper_pg_dir)
diff --git a/arch/tile/kernel/head_64.S b/arch/tile/kernel/head_64.S
index 652b814..bd0e12f 100644
--- a/arch/tile/kernel/head_64.S
+++ b/arch/tile/kernel/head_64.S
@@ -135,9 +135,9 @@ ENTRY(_start)
 1:
 
 	/* Install the interrupt base. */
-	moveli r0, hw2_last(MEM_SV_START)
-	shl16insli r0, r0, hw1(MEM_SV_START)
-	shl16insli r0, r0, hw0(MEM_SV_START)
+	moveli r0, hw2_last(intrpt_start)
+	shl16insli r0, r0, hw1(intrpt_start)
+	shl16insli r0, r0, hw0(intrpt_start)
 	mtspr SPR_INTERRUPT_VECTOR_BASE_K, r0
 
 	/* Get our processor number and save it away in SAVE_K_0. */
diff --git a/arch/tile/kernel/hvglue.S b/arch/tile/kernel/hvglue.S
index 16576c6..2914a9e 100644
--- a/arch/tile/kernel/hvglue.S
+++ b/arch/tile/kernel/hvglue.S
@@ -71,5 +71,11 @@ gensym hv_flush_all, 0x6e0, 32
 gensym hv_get_ipi_pte, 0x700, 32
 gensym hv_set_pte_super_shift, 0x720, 32
 gensym hv_set_speed, 0x740, 32
+gensym hv_install_virt_context, 0x760, 32
+gensym hv_inquire_virt_context, 0x780, 32
+gensym hv_install_guest_context, 0x7a0, 32
+gensym hv_inquire_guest_context, 0x7c0, 32
 gensym hv_console_set_ipi, 0x7e0, 32
-gensym hv_glue_internals, 0x800, 30720
+gensym hv_glue_internals, 0x800, 2048
+gensym hcall_virtio, 0x1000, 32
+gensym hv_hcall_internals, 0x1020, 28640
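(Arithmetic check on the glue table above: the reserved region previously ran
0x800 + 30720 = 0x8000; it is now split as 0x800 + 2048 = 0x1000 for the
remaining hv glue internals, 32 bytes at 0x1000 for hcall_virtio, and
0x1020 + 28640 = 0x8000 for hv_hcall_internals, so the overall area still
ends at the same offset as before.)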
diff --git a/arch/tile/kernel/hvglue_trace.c b/arch/tile/kernel/hvglue_trace.c
index 16ef6c1..3b15c76 100644
--- a/arch/tile/kernel/hvglue_trace.c
+++ b/arch/tile/kernel/hvglue_trace.c
@@ -75,6 +75,10 @@
 #define hv_get_ipi_pte _hv_get_ipi_pte
 #define hv_set_pte_super_shift _hv_set_pte_super_shift
 #define hv_set_speed _hv_set_speed
+#define hv_install_virt_context _hv_install_virt_context
+#define hv_inquire_virt_context _hv_inquire_virt_context
+#define hv_install_guest_context _hv_install_guest_context
+#define hv_inquire_guest_context _hv_inquire_guest_context
 #define hv_console_set_ipi _hv_console_set_ipi
 #include <hv/hypervisor.h>
 #undef hv_init
@@ -135,6 +139,10 @@
 #undef hv_get_ipi_pte
 #undef hv_set_pte_super_shift
 #undef hv_set_speed
+#undef hv_install_virt_context
+#undef hv_inquire_virt_context
+#undef hv_install_guest_context
+#undef hv_inquire_guest_context
 #undef hv_console_set_ipi
 
 /*
@@ -209,8 +217,14 @@ HV_WRAP3(HV_SetSpeed, hv_set_speed, unsigned long, speed, __hv64, start_cycle,
 	 unsigned long, flags)
 HV_WRAP4(int, hv_install_context, HV_PhysAddr, page_table, HV_PTE, access,
 	 HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_virt_context, HV_PhysAddr, page_table, HV_PTE, access,
+	 HV_ASID, asid, __hv32, flags)
+HV_WRAP4(int, hv_install_guest_context, HV_PhysAddr, page_table, HV_PTE, access,
+	 HV_ASID, asid, __hv32, flags)
 HV_WRAP2(int, hv_set_pte_super_shift, int, level, int, log2_count)
 HV_WRAP0(HV_Context, hv_inquire_context)
+HV_WRAP0(HV_Context, hv_inquire_virt_context)
+HV_WRAP0(HV_Context, hv_inquire_guest_context)
 HV_WRAP1(int, hv_flush_asid, HV_ASID, asid)
 HV_WRAP2(int, hv_flush_page, HV_VirtAddr, address, HV_PageSize, page_size)
 HV_WRAP3(int, hv_flush_pages, HV_VirtAddr, start, HV_PageSize, page_size,
diff --git a/arch/tile/kernel/intvec_32.S b/arch/tile/kernel/intvec_32.S
index f3d26f4..2ce69a5 100644
--- a/arch/tile/kernel/intvec_32.S
+++ b/arch/tile/kernel/intvec_32.S
@@ -353,7 +353,7 @@ intvec_\vecname:
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
 	.org    (\vecnum << 5)
-	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
 	jrp     lr
 	.popsection
 #endif
@@ -806,7 +806,7 @@ handle_interrupt:
 STD_ENTRY(interrupt_return)
 	/* If we're resuming to kernel space, don't check thread flags. */
 	{
-	 bnz    r30, .Lrestore_all  /* NMIs don't special-case user-space */
+	 bnz    r30, restore_all  /* NMIs don't special-case user-space */
 	 PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
 	}
 	lw      r29, r29
@@ -845,11 +845,11 @@ STD_ENTRY(interrupt_return)
 	 seq    r27, r27, r28
 	}
 	{
-	 bbns   r27, .Lrestore_all
+	 bbns   r27, restore_all
 	 addi   r28, r28, 8
 	}
 	sw      r29, r28
-	j       .Lrestore_all
+	j       restore_all
 
 .Lresume_userspace:
 	FEEDBACK_REENTER(interrupt_return)
@@ -887,7 +887,7 @@ STD_ENTRY(interrupt_return)
 	 auli   r1, r1, ha16(_TIF_ALLWORK_MASK)
 	}
 	and     r1, r29, r1
-	bzt     r1, .Lrestore_all
+	bzt     r1, restore_all
 
 	/*
 	 * Make sure we have all the registers saved for signal
@@ -926,7 +926,9 @@ STD_ENTRY(interrupt_return)
 	 * profile interrupt will actually disable interrupts in both SPRs
 	 * before returning, which is OK.)
 	 */
-.Lrestore_all:
+	.global restore_all
+	.type restore_all, @function
+restore_all:
 	PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
 	{
 	 lw     r0, r0
@@ -1890,8 +1892,8 @@ int_unalign:
 	push_extra_callee_saves r0
 	j       do_trap
 
-/* Include .intrpt1 array of interrupt vectors */
-	.section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+	.section ".intrpt", "ax"
 
 #define op_handle_perf_interrupt bad_intr
 #define op_handle_aux_perf_interrupt bad_intr
diff --git a/arch/tile/kernel/intvec_64.S b/arch/tile/kernel/intvec_64.S
index 30d2d02..54ae76b 100644
--- a/arch/tile/kernel/intvec_64.S
+++ b/arch/tile/kernel/intvec_64.S
@@ -29,11 +29,25 @@
 #include <arch/abi.h>
 #include <arch/interrupts.h>
 #include <arch/spr_def.h>
+#include <arch/opcode.h>
+#ifdef CONFIG_KVM
+#include <asm/kvm_host.h>
+#endif
 
 #define PTREGS_PTR(reg, ptreg) addli reg, sp, C_ABI_SAVE_AREA_SIZE + (ptreg)
 
 #define PTREGS_OFFSET_SYSCALL PTREGS_OFFSET_REG(TREG_SYSCALL_NR)
 
+#if CONFIG_KERNEL_PL == 1 || CONFIG_KERNEL_PL == 2
+/*
+ * Set "result" non-zero if ex1 holds the PL of the kernel
+ * (with or without ICS being set).  Note this works only
+ * because we never find the PL at level 3.
+ */
+# define IS_KERNEL_EX1(result, ex1) andi result, ex1, CONFIG_KERNEL_PL
+#else
+# error Recode IS_KERNEL_EX1 for CONFIG_KERNEL_PL
+#endif
 
 	.macro  push_reg reg, ptr=sp, delta=-8
 	{
@@ -302,7 +316,7 @@ intvec_\vecname:
 	mtspr   SPR_SYSTEM_SAVE_K_1, r0
 	mfspr   r0, SPR_EX_CONTEXT_K_1
 
-	andi    r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	IS_KERNEL_EX1(r0, r0)
 
 	.ifc    \vecnum, INT_DOUBLE_FAULT
 	/*
@@ -340,10 +354,6 @@ intvec_\vecname:
 	 *
 	 * Note that the hypervisor *always* sets SYSTEM_SAVE_K_2 for
 	 * any path that turns into a downcall to one of our TLB handlers.
-	 *
-	 * FIXME: if we end up never using this path, perhaps we should
-	 * prevent the hypervisor from generating downcalls in this case.
-	 * The advantage of getting a downcall is we can panic in Linux.
 	 */
 	mfspr   r0, SPR_SYSTEM_SAVE_K_2
 	{
@@ -483,6 +493,10 @@ intvec_\vecname:
 	mfspr   r2, SPR_SYSTEM_SAVE_K_3   /* address of page fault */
 	mfspr   r3, SPR_SYSTEM_SAVE_K_2   /* info about page fault */
 	.else
+	.ifc \c_routine, kvm_vpgtable_miss
+	mfspr   r2, SPR_SYSTEM_SAVE_K_3   /* address of page fault */
+	mfspr   r3, SPR_SYSTEM_SAVE_K_2   /* info about page fault */
+	.else
 	.ifc \vecnum, INT_ILL_TRANS
 	mfspr   r2, ILL_VA_PC
 	.else
@@ -505,6 +519,7 @@ intvec_\vecname:
 	.endif
 	.endif
 	.endif
+	.endif
 	/* Put function pointer in r0 */
 	moveli  r0, hw2_last(\c_routine)
 	shl16insli r0, r0, hw1(\c_routine)
@@ -518,7 +533,7 @@ intvec_\vecname:
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	.pushsection .text.intvec_feedback,"ax"
 	.org    (\vecnum << 5)
-	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt1, 1 << 8)
+	FEEDBACK_ENTER_EXPLICIT(intvec_\vecname, .intrpt, 1 << 8)
 	jrp     lr
 	.popsection
 #endif
@@ -634,24 +649,25 @@ intvec_\vecname:
 	/*
 	 * If we will be returning to the kernel, we will need to
 	 * reset the interrupt masks to the state they had before.
-	 * Set DISABLE_IRQ in flags iff we came from PL1 with irqs disabled.
+	 * Set DISABLE_IRQ in flags iff we came from kernel pl with
+	 * irqs disabled.
 	 */
-	mfspr   r32, SPR_EX_CONTEXT_K_1
+	mfspr   r22, SPR_EX_CONTEXT_K_1
 	{
-	 andi   r32, r32, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	 IS_KERNEL_EX1(r22, r22)
 	 PTREGS_PTR(r21, PTREGS_OFFSET_FLAGS)
 	}
-	beqzt   r32, 1f       /* zero if from user space */
-	IRQS_DISABLED(r32)    /* zero if irqs enabled */
+	beqzt  r22, 1f        /* zero if from user space */
+	IRQS_DISABLED(r22)    /* zero if irqs enabled */
 #if PT_FLAGS_DISABLE_IRQ != 1
 # error Value of IRQS_DISABLED used to set PT_FLAGS_DISABLE_IRQ; fix
 #endif
 1:
 	.ifnc \function,handle_syscall
 	/* Record the fact that we saved the caller-save registers above. */
-	ori     r32, r32, PT_FLAGS_CALLER_SAVES
+	ori     r22, r22, PT_FLAGS_CALLER_SAVES
 	.endif
-	st      r21, r32
+	st      r21, r22
 
 	/*
 	 * we've captured enough state to the stack (including in
@@ -691,12 +707,29 @@ intvec_\vecname:
 	move    tp, zero
 #endif
 
+	/*
+	 * Prepare the first 256 stack bytes to be rapidly accessible
+	 * without having to fetch the background data.
+	 */
+	addi    r52, sp, -64
+	{
+	 wh64   r52
+	 addi   r52, r52, -64
+	}
+	{
+	 wh64   r52
+	 addi   r52, r52, -64
+	}
+	{
+	 wh64   r52
+	 addi   r52, r52, -64
+	}
+	wh64    r52
+
 #ifdef __COLLECT_LINKER_FEEDBACK__
 	/*
 	 * Notify the feedback routines that we were in the
-	 * appropriate fixed interrupt vector area.  Note that we
-	 * still have ICS set at this point, so we can't invoke any
-	 * atomic operations or we will panic.  The feedback
+	 * appropriate fixed interrupt vector area.  The feedback
 	 * routines internally preserve r0..r10 and r30 up.
 	 */
 	.ifnc \function,handle_syscall
@@ -715,23 +748,15 @@ intvec_\vecname:
 #endif
 
 	/*
-	 * Prepare the first 256 stack bytes to be rapidly accessible
-	 * without having to fetch the background data.
+	 * Stash any interrupt state in r30..r33 for now.
+	 * This makes it easier to call C code in the code that follows.
+	 * We don't need to on the syscall path since we reload
+	 * them from the stack instead.
 	 */
-	addi    r52, sp, -64
-	{
-	 wh64   r52
-	 addi   r52, r52, -64
-	}
-	{
-	 wh64   r52
-	 addi   r52, r52, -64
-	}
-	{
-	 wh64   r52
-	 addi   r52, r52, -64
-	}
-	wh64    r52
+	.ifnc \function,handle_syscall
+	{ move r30, r0; move r31, r1 }
+	{ move r32, r2; move r33, r3 }
+	.endif
 
 #ifdef CONFIG_TRACE_IRQFLAGS
 	.ifnc \function,handle_nmi
@@ -742,17 +767,8 @@ intvec_\vecname:
 	 * For syscalls, we already have the register state saved away
 	 * on the stack, so we don't bother to do any register saves here,
 	 * and later we pop the registers back off the kernel stack.
-	 * For interrupt handlers, save r0-r3 in callee-saved registers.
 	 */
-	.ifnc \function,handle_syscall
-	{ move r30, r0; move r31, r1 }
-	{ move r32, r2; move r33, r3 }
-	.endif
 	TRACE_IRQS_OFF
-	.ifnc \function,handle_syscall
-	{ move r0, r30; move r1, r31 }
-	{ move r2, r32; move r3, r33 }
-	.endif
 	.endif
 #endif
 
@@ -801,11 +817,11 @@ handle_interrupt:
 STD_ENTRY(interrupt_return)
 	/* If we're resuming to kernel space, don't check thread flags. */
 	{
-	 bnez   r30, .Lrestore_all  /* NMIs don't special-case user-space */
+	 bnez   r30, restore_all  /* NMIs don't special-case user-space */
 	 PTREGS_PTR(r29, PTREGS_OFFSET_EX1)
 	}
 	ld      r29, r29
-	andi    r29, r29, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	IS_KERNEL_EX1(r29, r29)
 	{
 	 beqzt  r29, .Lresume_userspace
 	 move   r29, sp
@@ -817,14 +833,25 @@ STD_ENTRY(interrupt_return)
 	addli   r28, r29, THREAD_INFO_FLAGS_OFFSET
 	{
 	 ld     r28, r28
-	 addli  r29, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
+	 addli  r26, r29, THREAD_INFO_PREEMPT_COUNT_OFFSET
 	}
 	{
-	 andi   r28, r28, _TIF_NEED_RESCHED
-	 ld4s   r29, r29
+	 andi   r27, r28, _TIF_NEED_RESCHED
+	 ld4s   r26, r26
 	}
-	beqzt   r28, 1f
-	bnez    r29, 1f
+	beqzt   r27, 1f
+	bnez    r26, 1f
+#ifdef CONFIG_KVM
+	addli   r27, r29, THREAD_INFO_VCPU_OFFSET
+	ld	r27, r27
+	{
+	 beqzt  r27, 0f
+	 movei  r1, KVM_EXIT_AGAIN
+	}
+	push_extra_callee_saves r0
+	j       kvm_trigger_vmexit
+0:
+#endif
 	jal     preempt_schedule_irq
 	FEEDBACK_REENTER(interrupt_return)
 1:
@@ -846,11 +873,11 @@ STD_ENTRY(interrupt_return)
 	 cmpeq  r27, r27, r28
 	}
 	{
-	 blbc   r27, .Lrestore_all
+	 blbc   r27, restore_all
 	 addi   r28, r28, 8
 	}
 	st      r29, r28
-	j       .Lrestore_all
+	j       restore_all
 
 .Lresume_userspace:
 	FEEDBACK_REENTER(interrupt_return)
@@ -890,7 +917,7 @@ STD_ENTRY(interrupt_return)
 	 shl16insli r1, r1, hw0(_TIF_ALLWORK_MASK)
 	}
 	and     r1, r29, r1
-	beqzt   r1, .Lrestore_all
+	beqzt   r1, restore_all
 
 	/*
 	 * Make sure we have all the registers saved for signal
@@ -922,14 +949,16 @@ STD_ENTRY(interrupt_return)
 	 * ICS can only be used in very tight chunks of code to avoid
 	 * tripping over various assertions that it is off.
 	 */
-.Lrestore_all:
+	.global restore_all
+	.type restore_all, @function
+restore_all:
 	PTREGS_PTR(r0, PTREGS_OFFSET_EX1)
 	{
 	 ld      r0, r0
 	 PTREGS_PTR(r32, PTREGS_OFFSET_FLAGS)
 	}
 	{
-	 andi   r0, r0, SPR_EX_CONTEXT_1_1__PL_MASK
+	 IS_KERNEL_EX1(r0, r0)
 	 ld     r32, r32
 	}
 	bnez    r0, 1f
@@ -1000,7 +1029,7 @@ STD_ENTRY(interrupt_return)
 	pop_reg r21, sp, PTREGS_OFFSET_REG(31) - PTREGS_OFFSET_PC
 	{
 	 mtspr  SPR_EX_CONTEXT_K_1, lr
-	 andi   lr, lr, SPR_EX_CONTEXT_1_1__PL_MASK  /* mask off ICS */
+	 IS_KERNEL_EX1(lr, lr)
 	}
 	{
 	 mtspr  SPR_EX_CONTEXT_K_0, r21
@@ -1450,6 +1479,26 @@ int_unalign:
 	j       do_unaligned
 ENDPROC(hand_unalign_slow)
 
+#ifdef CONFIG_KVM
+/*
+ * Any call path that may lead to a vmexit needs to save the full
+ * callee-save register state, since if we vmexit we don't unwind
+ * the callee-saves from the C function stack frames, and instead
+ * just save away the register state from the interrupt handler as-is
+ * and later reload it directly and call back into the guest.
+ */
+	.macro  save_callee_saves_and_tailcall func
+kvm_\func:
+	push_extra_callee_saves r0
+	j       kvm_do_\func
+	ENDPROC(\func)
+	.endm
+
+	save_callee_saves_and_tailcall hypervisor_call
+	save_callee_saves_and_tailcall vpgtable_miss
+	save_callee_saves_and_tailcall vguest_fatal
+#endif
+
 /* Fill the return address stack with nonzero entries. */
 STD_ENTRY(fill_ra_stack)
 	{
@@ -1462,13 +1511,57 @@ STD_ENTRY(fill_ra_stack)
 4:	jrp	r0
 	STD_ENDPROC(fill_ra_stack)
 
+#ifdef CONFIG_KVM
+/*
+ * Handle the downcall dispatch service.  On entry, the client's
+ * system save register 3 holds the original contents of
+ * REG_SYSCALL_NR_NAME, which we need to restore before we iret to
+ * the correct interrupt vector.
+ * Note that we only support the INT_MESSAGE_RCV_DWNCL interrupt
+ * here, since this is the only interrupt handled this way on GX.
+ */
+handle_downcall_dispatch:
+	/*
+	 * If we were called from PL0, jump back to slow path.
+	 * We check just the low bit to make sure it's set, since we
+	 * can only be called from PL0 or PL1.
+	 */
+	mfspr	TREG_SYSCALL_NR_NAME, SPR_EX_CONTEXT_K_1
+	blbc	TREG_SYSCALL_NR_NAME, intvec_SWINT_0
+
+	/* Set the PC to the downcall interrupt vector, and PL to guest. */
+	mfspr	TREG_SYSCALL_NR_NAME, SPR_INTERRUPT_VECTOR_BASE_1
+	addli	TREG_SYSCALL_NR_NAME, TREG_SYSCALL_NR_NAME, \
+	 	INT_MESSAGE_RCV_DWNCL << 8
+	{
+	 mtspr	SPR_EX_CONTEXT_K_0, TREG_SYSCALL_NR_NAME
+	 movei	TREG_SYSCALL_NR_NAME, GUEST_PL | SPR_EX_CONTEXT_1_1__ICS_MASK
+	}
+	mtspr	SPR_EX_CONTEXT_K_1, TREG_SYSCALL_NR_NAME
+
+	/* Restore REG_SYSCALL_NR_NAME and return to the new vector. */
+	mfspr	TREG_SYSCALL_NR_NAME, SPR_SYSTEM_SAVE_1_3
+	iret
+
+	.macro int_hand_kvm_hcall  vecnum, vecname, c_routine, \
+	       processing=handle_interrupt
+	.org   (\vecnum << 8)
+		/* Need special code for downcall dispatch syscall. */
+		beqz TREG_SYSCALL_NR_NAME, handle_downcall_dispatch
+		__int_hand   \vecnum, \vecname, \c_routine, \processing
+	.endm
+
+#endif /* CONFIG_KVM */
+
 	.macro int_hand  vecnum, vecname, c_routine, processing=handle_interrupt
 	.org   (\vecnum << 8)
 		__int_hand   \vecnum, \vecname, \c_routine, \processing
 	.endm
 
-/* Include .intrpt1 array of interrupt vectors */
-	.section ".intrpt1", "ax"
+/* Include .intrpt array of interrupt vectors */
+	.section ".intrpt", "ax"
+	.global intrpt_start
+intrpt_start:
 
 #define op_handle_perf_interrupt bad_intr
 #define op_handle_aux_perf_interrupt bad_intr
@@ -1477,6 +1570,11 @@ STD_ENTRY(fill_ra_stack)
 #define do_hardwall_trap bad_intr
 #endif
 
+#ifndef CONFIG_KVM
+#define kvm_vpgtable_miss bad_intr
+#define kvm_vguest_fatal bad_intr
+#endif
+
 	int_hand     INT_MEM_ERROR, MEM_ERROR, do_trap
 	int_hand     INT_SINGLE_STEP_3, SINGLE_STEP_3, bad_intr
 #if CONFIG_KERNEL_PL == 2
@@ -1497,14 +1595,24 @@ STD_ENTRY(fill_ra_stack)
 	int_hand     INT_SWINT_3, SWINT_3, do_trap
 	int_hand     INT_SWINT_2, SWINT_2, do_trap
 	int_hand     INT_SWINT_1, SWINT_1, SYSCALL, handle_syscall
+#ifdef CONFIG_KVM
+	int_hand_kvm_hcall INT_SWINT_0, SWINT_0, kvm_hypervisor_call
+#else
 	int_hand     INT_SWINT_0, SWINT_0, do_trap
+#endif
 	int_hand     INT_ILL_TRANS, ILL_TRANS, do_trap
 	int_hand_unalign_fast INT_UNALIGN_DATA, UNALIGN_DATA
 	int_hand     INT_DTLB_MISS, DTLB_MISS, do_page_fault
 	int_hand     INT_DTLB_ACCESS, DTLB_ACCESS, do_page_fault
 	int_hand     INT_IDN_FIREWALL, IDN_FIREWALL, do_hardwall_trap
 	int_hand     INT_UDN_FIREWALL, UDN_FIREWALL, do_hardwall_trap
+#ifndef CONFIG_KVM_GUEST
 	int_hand     INT_TILE_TIMER, TILE_TIMER, do_timer_interrupt
+	int_hand     INT_AUX_TILE_TIMER, AUX_TILE_TIMER, bad_intr
+#else
+	int_hand     INT_TILE_TIMER, TILE_TIMER, bad_intr
+	int_hand     INT_AUX_TILE_TIMER, AUX_TILE_TIMER, do_timer_interrupt
+#endif
 	int_hand     INT_IDN_TIMER, IDN_TIMER, bad_intr
 	int_hand     INT_UDN_TIMER, UDN_TIMER, bad_intr
 	int_hand     INT_IDN_AVAIL, IDN_AVAIL, bad_intr
@@ -1534,8 +1642,10 @@ STD_ENTRY(fill_ra_stack)
 	int_hand     INT_MESSAGE_RCV_DWNCL, MESSAGE_RCV_DWNCL, \
 		     hv_message_intr
 	int_hand     INT_DEV_INTR_DWNCL, DEV_INTR_DWNCL, bad_intr
-	int_hand     INT_I_ASID, I_ASID, bad_intr
-	int_hand     INT_D_ASID, D_ASID, bad_intr
+	int_hand     INT_VPGTABLE_MISS_DWNCL, VPGTABLE_MISS_DWNCL, \
+	             kvm_vpgtable_miss
+	int_hand     INT_VGUEST_FATAL_DWNCL, VGUEST_FATAL_DWNCL, \
+		     kvm_vguest_fatal
 	int_hand     INT_DOUBLE_FAULT, DOUBLE_FAULT, do_trap
 
 	/* Synthetic interrupt delivered only by the simulator */
diff --git a/arch/tile/kernel/kvm_virtio.c b/arch/tile/kernel/kvm_virtio.c
new file mode 100644
index 0000000..c6b6c6a
--- /dev/null
+++ b/arch/tile/kernel/kvm_virtio.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+/* Based on the lguest and s390 implementations. */
+/*
+ * kvm_virtio.c - virtio for kvm on s390
+ *
+ * Copyright IBM Corp. 2008
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ *    Author(s): Christian Borntraeger <borntraeger@...ibm.com>
+ */
+
+#include <linux/bootmem.h>
+#include <linux/io.h>
+#include <linux/vmalloc.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/export.h>
+#include <linux/virtio.h>
+#include <linux/virtio_config.h>
+#include <linux/virtio_console.h>
+#include <linux/virtio_ring.h>
+#include <linux/virtio_pci.h>
+
+#include <linux/kvm_para.h>
+#include <asm/kvm_virtio.h>
+
+static void *kvm_devices;
+
+/*
+ * TODO: We do not actually use PCI virtio here; we use this value
+ * because qemu's virtqueue_init() uses VIRTIO_PCI_VRING_ALIGN.
+ * Maybe we should change both qemu and Linux to a generic definition.
+ * We should also check later whether the alignment value (4096, i.e.
+ * the default x86 page size) affects performance.
+ */
+#define KVM_TILE_VIRTIO_RING_ALIGN	VIRTIO_PCI_VRING_ALIGN
+#define to_kvmdev(vd)	container_of(vd, struct kvm_device, vdev)
+
+/*
+ * Memory layout (total: PAGE_SIZE):
+ * <device 0>
+ * - kvm device descriptor
+ *        struct kvm_device_desc
+ * - virtqueue configuration (desc->num_vq entries in total)
+ *        struct kvm_vqconfig
+ *        ......
+ *        struct kvm_vqconfig
+ * - feature bits (size: desc->feature_len * 2)
+ * - config space (size: desc->config_len)
+ * <device 1>
+ * ......
+ */
+static struct kvm_vqconfig *kvm_vq_config(const struct kvm_device_desc *desc)
+{
+	return (struct kvm_vqconfig *)(desc + 1);
+}
+
+static u8 *kvm_vq_features(const struct kvm_device_desc *desc)
+{
+	return (u8 *)(kvm_vq_config(desc) + desc->num_vq);
+}
+
+static u8 *kvm_vq_configspace(const struct kvm_device_desc *desc)
+{
+	return kvm_vq_features(desc) + desc->feature_len * 2;
+}
+
+/*
+ * The total size of the config page used by this device (incl. desc)
+ */
+static unsigned desc_size(const struct kvm_device_desc *desc)
+{
+	return sizeof(*desc)
+		+ desc->num_vq * sizeof(struct kvm_vqconfig)
+		+ desc->feature_len * 2
+		+ desc->config_len;
+}
+
+/* This gets the device's feature bits. */
+static u32 kvm_get_features(struct virtio_device *vdev)
+{
+	unsigned int i;
+	u32 features = 0;
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+	u8 *in_features = kvm_vq_features(desc);
+
+	for (i = 0; i < min(desc->feature_len * 8, 32); i++)
+		if (in_features[i / 8] & (1 << (i % 8)))
+			features |= (1 << i);
+	return features;
+}
+
+static void kvm_finalize_features(struct virtio_device *vdev)
+{
+	unsigned int i, bits;
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+	/* Second half of bitmap is features we accept. */
+	u8 *out_features = kvm_vq_features(desc) + desc->feature_len;
+
+	/* Give virtio_ring a chance to accept features. */
+	vring_transport_features(vdev);
+
+	memset(out_features, 0, desc->feature_len);
+	bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
+	for (i = 0; i < bits; i++) {
+		if (test_bit(i, vdev->features))
+			out_features[i / 8] |= (1 << (i % 8));
+	}
+}
+
+/*
+ * Reading and writing elements in config space
+ */
+static void kvm_get(struct virtio_device *vdev, unsigned int offset,
+		   void *buf, unsigned len)
+{
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+
+	BUG_ON(offset + len > desc->config_len);
+	memcpy(buf, kvm_vq_configspace(desc) + offset, len);
+}
+
+static void kvm_set(struct virtio_device *vdev, unsigned int offset,
+		   const void *buf, unsigned len)
+{
+	struct kvm_device_desc *desc = to_kvmdev(vdev)->desc;
+
+	BUG_ON(offset + len > desc->config_len);
+	memcpy(kvm_vq_configspace(desc) + offset, buf, len);
+}
+
+/*
+ * The operations to get and set the status word just access the status
+ * field of the device descriptor.  set_status also makes a hypercall to
+ * the host to notify it of status changes.
+ */
+static u8 kvm_get_status(struct virtio_device *vdev)
+{
+	return to_kvmdev(vdev)->desc->status;
+}
+
+static void kvm_set_status(struct virtio_device *vdev, u8 status)
+{
+	BUG_ON(!status);
+	to_kvmdev(vdev)->desc->status = status;
+	hcall_virtio(KVM_VIRTIO_SET_STATUS, to_kvmdev(vdev)->desc_pa);
+}
+
+/*
+ * To reset the device, we use the KVM_VIRTIO_RESET hypercall, using the
+ * descriptor address. The Host will zero the status and all the
+ * features.
+ */
+static void kvm_reset(struct virtio_device *vdev)
+{
+	hcall_virtio(KVM_VIRTIO_RESET, to_kvmdev(vdev)->desc_pa);
+}
+
+/*
+ * When the virtio_ring code wants to notify the Host, it calls us here and we
+ * make a hypercall.  We hand it the address of the virtqueue so the Host
+ * knows which virtqueue we're talking about.
+ */
+static void kvm_notify(struct virtqueue *vq)
+{
+	struct kvm_vqinfo *vqi = vq->priv;
+
+	hcall_virtio(KVM_VIRTIO_NOTIFY, vqi->config->pa);
+}
+
+/*
+ * Must set some caching mode to keep set_pte() happy.
+ * It doesn't matter what we choose, because the PFN
+ * is illegal, so we're going to take a page fault anyway.
+ */
+static inline pgprot_t io_prot(void)
+{
+	return hv_pte_set_mode(PAGE_KERNEL, HV_PTE_MODE_UNCACHED);
+}
+
+/*
+ * This routine finds the requested virtqueue described in the configuration
+ * of this device and sets it up.
+ */
+static struct virtqueue *kvm_find_vq(struct virtio_device *vdev,
+				     unsigned index,
+				     void (*callback)(struct virtqueue *vq),
+				     const char *name)
+{
+	struct kvm_device *kdev = to_kvmdev(vdev);
+	struct kvm_vqinfo *vqi;
+	struct kvm_vqconfig *config;
+	struct virtqueue *vq;
+	long irq;
+	int err = -EINVAL;
+
+	if (index >= kdev->desc->num_vq)
+		return ERR_PTR(-ENOENT);
+
+	vqi = kzalloc(sizeof(*vqi), GFP_KERNEL);
+	if (!vqi)
+		return ERR_PTR(-ENOMEM);
+
+	config = kvm_vq_config(kdev->desc)+index;
+
+	vqi->config = config;
+	vqi->pages = generic_remap_prot(config->pa,
+				vring_size(config->num,
+					KVM_TILE_VIRTIO_RING_ALIGN),
+					0, io_prot());
+	if (!vqi->pages) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	vq = vring_new_virtqueue(index, config->num, KVM_TILE_VIRTIO_RING_ALIGN,
+				 vdev, 0, vqi->pages,
+				 kvm_notify, callback, name);
+	if (!vq) {
+		err = -ENOMEM;
+		goto unmap;
+	}
+
+	/*
+	 * Trigger the IPI interrupt in software.
+	 * TODO: We do not need a separate irq for each vq; this is a bit wasteful.
+	 */
+	irq = create_irq();
+	if (irq < 0) {
+		err = -ENXIO;
+		goto del_virtqueue;
+	}
+
+	tile_irq_activate(irq, TILE_IRQ_SW_CLEAR);
+
+	if (request_irq(irq, vring_interrupt, 0, dev_name(&vdev->dev), vq)) {
+		err = -ENXIO;
+		destroy_irq(irq);
+		goto del_virtqueue;
+	}
+
+	config->irq = irq;
+
+	vq->priv = vqi;
+	return vq;
+
+del_virtqueue:
+	vring_del_virtqueue(vq);
+unmap:
+	vunmap(vqi->pages);
+out:
+	return ERR_PTR(err);
+}
+
+static void kvm_del_vq(struct virtqueue *vq)
+{
+	struct kvm_vqinfo *vqi = vq->priv;
+
+	vring_del_virtqueue(vq);
+	vunmap(vqi->pages);
+	kfree(vqi);
+}
+
+static void kvm_del_vqs(struct virtio_device *vdev)
+{
+	struct virtqueue *vq, *n;
+
+	list_for_each_entry_safe(vq, n, &vdev->vqs, list)
+		kvm_del_vq(vq);
+}
+
+static int kvm_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+			struct virtqueue *vqs[],
+			vq_callback_t *callbacks[],
+			const char *names[])
+{
+	struct kvm_device *kdev = to_kvmdev(vdev);
+	int i;
+
+	/* The device must supply at least this many virtqueues. */
+	if (nvqs > kdev->desc->num_vq)
+		return -ENOENT;
+
+	for (i = 0; i < nvqs; ++i) {
+		vqs[i] = kvm_find_vq(vdev, i, callbacks[i], names[i]);
+		if (IS_ERR(vqs[i]))
+			goto error;
+	}
+	return 0;
+
+error:
+	kvm_del_vqs(vdev);
+	return PTR_ERR(vqs[i]);
+}
+
+/*
+ * The config ops structure as defined by virtio config
+ */
+static struct virtio_config_ops kvm_vq_config_ops = {
+	.get_features = kvm_get_features,
+	.finalize_features = kvm_finalize_features,
+	.get = kvm_get,
+	.set = kvm_set,
+	.get_status = kvm_get_status,
+	.set_status = kvm_set_status,
+	.reset = kvm_reset,
+	.find_vqs = kvm_find_vqs,
+	.del_vqs = kvm_del_vqs,
+};
+
+/*
+ * The root device for the kvm virtio devices.
+ * This makes them appear as /sys/devices/kvm_tile/0,1,2 not /sys/devices/0,1,2.
+ */
+static struct device *kvm_root;
+
+/*
+ * Add a new device and register it with virtio.
+ * The appropriate driver is loaded by the device model.
+ */
+static void add_kvm_device(struct kvm_device_desc *d, unsigned int offset)
+{
+	struct kvm_device *kdev;
+
+	kdev = kzalloc(sizeof(*kdev), GFP_KERNEL);
+	if (!kdev) {
+		pr_emerg("Cannot allocate kvm dev %u type %u\n",
+			 offset, d->type);
+		return;
+	}
+
+	kdev->vdev.dev.parent = kvm_root;
+	kdev->vdev.id.device = d->type;
+	kdev->vdev.config = &kvm_vq_config_ops;
+	kdev->desc = d;
+	kdev->desc_pa = PFN_PHYS(max_pfn) + offset;
+
+	if (register_virtio_device(&kdev->vdev) != 0) {
+		pr_err("Failed to register kvm device %u type %u\n",
+		       offset, d->type);
+		kfree(kdev);
+	}
+}
+
+/*
+ * scan_devices() simply iterates through the device page.
+ * Type 0 is reserved to mean "end of devices".
+ */
+static void scan_devices(void)
+{
+	unsigned int i;
+	struct kvm_device_desc *d;
+
+	for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
+		d = kvm_devices + i;
+
+		if (d->type == 0)
+			break;
+
+		add_kvm_device(d, i);
+	}
+}
+
+/*
+ * Init function for virtio.
+ * Devices are described in a single page just above the top of "normal" memory.
+ */
+static int __init kvm_devices_init(void)
+{
+	int rc = -ENOMEM;
+
+	kvm_root = root_device_register("kvm_tile");
+	if (IS_ERR(kvm_root)) {
+		rc = PTR_ERR(kvm_root);
+		pr_err("Could not register kvm_tile root device");
+		return rc;
+	}
+
+	kvm_devices = generic_remap_prot(PFN_PHYS(max_pfn), PAGE_SIZE,
+					 0, io_prot());
+	if (!kvm_devices) {
+		kvm_devices = NULL;
+		root_device_unregister(kvm_root);
+		return rc;
+	}
+
+	scan_devices();
+	return 0;
+}
+
+/* code for early console output with virtio_console */
+static __init int early_put_chars(u32 vtermno, const char *buf, int len)
+{
+	char scratch[512];
+
+	if (len > sizeof(scratch) - 1)
+		len = sizeof(scratch) - 1;
+	scratch[len] = '\0';
+	memcpy(scratch, buf, len);
+	hcall_virtio(KVM_VIRTIO_NOTIFY, __pa(scratch));
+
+	return len;
+}
+
+static int __init tile_virtio_console_init(void)
+{
+	return virtio_cons_early_init(early_put_chars);
+}
+console_initcall(tile_virtio_console_init);
+
+/*
+ * We do this after core stuff, but before the drivers.
+ */
+postcore_initcall(kvm_devices_init);
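To make the device-page layout that scan_devices() walks more concrete, here
is a rough host-side sketch (hypothetical; it is not the user-space code that
accompanies this patch).  It repeats the structure layout from
<asm/kvm_virtio.h> and appends one single-virtqueue device to a zeroed page;
the next descriptor starts immediately after it, and a type of 0 terminates
the list:

#include <stdint.h>
#include <string.h>

struct kvm_vqconfig { uint64_t pa, num; uint32_t irq; };
struct kvm_device_desc {
	uint8_t type, num_vq, feature_len, config_len, status;
	uint64_t config[];	/* keeps the uapi header's 8-byte size/alignment */
};

/* Append one device with a single virtqueue; returns bytes consumed. */
static size_t emit_device(uint8_t *page, uint8_t type,
			  uint64_t ring_pa, uint64_t ring_num)
{
	struct kvm_device_desc d = {
		.type = type, .num_vq = 1, .feature_len = 4, .config_len = 0,
	};
	struct kvm_vqconfig vq = { .pa = ring_pa, .num = ring_num };
	size_t off = 0;

	memcpy(page + off, &d, sizeof(d));
	off += sizeof(d);
	memcpy(page + off, &vq, sizeof(vq));
	off += sizeof(vq);
	off += 2 * d.feature_len;	/* host features + guest acks, left zeroed */
	off += d.config_len;		/* config space (none for this device) */
	return off;			/* next descriptor starts here */
}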
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index 44cdc4a..2629ff1 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -27,6 +27,7 @@
 #include <linux/kernel.h>
 #include <linux/tracehook.h>
 #include <linux/signal.h>
+#include <linux/kvm_host.h>
 #include <asm/stack.h>
 #include <asm/switch_to.h>
 #include <asm/homecache.h>
@@ -247,11 +248,13 @@ struct task_struct *validate_current(void)
 /* Take and return the pointer to the previous task, for schedule_tail(). */
 struct task_struct *sim_notify_fork(struct task_struct *prev)
 {
+#ifndef CONFIG_KVM_GUEST   /* see notify_sim_task_change() */
 	struct task_struct *tsk = current;
 	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK_PARENT |
 		     (tsk->thread.creator_pid << _SIM_CONTROL_OPERATOR_BITS));
 	__insn_mtspr(SPR_SIM_CONTROL, SIM_CONTROL_OS_FORK |
 		     (tsk->pid << _SIM_CONTROL_OPERATOR_BITS));
+#endif
 	return prev;
 }
 
@@ -450,6 +453,11 @@ void _prepare_arch_switch(struct task_struct *next)
 struct task_struct *__sched _switch_to(struct task_struct *prev,
 				       struct task_struct *next)
 {
+#ifdef CONFIG_KVM
+	/* vmexit is needed before context switch. */
+	BUG_ON(task_thread_info(prev)->vcpu);
+#endif
+
 	/* DMA state is already saved; save off other arch state. */
 	save_arch_state(&prev->thread);
 
@@ -519,6 +527,29 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 	/* Enable interrupts; they are disabled again on return to caller. */
 	local_irq_enable();
 
+#ifdef CONFIG_KVM
+	/*
+	 * Some work requires us to exit the VM first.  Typically this
+	 * allows the process running the VM to respond to the work
+	 * (e.g. a signal), or allows the VM mechanism to latch
+	 * modified host state (e.g. a "hypervisor" message sent to a
+	 * different vcpu).  It also means that if we are considering
+	 * calling schedule(), we exit the VM first, so we never have
+	 * to worry about context-switching into a VM.
+	 */
+	if (current_thread_info()->vcpu) {
+		u32 do_exit = thread_info_flags &
+			(_TIF_NEED_RESCHED|_TIF_SIGPENDING|_TIF_VIRT_EXIT);
+
+		if (thread_info_flags & _TIF_VIRT_EXIT)
+			clear_thread_flag(TIF_VIRT_EXIT);
+		if (do_exit) {
+			kvm_trigger_vmexit(regs, KVM_EXIT_AGAIN);
+			/*NORETURN*/
+		}
+	}
+#endif
+
 	if (thread_info_flags & _TIF_NEED_RESCHED) {
 		schedule();
 		return 1;
@@ -538,11 +569,12 @@ int do_work_pending(struct pt_regs *regs, u32 thread_info_flags)
 		tracehook_notify_resume(regs);
 		return 1;
 	}
-	if (thread_info_flags & _TIF_SINGLESTEP) {
+
+	/* Handle a few flags here that stay set. */
+	if (thread_info_flags & _TIF_SINGLESTEP)
 		single_step_once(regs);
-		return 0;
-	}
-	panic("work_pending: bad flags %#x\n", thread_info_flags);
+
+	return 0;
 }
 
 unsigned long get_wchan(struct task_struct *p)
diff --git a/arch/tile/kernel/relocate_kernel_64.S b/arch/tile/kernel/relocate_kernel_64.S
index 1c09a4f..02bc446 100644
--- a/arch/tile/kernel/relocate_kernel_64.S
+++ b/arch/tile/kernel/relocate_kernel_64.S
@@ -34,11 +34,11 @@ STD_ENTRY(relocate_new_kernel)
 	addi	sp, sp, -8
 	/* we now have a stack (whether we need one or not) */
 
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r40, hw2_last(hv_console_putc)
 	shl16insli r40, r40, hw1(hv_console_putc)
 	shl16insli r40, r40, hw0(hv_console_putc)
 
-#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, 'r'
 	jalr	r40
 
@@ -176,10 +176,12 @@ STD_ENTRY(relocate_new_kernel)
 
 	/* we should not get here */
 
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
 	moveli	r0, '?'
 	jalr	r40
 	moveli	r0, '\n'
 	jalr	r40
+#endif
 
 	j	.Lhalt
 
@@ -237,7 +239,9 @@ STD_ENTRY(relocate_new_kernel)
 	j	.Lloop
 
 
-.Lerr:	moveli	r0, 'e'
+.Lerr:
+#ifdef RELOCATE_NEW_KERNEL_VERBOSE
+	moveli	r0, 'e'
 	jalr	r40
 	moveli	r0, 'r'
 	jalr	r40
@@ -245,6 +249,7 @@ STD_ENTRY(relocate_new_kernel)
 	jalr	r40
 	moveli	r0, '\n'
 	jalr	r40
+#endif
 .Lhalt:
 	moveli r41, hw2_last(hv_halt)
 	shl16insli r41, r41, hw1(hv_halt)
diff --git a/arch/tile/kernel/setup.c b/arch/tile/kernel/setup.c
index 774e819..2352a81 100644
--- a/arch/tile/kernel/setup.c
+++ b/arch/tile/kernel/setup.c
@@ -268,7 +268,7 @@ early_param("vmalloc", parse_vmalloc);
 /*
  * Determine for each controller where its lowmem is mapped and how much of
  * it is mapped there.  On controller zero, the first few megabytes are
- * already mapped in as code at MEM_SV_INTRPT, so in principle we could
+ * already mapped in as code at MEM_SV_START, so in principle we could
  * start our data mappings higher up, but for now we don't bother, to avoid
  * additional confusion.
  *
@@ -1074,7 +1074,20 @@ void __cpuinit setup_cpu(int boot)
 	 * SPRs, as well as the interrupt mask.
 	 */
 	__insn_mtspr(SPR_MPL_INTCTRL_0_SET_0, 1);
+
+#ifdef CONFIG_KVM
+	/*
+	 * If we launch a guest kernel, it will need some interrupts
+	 * that otherwise are not used by the host or by userspace.
+	 * Set them to MPL 1 now and leave them alone going forward;
+	 * they are masked in the host so will never fire there anyway,
+	 * and we mask them at PL1 as we exit the guest.
+	 */
 	__insn_mtspr(SPR_MPL_INTCTRL_1_SET_1, 1);
+	__insn_mtspr(SPR_MPL_SINGLE_STEP_1_SET_1, 1);
+	__insn_mtspr(SPR_MPL_AUX_TILE_TIMER_SET_1, 1);
+	__insn_mtspr(SPR_MPL_IPI_1_SET_1, 1);
+#endif
 
 	/* Initialize IRQ support for this cpu. */
 	setup_irq_regs();
@@ -1242,7 +1255,7 @@ static void __init validate_va(void)
 #ifndef __tilegx__   /* FIXME: GX: probably some validation relevant here */
 	/*
 	 * Similarly, make sure we're only using allowed VAs.
-	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_INTRPT,
+	 * We assume we can contiguously use MEM_USER_INTRPT .. MEM_HV_START,
 	 * and 0 .. KERNEL_HIGH_VADDR.
 	 * In addition, make sure we CAN'T use the end of memory, since
 	 * we use the last chunk of each pgd for the pgd_list.
@@ -1257,7 +1270,7 @@ static void __init validate_va(void)
 		if (range.size == 0)
 			break;
 		if (range.start <= MEM_USER_INTRPT &&
-		    range.start + range.size >= MEM_HV_INTRPT)
+		    range.start + range.size >= MEM_HV_START)
 			user_kernel_ok = 1;
 		if (range.start == 0)
 			max_va = range.size;
@@ -1693,7 +1706,7 @@ insert_ram_resource(u64 start_pfn, u64 end_pfn, bool reserved)
 static int __init request_standard_resources(void)
 {
 	int i;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
 #if defined(CONFIG_PCI) && !defined(__tilegx__)
 	insert_non_bus_resource();
diff --git a/arch/tile/kernel/smp.c b/arch/tile/kernel/smp.c
index 0ae1c59..62b3ba9 100644
--- a/arch/tile/kernel/smp.c
+++ b/arch/tile/kernel/smp.c
@@ -223,30 +223,34 @@ void __init ipi_init(void)
 
 #if CHIP_HAS_IPI()
 
-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
 {
-	WARN_ON(cpu_is_offline(cpu));
-
 	/*
 	 * We just want to do an MMIO store.  The traditional writeq()
 	 * functions aren't really correct here, since they're always
 	 * directed at the PCI shim.  For now, just do a raw store,
-	 * casting away the __iomem attribute.
+	 * casting away the __iomem attribute.  We do the store as a
+	 * single asm() instruction so that, in the KVM case where vcpus
+	 * are not bound to cpus, we can force a single step over it
+	 * rather than requiring that it be possible to issue it validly.
 	 */
-	((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE] = 0;
+	unsigned long *addr =
+		&((unsigned long __force *)ipi_mappings[cpu])[IRQ_RESCHEDULE];
+	asm volatile("st %0, zero" :: "r" (addr));
 }
 
 #else
 
-void smp_send_reschedule(int cpu)
+static void __smp_send_reschedule(int cpu)
 {
-	HV_Coord coord;
-
-	WARN_ON(cpu_is_offline(cpu));
-
-	coord.y = cpu_y(cpu);
-	coord.x = cpu_x(cpu);
+	HV_Coord coord = { .y = cpu_y(cpu), .x = cpu_x(cpu) };
 	hv_trigger_ipi(coord, IRQ_RESCHEDULE);
 }
 
 #endif /* CHIP_HAS_IPI() */
+
+void smp_send_reschedule(int cpu)
+{
+	WARN_ON(cpu_is_offline(cpu));
+	__smp_send_reschedule(cpu);
+}
diff --git a/arch/tile/kernel/stack.c b/arch/tile/kernel/stack.c
index 24fd223..362284a 100644
--- a/arch/tile/kernel/stack.c
+++ b/arch/tile/kernel/stack.c
@@ -103,7 +103,7 @@ static struct pt_regs *valid_fault_handler(struct KBacktraceIterator* kbt)
 	    p->sp >= sp) {
 		if (kbt->verbose)
 			pr_err("  <%s while in kernel mode>\n", fault);
-	} else if (EX1_PL(p->ex1) == USER_PL &&
+	} else if (user_mode(p) &&
 		   p->sp < PAGE_OFFSET && p->sp != 0) {
 		if (kbt->verbose)
 			pr_err("  <%s while in user mode>\n", fault);
diff --git a/arch/tile/kernel/sysfs.c b/arch/tile/kernel/sysfs.c
index e25b0a8..024b978 100644
--- a/arch/tile/kernel/sysfs.c
+++ b/arch/tile/kernel/sysfs.c
@@ -69,7 +69,11 @@ static ssize_t type_show(struct device *dev,
 			    struct device_attribute *attr,
 			    char *page)
 {
+#ifdef CONFIG_KVM_GUEST
+	return sprintf(page, "KVM\n");
+#else
 	return sprintf(page, "tilera\n");
+#endif
 }
 static DEVICE_ATTR(type, 0444, type_show, NULL);
 
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index 3c2dc87..b0b7264 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -117,9 +117,9 @@ void __init time_init(void)
 
 /*
  * Define the tile timer clock event device.  The timer is driven by
- * the TILE_TIMER_CONTROL register, which consists of a 31-bit down
+ * the TILE_[AUX_]TIMER_CONTROL register, which consists of a 31-bit down
  * counter, plus bit 31, which signifies that the counter has wrapped
- * from zero to (2**31) - 1.  The INT_TILE_TIMER interrupt will be
+ * from zero to (2**31) - 1.  The INT_[AUX_]TILE_TIMER interrupt will be
  * raised as long as bit 31 is set.
  */
 
@@ -129,8 +129,8 @@ static int tile_timer_set_next_event(unsigned long ticks,
 				     struct clock_event_device *evt)
 {
 	BUG_ON(ticks > MAX_TICK);
-	__insn_mtspr(SPR_TILE_TIMER_CONTROL, ticks);
-	arch_local_irq_unmask_now(INT_TILE_TIMER);
+	__insn_mtspr(SPR_LINUX_TIMER_CONTROL, ticks);
+	arch_local_irq_unmask_now(INT_LINUX_TIMER);
 	return 0;
 }
 
@@ -141,7 +141,7 @@ static int tile_timer_set_next_event(unsigned long ticks,
 static void tile_timer_set_mode(enum clock_event_mode mode,
 				struct clock_event_device *evt)
 {
-	arch_local_irq_mask_now(INT_TILE_TIMER);
+	arch_local_irq_mask_now(INT_LINUX_TIMER);
 }
 
 static DEFINE_PER_CPU(struct clock_event_device, tile_timer) = {
@@ -161,7 +161,7 @@ void __cpuinit setup_tile_timer(void)
 	evt->cpumask = cpumask_of(smp_processor_id());
 
 	/* Start out with timer not firing. */
-	arch_local_irq_mask_now(INT_TILE_TIMER);
+	arch_local_irq_mask_now(INT_LINUX_TIMER);
 
 	/*
 	 * Register tile timer.  Set min_delta to 1 microsecond, since
@@ -181,7 +181,7 @@ void do_timer_interrupt(struct pt_regs *regs, int fault_num)
 	 * Mask the timer interrupt here, since we are a oneshot timer
 	 * and there are now by definition no events pending.
 	 */
-	arch_local_irq_mask(INT_TILE_TIMER);
+	arch_local_irq_mask(INT_LINUX_TIMER);
 
 	/* Track time spent here in an interrupt context */
 	irq_enter();
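SPR_LINUX_TIMER_CONTROL and INT_LINUX_TIMER are not defined in the hunks
shown here; a hedged sketch of the aliasing they presumably provide,
consistent with the interrupt-vector change in intvec_64.S above (the real
definitions may differ):

#ifdef CONFIG_KVM_GUEST
#define SPR_LINUX_TIMER_CONTROL	SPR_AUX_TILE_TIMER_CONTROL
#define INT_LINUX_TIMER		INT_AUX_TILE_TIMER
#else
#define SPR_LINUX_TIMER_CONTROL	SPR_TILE_TIMER_CONTROL
#define INT_LINUX_TIMER		INT_TILE_TIMER
#endif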
diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c
index f110785..19d465c 100644
--- a/arch/tile/kernel/traps.c
+++ b/arch/tile/kernel/traps.c
@@ -30,7 +30,7 @@
 
 void __init trap_init(void)
 {
-	/* Nothing needed here since we link code at .intrpt1 */
+	/* Nothing needed here since we link code at .intrpt */
 }
 
 int unaligned_fixup = 1;
diff --git a/arch/tile/kernel/vmlinux.lds.S b/arch/tile/kernel/vmlinux.lds.S
index c7ae53d..8b20163 100644
--- a/arch/tile/kernel/vmlinux.lds.S
+++ b/arch/tile/kernel/vmlinux.lds.S
@@ -5,7 +5,7 @@
 #include <hv/hypervisor.h>
 
 /* Text loads starting from the supervisor interrupt vector address. */
-#define TEXT_OFFSET MEM_SV_INTRPT
+#define TEXT_OFFSET MEM_SV_START
 
 OUTPUT_ARCH(tile)
 ENTRY(_start)
@@ -13,7 +13,7 @@ jiffies = jiffies_64;
 
 PHDRS
 {
-  intrpt1 PT_LOAD ;
+  intrpt PT_LOAD ;
   text PT_LOAD ;
   data PT_LOAD ;
 }
@@ -24,11 +24,11 @@ SECTIONS
   #define LOAD_OFFSET TEXT_OFFSET
 
   /* Interrupt vectors */
-  .intrpt1 (LOAD_OFFSET) : AT ( 0 )   /* put at the start of physical memory */
+  .intrpt (LOAD_OFFSET) : AT ( 0 )   /* put at the start of physical memory */
   {
     _text = .;
-    *(.intrpt1)
-  } :intrpt1 =0
+    *(.intrpt)
+  } :intrpt =0
 
   /* Hypervisor call vectors */
   . = ALIGN(0x10000);
diff --git a/arch/tile/kvm/Kconfig b/arch/tile/kvm/Kconfig
index 2298cb1..65f7f9d 100644
--- a/arch/tile/kvm/Kconfig
+++ b/arch/tile/kvm/Kconfig
@@ -27,9 +27,6 @@ config KVM
 	  This module provides access to the hardware capabilities through
 	  a character device node named /dev/kvm.
 
-	  To compile this as a module, choose M here: the module
-	  will be called kvm.
-
 	  If unsure, say N.
 
 source drivers/vhost/Kconfig
diff --git a/arch/tile/kvm/Makefile b/arch/tile/kvm/Makefile
new file mode 100644
index 0000000..2c3d206
--- /dev/null
+++ b/arch/tile/kvm/Makefile
@@ -0,0 +1,12 @@
+#
+# Makefile for Kernel-based Virtual Machine module
+#
+
+ccflags-y := -Ivirt/kvm -Iarch/tile/kvm
+
+kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o)
+
+kvm-y += kvm-tile.o
+kvm-y += entry.o
+
+obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/tile/kvm/entry.S b/arch/tile/kvm/entry.S
new file mode 100644
index 0000000..07aa3a6
--- /dev/null
+++ b/arch/tile/kvm/entry.S
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/linkage.h>
+#include <asm/switch_to.h>
+#include <asm/processor.h>
+#include <arch/spr_def.h>
+#include <arch/abi.h>
+
+#define FRAME_SIZE ((4 + CALLEE_SAVED_REGS_COUNT) * 8)
+#define SAVE_REG(r) { st r12, r; addi r12, r12, 8 }
+#define LOAD_REG(r) { ld r, r12; addi r12, r12, 8 }
+#define FOR_EACH_CALLEE_SAVED_REG(f)					\
+							f(r30); f(r31); \
+	f(r32); f(r33); f(r34); f(r35);	f(r36); f(r37); f(r38); f(r39); \
+	f(r40); f(r41); f(r42); f(r43); f(r44); f(r45); f(r46); f(r47); \
+	f(r48); f(r49); f(r50); f(r51); f(r52);
+
+/*
+ * Called with interrupts disabled from kvm_tile_run() and is responsible
+ * just for saving the callee-save registers and the stack pointer, then
+ * resetting ksp0 so subsequent interrupts don't wipe the kernel stack.
+ * It uses restore_all in intvec_64.S to jump back into the guest.
+ * The kvm_vmexit function below undoes the stack manipulation.
+ */
+STD_ENTRY(kvm_vmresume)
+	/* Do function prolog and save callee-saves on stack. */
+	{
+	  move r10, sp
+	  st sp, lr
+	}
+	{
+	  addli r11, sp, -FRAME_SIZE + 8
+	  addli sp, sp, -FRAME_SIZE
+	}
+	{
+	  st r11, r10
+	  addi r12, sp, 16
+	}
+	FOR_EACH_CALLEE_SAVED_REG(SAVE_REG)
+	SAVE_REG(tp)
+	SAVE_REG(lr)
+
+	/* Save frame pointer in thread_info so we can get it back later. */
+	st r1, sp
+
+	/* Set the ksp0 for this core to be below this frame. */
+	mfspr r10, SPR_SYSTEM_SAVE_K_0
+	bfins r10, sp, 0, CPU_SHIFT-1
+	mtspr SPR_SYSTEM_SAVE_K_0, r10
+
+	/* sp points to ABI save area below pt_regs for restore_all. */
+	addli sp, r0, -C_ABI_SAVE_AREA_SIZE
+
+	/* Execute an "interrupt return" to the guest. */
+	{
+	 movei r30, 0
+	 j restore_all
+	}
+	STD_ENDPROC(kvm_vmresume)
+
+/*
+ * Called with interrupts disabled from kvm_trigger_vmexit(); returns with
+ * interrupts still disabled to kvm_vmresume()'s caller, discarding all the
+ * stack contents below the kvm_vmresume() frame.  kvm_vmresume()'s caller
+ * is responsible for resetting SPR_SYSTEM_SAVE_K_0 to its previous value.
+ */
+STD_ENTRY(kvm_vmexit)
+	{
+	 move sp, r0
+	 addi r12, r0, 16
+	}
+	FOR_EACH_CALLEE_SAVED_REG(LOAD_REG)
+	LOAD_REG(tp)
+	LOAD_REG(lr)
+	{
+	  addli sp, sp, FRAME_SIZE
+	  jrp lr
+	}
+	STD_ENDPROC(kvm_vmexit)
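As a reading aid, a hedged C-level sketch of how these two entry points are
meant to pair up, inferred from the register usage in the comments above
(r0/r1 are the first two argument registers).  The actual caller lives in
kvm-tile.c; run_vcpu_once() and host_vmresume_sp are placeholders, not code
from this patch, and the real code apparently parks the frame pointer in
thread_info rather than in a static variable:

/* Prototypes inferred from the assembly above. */
void kvm_vmresume(struct pt_regs *guest_regs, unsigned long *host_sp_slot);
void kvm_vmexit(unsigned long host_sp);

/* Placeholder for where the host sp is parked across the guest run. */
static unsigned long host_vmresume_sp;

static int run_vcpu_once(struct kvm_vcpu *vcpu)
{
	local_irq_disable();
	kvm_vmresume(&vcpu->arch.regs, &host_vmresume_sp);
	/*
	 * We only get back here after a guest trap leads the host to call
	 * kvm_vmexit(host_vmresume_sp), which discards the intervening
	 * stack, restores the callee-saves, and returns as if
	 * kvm_vmresume() had returned normally.
	 */
	local_irq_enable();
	return vcpu->run->exit_reason;
}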
diff --git a/arch/tile/kvm/kvm-tile.c b/arch/tile/kvm/kvm-tile.c
new file mode 100644
index 0000000..29b601a
--- /dev/null
+++ b/arch/tile/kvm/kvm-tile.c
@@ -0,0 +1,1585 @@
+/*
+ * Copyright 2013 Tilera Corporation. All Rights Reserved.
+ *
+ *   This program is free software; you can redistribute it and/or
+ *   modify it under the terms of the GNU General Public License
+ *   as published by the Free Software Foundation, version 2.
+ *
+ *   This program is distributed in the hope that it will be useful, but
+ *   WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ *   NON INFRINGEMENT.  See the GNU General Public License for
+ *   more details.
+ */
+
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/kvm_types.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/ptrace.h>
+#include <asm/traps.h>
+#include <asm/pgalloc.h>
+#include <hv/hypervisor.h>
+#include <linux/rtc.h>
+#include <asm/atomic.h>
+#include <asm/tlbflush.h>
+#include <arch/spr_def.h>
+#include <arch/sim.h>
+#include <generated/utsrelease.h>
+
+
+struct kvm_stats_debugfs_item debugfs_entries[] = {
+	{ NULL }
+};
+
+static pte_t *get_vpgd_pte(struct kvm *kvm, unsigned long address)
+{
+	struct mm_struct *mm = kvm->mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	if (kvm->arch.vpgd == NULL)
+		kvm->arch.vpgd = pgd_alloc(kvm->mm);
+	pgd = kvm->arch.vpgd + pgd_index(address);
+	pud = pud_alloc(mm, pgd, address);
+	if (!pud)
+		return NULL;
+	pmd = pmd_alloc(mm, pud, address);
+	if (!pmd)
+		return NULL;
+	return pte_alloc_kernel(pmd, address);
+}
+
+int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
+{
+	return VM_FAULT_SIGBUS;
+}
+
+void kvm_arch_free_memslot(struct kvm_memory_slot *free,
+			   struct kvm_memory_slot *dont)
+{
+}
+
+int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages)
+{
+	return 0;
+}
+
+/* FIXME: support huge pages. */
+int kvm_arch_prepare_memory_region(struct kvm *kvm,
+				   struct kvm_memory_slot *memslot,
+				   struct kvm_userspace_memory_region *mem,
+				   enum kvm_mr_change change)
+{
+	unsigned long gpa, i;
+
+	gpa = mem->guest_phys_addr;
+	for (i = 0; i < mem->memory_size; i += PAGE_SIZE, gpa += PAGE_SIZE)
+		if (get_vpgd_pte(kvm, gpa) == NULL)
+			return -ENOMEM;
+
+	return 0;
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+				   struct kvm_userspace_memory_region *mem,
+				   const struct kvm_memory_slot *old,
+				   enum kvm_mr_change change)
+{
+	unsigned long gpa, address, pfn, i;
+	struct page *page[1];
+	pte_t *ptep, *vptep;
+
+	gpa = mem->guest_phys_addr;
+	address = mem->userspace_addr;
+	for (i = 0; i < mem->memory_size;
+	     i += PAGE_SIZE, gpa += PAGE_SIZE, address += PAGE_SIZE) {
+		vptep = get_vpgd_pte(kvm, gpa);
+		BUG_ON(vptep == NULL);
+		get_user_pages_fast(address, 1, 1, page);
+		pfn = page_to_pfn(page[0]);
+		ptep = virt_to_pte(NULL, (unsigned long)__va(PFN_PHYS(pfn)));
+		*vptep = *ptep;
+	}
+}
+
+void kvm_arch_flush_shadow_all(struct kvm *kvm)
+{
+}
+
+void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
+				   struct kvm_memory_slot *slot)
+{
+	kvm_arch_flush_shadow_all(kvm);
+}
+
+gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
+{
+	return 0;
+}
+
+long kvm_arch_dev_ioctl(struct file *filp,
+			unsigned int ioctl, unsigned long arg)
+{
+	return 0;
+}
+
+static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, unsigned long irq)
+{
+	if (irq >= BITS_PER_LONG)	/* ipi_events is a single-word bitmap */
+		return -EINVAL;
+
+	set_bit(irq, &vcpu->arch.ipi_events);
+	kvm_vcpu_kick(vcpu);
+
+	return 0;
+}
+
+long kvm_arch_vcpu_ioctl(struct file *filp,
+			 unsigned int ioctl, unsigned long arg)
+{
+	struct kvm_vcpu *vcpu = filp->private_data;
+	void __user *argp = (void __user *)arg;
+	int r = 0;
+
+	switch (ioctl) {
+	case KVM_INTERRUPT: {
+		struct kvm_interrupt irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&irq, argp, sizeof(irq)))
+			goto out;
+		r = kvm_vcpu_ioctl_interrupt(vcpu, irq.irq);
+		if (r)
+			goto out;
+		r = 0;
+		break;
+	}
+	case KVM_TILE_RESET_SPR: {
+		/* Initialize guest SPR values */
+		vcpu->arch.timer_control =
+			1UL << SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT;
+		vcpu->arch.vmexit_cycles = get_cycles();
+		vcpu->arch.INTERRUPT_MASK_1 = -1UL;
+		vcpu->arch.INTERRUPT_VECTOR_BASE_1 = 0xfd000000;
+		vcpu->arch.IPI_MASK_1 = -1UL;
+		break;
+	}
+	default:
+		r = -EINVAL;
+	}
+
+out:
+	return r;
+}
+
+int kvm_dev_ioctl_check_extension(long ext)
+{
+	return 0;
+}
+
+int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
+			       struct kvm_dirty_log *log)
+{
+	return 0;
+}
+
+long kvm_arch_vm_ioctl(struct file *filp,
+		       unsigned int ioctl, unsigned long arg)
+{
+	long r = -EINVAL;
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
+				  struct kvm_translation *tr)
+{
+	struct kvm *kvm = vcpu->kvm;
+	unsigned long page_size;
+	unsigned long gva = tr->linear_address;
+	unsigned long gpgd_gpa, gpmd_gpa, gpte_gpa;
+	pud_t gpud;
+	pmd_t gpmd;
+	pte_t gpte;
+
+	/* Get guest pgd (aka pud for three-level tables). */
+	gpgd_gpa = vcpu->arch.guest_context.page_table +
+		(sizeof(pgd_t) * pgd_index(gva));
+	if (kvm_read_guest(kvm, gpgd_gpa, &gpud, sizeof(pgd_t)) < 0)
+		goto fail;
+	if (!pud_present(gpud))
+		goto fail;
+
+	/* Get guest pmd. */
+	if (pud_huge_page(gpud)) {
+		/* FIXME: no super huge page support yet. */
+		if (pte_super(*(pte_t *)&gpud))
+			goto fail;
+		gpte = *(pte_t *)&gpud;
+		page_size = PGDIR_SIZE;
+		goto ok;
+	}
+	gpmd_gpa = (pud_ptfn(gpud) << HV_LOG2_PAGE_TABLE_ALIGN) +
+		(sizeof(pmd_t) * pmd_index(gva));
+	if (kvm_read_guest(kvm, gpmd_gpa, &gpmd, sizeof(pmd_t)) < 0)
+		goto fail;
+	if (!pmd_present(gpmd))
+		goto fail;
+
+	/* Get guest pte. */
+	if (pmd_huge_page(gpmd)) {
+		/* FIXME: no super huge page support yet. */
+		if (pte_super(*(pte_t *)&gpmd))
+			goto fail;
+		gpte = *(pte_t *)&gpmd;
+		page_size = PMD_SIZE;
+		goto ok;
+	}
+	gpte_gpa = (pmd_ptfn(gpmd) << HV_LOG2_PAGE_TABLE_ALIGN) +
+		(sizeof(pte_t) * pte_index(gva));
+	if (kvm_read_guest(kvm, gpte_gpa, &gpte, sizeof(pte_t)) < 0)
+		goto fail;
+	if (!pte_present(gpte))
+		goto fail;
+
+	page_size = PAGE_SIZE;
+
+ok:
+	tr->physical_address =
+		PFN_PHYS(pte_pfn(gpte)) + (gva & (page_size - 1));
+	tr->valid = 1;
+	tr->writeable = pte_write(gpte);
+	tr->usermode = pte_user(gpte);
+
+	return 0;
+
+fail:
+	tr->valid = 0;
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	regs->regs = vcpu->arch.regs;
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
+{
+	vcpu->arch.regs = regs->regs;
+	vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
+				  struct kvm_sregs *sregs)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
+				    struct kvm_mp_state *mp_state)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
+				    struct kvm_mp_state *mp_state)
+{
+	return 0;
+}
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					struct kvm_guest_debug *dbg)
+{
+	return 0;
+}
+
+/*
+ * panic_hv() dumps stack info for both the guest and the host OS, and sets
+ * a suitable exit reason so that qemu can terminate the guest process.
+ *
+ * FIXME: Probably KVM_EXIT_EXCEPTION would be better?  But with that exit
+ * reason the current qemu process "hangs" (killable, but Ctrl+C does not
+ * work), so use KVM_EXIT_SHUTDOWN here temporarily.
+ */
+static int panic_hv(struct kvm_vcpu *vcpu, const char *fmt, ...)
+{
+	char panic_buf[256];
+	struct pt_regs *regs;
+	va_list ap;
+	int i;
+
+	va_start(ap, fmt);
+	vsnprintf(panic_buf, sizeof(panic_buf), fmt, ap);
+	va_end(ap);
+	pr_err("KVM guest panic (vcpu %d) - %s\n", vcpu->vcpu_id, panic_buf);
+
+	/* Show guest os info */
+	regs = &vcpu->arch.regs;
+	for (i = 0; i < 17; i++)
+		pr_err(" r%-2d: "REGFMT" r%-2d: "REGFMT" r%-2d: "REGFMT"\n",
+		       i, regs->regs[i], i+18, regs->regs[i+18],
+		       i+36, regs->regs[i+36]);
+	pr_err(" r18: "REGFMT" r35: "REGFMT" tp : "REGFMT"\n",
+	       regs->regs[18], regs->regs[35], regs->tp);
+	pr_err(" sp : "REGFMT" lr : "REGFMT"\n", regs->sp, regs->lr);
+	pr_err(" pc : "REGFMT" ex1: %ld     faultnum: %ld\n",
+	       regs->pc, regs->ex1, regs->faultnum);
+
+	/* Show host os info */
+	pr_err("\nKVM stack in the host:\n");
+	dump_stack();
+
+	/* Shut down the guest os */
+	pr_err("Shutting down guest.\n");
+	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
+	return 0;
+}
+
+/* Copied from virt/kvm/kvm_main.c */
+static int next_segment(unsigned long len, int offset)
+{
+	if (len > PAGE_SIZE - offset)
+		return PAGE_SIZE - offset;
+	else
+		return len;
+}
+
+static int kvm_read_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+			     void *data, unsigned long len)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int seg;
+	int offset = offset_in_page(gva);
+	int ret;
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		struct kvm_translation tr;
+		tr.linear_address = gva;
+		kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		if (!tr.valid)
+			return -EFAULT;
+		ret = kvm_read_guest_page(kvm, PFN_DOWN(tr.physical_address),
+					  data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;
+		len -= seg;
+		data += seg;
+		gva += seg;
+	}
+	return 0;
+}
+
+static int kvm_write_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+			      const void *data, unsigned long len)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int seg;
+	int offset = offset_in_page(gva);
+	int ret;
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		struct kvm_translation tr;
+		tr.linear_address = gva;
+		kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		if (!tr.valid)
+			return -EFAULT;
+		ret = kvm_write_guest_page(kvm, PFN_DOWN(tr.physical_address),
+					   data, offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;
+		len -= seg;
+		data += seg;
+		gva += seg;
+	}
+	return 0;
+}
+
+static int kvm_clear_guest_va(struct kvm_vcpu *vcpu, unsigned long gva,
+			      unsigned long len)
+{
+	struct kvm *kvm = vcpu->kvm;
+	int seg;
+	int offset = offset_in_page(gva);
+	int ret;
+
+	while ((seg = next_segment(len, offset)) != 0) {
+		struct kvm_translation tr;
+		tr.linear_address = gva;
+		kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+		if (!tr.valid)
+			return -EFAULT;
+		ret = kvm_clear_guest_page(kvm, PFN_DOWN(tr.physical_address),
+					   offset, seg);
+		if (ret < 0)
+			return ret;
+		offset = 0;
+		len -= seg;
+		gva += seg;
+	}
+	return 0;
+}
+
+/*
+ * The following functions are emulation functions for various
+ * hypervisor system calls (i.e. hv_*()). Return value:
+ *   1 if the host os can emulate it completely.
+ *   < 0 if errors occur and then qemu will handle them.
+ *   0 if qemu emulation is needed.
+ * In both the < 0 and the == 0 cases, exit reason should
+ * be set for qemu handling.
+ */
+
+/* generic handler for hypercall which needs user (QEMU) to handle. */
+static int kvm_deliver_to_user(struct kvm_vcpu *vcpu)
+{
+	vcpu->run->exit_reason = KVM_EXIT_HYPERCALL;
+	return 0;
+}
+
+/* handler for illegal hypercall */
+static int kvm_emulate_illegal(struct kvm_vcpu *vcpu)
+{
+	return panic_hv(vcpu, "Illegal kvm hypercall: %ld",
+			(unsigned long)vcpu->arch.regs.regs[10]);
+}
+
+static int kvm_emulate_hv_init(struct kvm_vcpu *vcpu)
+{
+	int version = vcpu->arch.regs.regs[0];
+	int chip_num = vcpu->arch.regs.regs[1];
+	int chip_rev_num = vcpu->arch.regs.regs[2];
+	int client_pl = vcpu->arch.regs.regs[3];
+
+	if (client_pl != 1)
+		return panic_hv(vcpu, "Guest is requesting PL %d, but KVM"
+				" guests must request PL 1.\n"
+				"Reconfigure your guest with KVM_GUEST set.\n",
+				client_pl);
+
+	if (version != HV_VERSION)
+		return panic_hv(vcpu, "Client built for hv version %d, but"
+				" this hv is version %d\n",
+				version, HV_VERSION);
+
+	if (chip_num != TILE_CHIP)
+		return panic_hv(vcpu, "Client built for chip %d, but this"
+				" hardware is chip %d\n",
+				chip_num, TILE_CHIP);
+
+	if (chip_rev_num != TILE_CHIP_REV)
+		return panic_hv(vcpu, "Client built for chip rev %d, but this"
+				" hardware is chip rev %d\n",
+				chip_rev_num, TILE_CHIP_REV);
+
+	return 1;
+}
+
+static int kvm_emulate_hv_sysconf(struct kvm_vcpu *vcpu)
+{
+	HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+	long rc;
+
+	switch (query) {
+	case HV_SYSCONF_PAGE_SIZE_SMALL:
+		rc = PAGE_SIZE;
+		break;
+
+	case HV_SYSCONF_PAGE_SIZE_LARGE:
+		rc = HPAGE_SIZE;
+		break;
+
+	case HV_SYSCONF_VALID_PAGE_SIZES:
+#if PAGE_SHIFT == 16
+		rc = HV_CTX_PG_SM_64K;
+#elif PAGE_SHIFT == 14
+		rc = HV_CTX_PG_SM_16K;
+#else
+# error Fix hv_sysconf emulation for new page size
+#endif
+		break;
+
+	case HV_SYSCONF_PAGE_SIZE_JUMBO:
+		rc = 0;  /* FIXME add super page support */
+		break;
+
+	case HV_SYSCONF_CPU_SPEED:
+	case HV_SYSCONF_CPU_TEMP:
+	case HV_SYSCONF_BOARD_TEMP:
+		rc = hv_sysconf(query);
+		break;
+
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	vcpu->arch.regs.regs[0] = rc;
+	return 1;
+}
+
+static int kvm_emulate_hv_confstr(struct kvm_vcpu *vcpu)
+{
+	HV_SysconfQuery query = (HV_SysconfQuery)vcpu->arch.regs.regs[0];
+	long buflen = vcpu->arch.regs.regs[2];
+	char hvbuf[256];
+	const char *p;
+	long rc;
+
+	switch (query) {
+
+	/* For hardware attributes, just pass to the hypervisor. */
+	case HV_CONFSTR_BOARD_PART_NUM:
+	case HV_CONFSTR_BOARD_SERIAL_NUM:
+	case HV_CONFSTR_CHIP_SERIAL_NUM:
+	case HV_CONFSTR_BOARD_REV:
+	case HV_CONFSTR_CHIP_MODEL:
+	case HV_CONFSTR_BOARD_DESC:
+	case HV_CONFSTR_MEZZ_PART_NUM:
+	case HV_CONFSTR_MEZZ_SERIAL_NUM:
+	case HV_CONFSTR_MEZZ_REV:
+	case HV_CONFSTR_MEZZ_DESC:
+	case HV_CONFSTR_SWITCH_CONTROL:
+	case HV_CONFSTR_CHIP_REV:
+	case HV_CONFSTR_CPUMOD_PART_NUM:
+	case HV_CONFSTR_CPUMOD_SERIAL_NUM:
+	case HV_CONFSTR_CPUMOD_REV:
+	case HV_CONFSTR_CPUMOD_DESC:
+		rc = hv_confstr(query, (HV_VirtAddr)hvbuf, sizeof(hvbuf));
+		if (rc > sizeof(hvbuf)) {
+			/* Not the best answer, but very unlikely anyway. */
+			rc = sizeof(hvbuf);
+			hvbuf[sizeof(hvbuf)-1] = '\0';
+		}
+		p = hvbuf;
+		break;
+
+	/* For hypervisor version info, just report the kernel version. */
+	case HV_CONFSTR_HV_SW_VER:
+		p = UTS_RELEASE;
+		break;
+	case HV_CONFSTR_HV_CONFIG:
+	case HV_CONFSTR_HV_CONFIG_VER:
+		p = "";
+		break;
+
+	default:
+		rc = HV_EINVAL;
+		goto done;
+	}
+
+	rc = strlen(p) + 1;  /* include NUL */
+	if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[1],
+			       p, min(rc, buflen)))
+		rc = HV_EFAULT;
+
+done:
+	vcpu->arch.regs.regs[0] = rc;
+	return 1;
+}
+
+static int kvm_emulate_hv_get_rtc(struct kvm_vcpu *vcpu)
+{
+	HV_RTCTime *hvtm = (HV_RTCTime *) &vcpu->arch.regs.regs[0];
+	struct rtc_time tm;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	rtc_time_to_tm(tv.tv_sec, &tm);
+	hvtm->tm_sec = tm.tm_sec;
+	hvtm->tm_min = tm.tm_min;
+	hvtm->tm_hour = tm.tm_hour;
+	hvtm->tm_mday = tm.tm_mday;
+	hvtm->tm_mon = tm.tm_mon;
+	hvtm->tm_year = tm.tm_year;
+	hvtm->flags = 0;
+
+	return 1;
+}
+
+static int kvm_emulate_hv_set_rtc(struct kvm_vcpu *vcpu)
+{
+	/* Do nothing here. */
+	pr_warn("hv_set_rtc() will not work in kvm guest\n");
+	return 1;
+}
+
+static int kvm_emulate_hv_inquire_virtual(struct kvm_vcpu *vcpu)
+{
+	int idx = vcpu->arch.regs.regs[0];
+	HV_VirtAddrRange *var = (HV_VirtAddrRange *)&vcpu->arch.regs.regs[0];
+
+	switch (idx) {
+	case 0:
+		var->start =                  0UL;
+		var->size  =       0x20000000000UL;
+		break;
+	case 1:
+		var->start = 0xFFFFFFFF80000000UL;
+		var->size  =         0x80000000UL;
+		break;
+	default:
+		var->start =                  0UL;
+		var->size  =                  0UL;
+		break;
+	}
+
+	return 1;
+}
+
+/* Give all the ASIDs to the guest; we flush the whole TLB anyway. */
+static int kvm_emulate_hv_inquire_asid(struct kvm_vcpu *vcpu)
+{
+	int idx = vcpu->arch.regs.regs[0];
+	HV_ASIDRange *var = (HV_ASIDRange *)&vcpu->arch.regs.regs[0];
+
+	if (idx == 0) {
+		var->start = min_asid;
+		var->size = max_asid - min_asid + 1;
+	} else {
+		var->start = 0;
+		var->size = 0;
+	}
+
+	return 1;
+}
+
+static int kvm_emulate_hv_inquire_topology(struct kvm_vcpu *vcpu)
+{
+	HV_Topology *tp;
+	int cpus;
+
+	/* Depends on the definition of struct HV_Topology */
+	tp = (HV_Topology *)&vcpu->arch.regs.regs[0];
+
+	cpus = atomic_read(&vcpu->kvm->online_vcpus);
+	tp->coord.x = vcpu->vcpu_id;
+	tp->coord.y = 0;
+	tp->width = cpus;
+	tp->height = 1;
+
+	return 1;
+}
+
+static int xy_to_vcpu(struct kvm *kvm, int x, int y)
+{
+	if (y != 0 || x < 0 || x >= atomic_read(&kvm->online_vcpus))
+		return -1;
+	return x;
+}
+
+/*
+ * The primary vcpu is the one that initially runs while the others
+ * all block.  It is the only one allowed to call hv_start_all_tiles().
+ * The other cpus are secondary.
+ */
+static bool is_secondary_vcpu(struct kvm_vcpu *vcpu)
+{
+	return vcpu->vcpu_id != 0;
+}
+
+static int kvm_emulate_hv_start_all_tiles(struct kvm_vcpu *vcpu)
+{
+	struct completion *c = &vcpu->kvm->arch.smp_start;
+	if (is_secondary_vcpu(vcpu) || completion_done(c))
+		return panic_hv(vcpu, "start_all_tiles() called again");
+	complete_all(c);
+	return 1;
+}
+
+static int kvm_emulate_hv_physaddr_read64(struct kvm_vcpu *vcpu)
+{
+	gpa_t gpa = vcpu->arch.regs.regs[0];
+	HV_PTE *access = (HV_PTE *) &vcpu->arch.regs.regs[1];
+	gfn_t gfn;
+	pfn_t pfn;
+	hpa_t hpa;
+
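+	/* Convert the guest PA to a host PA via the memslot mapping. */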
+	gfn = gpa_to_gfn(gpa);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	if (is_error_pfn(pfn))
+		return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+			 gpa);
+	hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+	vcpu->arch.regs.regs[0] = hv_physaddr_read64(hpa, *access);
+
+	return 1;
+}
+
+static int kvm_emulate_hv_physaddr_write64(struct kvm_vcpu *vcpu)
+{
+	gpa_t gpa = vcpu->arch.regs.regs[0];
+	HV_PTE *access = (HV_PTE *)vcpu->arch.regs.regs[1];
+	uint64_t val = vcpu->arch.regs.regs[2];
+	gfn_t gfn;
+	pfn_t pfn;
+	hpa_t hpa;
+
+	gfn = gpa_to_gfn(gpa);
+	pfn = gfn_to_pfn(vcpu->kvm, gfn);
+	if (is_error_pfn(pfn))
+		return panic_hv(vcpu, "bogus PA %llx in physaddr_write64()",
+			 gpa);
+	hpa = pfn_to_hpa(pfn) | (gpa & ~PAGE_MASK);
+
+	hv_physaddr_write64(hpa, *access, val);
+
+	return 1;
+}
+
+static int kvm_emulate_hv_register_message_state(struct kvm_vcpu *vcpu)
+{
+	/* Do we care about the argument msgstate? */
+	vcpu->arch.regs.regs[0] = HV_OK;
+
+	return 1;
+}
+
+/*
+ * NOTE: we may coalesce multiple messages with the same tag to the
+ * same recipient.  Currently the only messages used by Linux are
+ * start/stop cpu (where coalescing is OK), and the smp_call_function()
+ * IPI message tag.  In the latter case we rely on the generic
+ * smp_call_function code to properly handle this, and since it only
+ * uses the IPI as a way to wake up the generic list-walking code,
+ * it's OK if we coalesce several IPI deliveries before the recipient
+ * core takes action.
+ */
+static int kvm_emulate_hv_send_message(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu *vcpui;
+	HV_Recipient recip[NR_CPUS];
+	HV_Recipient *recips = (HV_Recipient *)vcpu->arch.regs.regs[0];
+	int nrecip = vcpu->arch.regs.regs[1];
+	int buflen = vcpu->arch.regs.regs[3];
+	int sent, vcpu_id, tag;
+
+	/* NOTE: we only support the Linux usage of buflen == sizeof(int). */
+	if (unlikely(buflen != sizeof(int) ||
+		     nrecip >= atomic_read(&kvm->online_vcpus))) {
+		vcpu->arch.regs.regs[0] = HV_EINVAL;
+		return 1;
+	}
+
+	/* Get the buf info */
+	if (kvm_read_guest_va(vcpu, vcpu->arch.regs.regs[2],
+			      &tag, sizeof(tag))) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	/* Range-check the tag value. */
+	if (tag < 0 || tag >= MAX_MSG_TAG) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	/* Get all the recipients */
+	if (kvm_read_guest_va(vcpu, (unsigned long)recips, &recip,
+			      nrecip * sizeof(HV_Recipient))) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	for (sent = 0; sent < nrecip; sent++) {
+		if (recip[sent].state != HV_TO_BE_SENT)
+			continue;
+		vcpu_id = xy_to_vcpu(kvm, recip[sent].x, recip[sent].y);
+		if (unlikely(vcpu_id < 0 || vcpu_id == vcpu->vcpu_id)) {
+			recip[sent].state = HV_BAD_RECIP;
+			continue;
+		}
+		vcpui = kvm_get_vcpu(kvm, vcpu_id);
+		set_bit(tag, &vcpui->arch.pending_msgs);
+		kvm_vcpu_kick(vcpui);
+		recip[sent].state = HV_SENT;
+	}
+
+	if (kvm_write_guest_va(vcpu, (unsigned long)recips, &recip,
+			       nrecip * sizeof(HV_Recipient))) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	vcpu->arch.regs.regs[0] = sent;
+
+	return 1;
+}
+
+static int kvm_emulate_hv_receive_message(struct kvm_vcpu *vcpu)
+{
+	HV_RcvMsgInfo *rmi = (HV_RcvMsgInfo *)&vcpu->arch.regs.regs[0];
+	int buflen = vcpu->arch.regs.regs[3];
+	int tag;
+
+	/* Currently we only support messages from other tiles. */
+	rmi->source = HV_MSG_TILE;
+
+	if (buflen <= sizeof(int)) {
+		rmi->msglen = HV_E2BIG;
+		return 1;
+	}
+
+	tag = find_first_bit(&vcpu->arch.pending_msgs, MAX_MSG_TAG);
+	if (tag >= MAX_MSG_TAG) {
+		/* No more messages */
+		rmi->msglen = 0;
+		return 1;
+	}
+
+	if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+			       &tag, sizeof(int))) {
+		rmi->msglen = HV_EFAULT;
+		return 1;
+	}
+
+	/*
+	 * This clear_bit could race with a set_bit as another core
+	 * delivers a new smp_function_call to this core.  However,
+	 * the smp_function_call code will have set up the additional
+	 * smp_function_call data on the kernel's list prior to
+	 * raising the interrupt, so even if we lose the new
+	 * interrupt due to the race, we still haven't dispatched
+	 * to the original interrupt handler, and when we do, it
+	 * will find both smp_function_calls waiting for it, so the
+	 * race is harmless.  This is consistent with the fact that
+	 * the generic code is trying to support pretty much
+	 * arbitrary architecture-dependent IPI semantics, so it
+	 * is very conservative about what it assumes.
+	 *
+	 * Also note that we only clear_bit on the core that owns
+	 * the mask, so there's no race condition caused by the
+	 * find_first_bit above and the clear_bit here, since once
+	 * a bit is found it will stay set until this point.
+	 */
+	clear_bit(tag, &vcpu->arch.pending_msgs);
+	rmi->msglen = sizeof(int);
+	return 1;
+}
+
+static int kvm_emulate_hv_inquire_context(struct kvm_vcpu *vcpu)
+{
+	HV_Context *ctx = (HV_Context *) &vcpu->arch.regs.regs[0];
+
+	*ctx = hv_inquire_guest_context();
+
+	return 1;
+}
+
+static int kvm_emulate_hv_inquire_tiles(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	HV_InqTileSet set = vcpu->arch.regs.regs[0];
+	unsigned long gva = vcpu->arch.regs.regs[1];
+	int length = vcpu->arch.regs.regs[2];
+	struct cpumask mask = CPU_MASK_NONE;
+	int cpus, i, retval, bytes2copy, bytes2zero;
+
+	switch (set) {
+	case HV_INQ_TILES_AVAIL:
+	case HV_INQ_TILES_HFH_CACHE:
+	case HV_INQ_TILES_LOTAR:
+		cpus = atomic_read(&kvm->online_vcpus);
+		for (i = 0; i < cpus; ++i)
+			cpumask_set_cpu(i, &mask);
+		break;
+	case HV_INQ_TILES_SHARED:
+		break;
+	default:
+		retval = HV_EINVAL;
+		goto done;
+	}
+
+	bytes2copy = (length > sizeof(mask)) ? sizeof(mask) : length;
+	bytes2zero = length - bytes2copy;
+
+	if (kvm_write_guest_va(vcpu, gva, &mask, bytes2copy)) {
+		retval = HV_EFAULT;
+		goto done;
+	}
+
+	if (kvm_clear_guest_va(vcpu, gva + bytes2copy, bytes2zero)) {
+		retval = HV_EFAULT;
+		goto done;
+	}
+
+	retval = HV_OK;
+done:
+	vcpu->arch.regs.regs[0] = retval;
+	return 1;
+}
+
+static int kvm_emulate_hv_get_ipi_pte(struct kvm_vcpu *vcpu)
+{
+	HV_Coord vtarget = *(HV_Coord *)&vcpu->arch.regs.regs[0];
+	int pl = (int) vcpu->arch.regs.regs[1];
+	struct kvm_vcpu *target_vcpu;
+	int vcpu_id;
+
+	vcpu_id = vtarget.x;
+	if (pl != GUEST_PL || vtarget.y != 0 || vcpu_id < 0 ||
+	    vcpu_id >= atomic_read(&vcpu->kvm->online_vcpus)) {
+		vcpu->arch.regs.regs[0] = HV_EINVAL;
+		return 1;
+	}
+
+	target_vcpu = kvm_get_vcpu(vcpu->kvm, vcpu_id);
+	if (kvm_write_guest_va(vcpu, vcpu->arch.regs.regs[2],
+			    &target_vcpu->arch.ipi_gpte, sizeof(pte_t))) {
+		vcpu->arch.regs.regs[0] = HV_EFAULT;
+		return 1;
+	}
+
+	vcpu->arch.regs.regs[0] = HV_OK;
+
+	return 1;
+}
+
+struct kvm_vcpu *ipi_vcpu_lookup(struct kvm *kvm, unsigned long gpa)
+{
+	struct kvm_vcpu *vcpui;
+	unsigned long idx;
+
+	kvm_for_each_vcpu(idx, vcpui, kvm)
+		if (vcpui->arch.ipi_gpa == gpa)
+			return vcpui;
+
+	return NULL;
+}
+
+/*
+ * Most page faults are downcalled from the hypervisor and handled directly
+ * by either the guest OS or the host OS.  This function handles the
+ * remaining cases.
+ */
+static int handle_mmio(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_translation tr;
+	struct kvm_vcpu *ipi_vcpu;
+
+	tr.linear_address = (__u64) vcpu->arch.fault_addr;
+	kvm_arch_vcpu_ioctl_translate(vcpu, &tr);
+	if (!tr.valid)
+		return 0;
+
+	/* ipi PTE for rescheduling interrupt? */
+	ipi_vcpu = ipi_vcpu_lookup(kvm, tr.physical_address);
+	if (!ipi_vcpu)
+		return 0;
+
+	set_bit(IRQ_RESCHEDULE, &ipi_vcpu->arch.ipi_events);
+	kvm_vcpu_kick(ipi_vcpu);
+
+	/* Juke the PC past the store instruction. */
+	vcpu->arch.regs.pc += 8;
+	return 1;
+}
+
+static int kvm_emulate_hv_set_pte_super_shift(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * We do not expect this call from the guest.  The guest OS should
+	 * simply follow the host's settings rather than *set* its own.
+	 * Besides, hv_set_pte_super_shift() is never called by the guest OS
+	 * with the current guest configuration.
+	 */
+	vcpu->arch.regs.regs[0] = HV_EINVAL;
+
+	return 1;
+}
+
+static int kvm_emulate_hv_set_speed(struct kvm_vcpu *vcpu)
+{
+	HV_SetSpeed *hvss = (HV_SetSpeed *) &vcpu->arch.regs.regs[0];
+
+	hvss->new_speed = HV_EPERM;
+	hvss->end_cycle = 0;
+	hvss->delta_ns = 0;
+
+	return 1;
+}
+
+static int (*hcall_handlers[KVM_NUM_HCALLS])(struct kvm_vcpu *vcpu) = {
+	HCALL_DEFS
+};
+
+static int kvm_handle_exit(struct kvm_vcpu *vcpu)
+{
+	unsigned long hcall_idx;
+
+	switch (vcpu->run->exit_reason) {
+	case KVM_EXIT_HYPERCALL:
+		hcall_idx = vcpu->arch.regs.regs[10];
+		if (unlikely(hcall_idx >= KVM_NUM_HCALLS ||
+			     hcall_handlers[hcall_idx] == NULL))
+			return kvm_emulate_illegal(vcpu);
+
+		/* Juke us past the swint0 when we return. */
+		vcpu->arch.regs.pc += 8;
+
+		return hcall_handlers[hcall_idx](vcpu);
+
+	case KVM_EXIT_MMIO:
+		if (handle_mmio(vcpu))
+			return 1;
+		return panic_hv(vcpu, "Out-of-bounds client memory access");
+
+	case KVM_EXIT_AGAIN:
+		return 1;
+
+	default:
+		return 0;
+	}
+}
+
+static void kvm_kick_func(void *info)
+{
+	struct kvm_vcpu *vcpu = info;
+
+	/* If this is not the thread that we expect, just return. */
+	if (unlikely(vcpu->pid != get_task_pid(current, PIDTYPE_PID)))
+		return;
+
+	/* Setting this flag will cause a vmexit instead of a vmresume. */
+	set_thread_flag(TIF_VIRT_EXIT);
+}
+
+/* Note: this function is now a standard kvm interface in recent kernels. */
+void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
+{
+	int me, cpu;
+
+	/* If it is waiting in kvm_vcpu_block(), wake it up. */
+	if (waitqueue_active(&vcpu->wq))
+		wake_up_interruptible(&vcpu->wq);
+
+	/* If we are kicking our own vcpu, make sure we vmexit. */
+	if (vcpu == current_thread_info()->vcpu) {
+		set_thread_flag(TIF_VIRT_EXIT);
+		return;
+	}
+
+	/*
+	 * If the vcpu is running the guest, interrupt its cpu,
+	 * causing it to vmexit by setting TIF_VIRT_EXIT.  Note we can
+	 * race with a guest already doing a vmexit, but that is benign.
+	 */
+	cpu = vcpu->cpu;
+	me = get_cpu();
+	if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
+		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+			smp_call_function_single(cpu, kvm_kick_func, vcpu, 0);
+	put_cpu();
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
+
+/*
+ * Any interrupt that would normally be handled by the host at PL2
+ * needs to be reassigned to the guest at PL1 as we enter.
+ *
+ * The TLB interrupts remain handled by the hypervisor and are downcalled
+ * to the appropriate host or guest as necessary.
+ *
+ * FIXME: We don't give the UDN interrupts for now; at some point we
+ * plan to allow an option to pin the vcpus and report the true
+ * geometry to the guest, at which point passing the UDN access would
+ * make sense.
+ *
+ * FIXME: For now we don't pass the profiling interrupts to the guest,
+ * and instead require profiling be run in the host; we should be able
+ * to support guest-level profiling pretty easily, but we need to
+ * think about whether there are vcpu migration issues there.
+ */
+static void kvm_grant_mpls(void)
+{
+	__insn_mtspr(SPR_MPL_SWINT_1_SET_1, 1);
+	__insn_mtspr(SPR_MPL_ILL_SET_1, 1);
+	__insn_mtspr(SPR_MPL_GPV_SET_1, 1);
+	__insn_mtspr(SPR_MPL_ILL_TRANS_SET_1, 1);
+	__insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_1, 1);
+}
+
+static void kvm_ungrant_mpls(void)
+{
+	__insn_mtspr(SPR_MPL_SWINT_1_SET_2, 1);
+	__insn_mtspr(SPR_MPL_ILL_SET_2, 1);
+	__insn_mtspr(SPR_MPL_GPV_SET_2, 1);
+	__insn_mtspr(SPR_MPL_ILL_TRANS_SET_2, 1);
+	__insn_mtspr(SPR_MPL_UNALIGN_DATA_SET_2, 1);
+}
+
+/*
+ * There is a lot of state that is (in the non-virtualized case) held
+ * permanently in SPRs, or that is in any case not context-switched.
+ * The next two routines switch in and out all of that SPR state.
+ *
+ * On restore we adjust the timer value so that it fires at the correct
+ * wall-clock time even if we have been scheduled out for a little bit.
+ * This may also mean it fires immediately on return, and the guest
+ * suffers a timer delay.
+ */
+static void kvm_save_sprs(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.timer_control = __insn_mfspr(SPR_AUX_TILE_TIMER_CONTROL);
+	vcpu->arch.vmexit_cycles = get_cycles();
+
+#define SAVE_SPR(x) vcpu->arch.x = __insn_mfspr(SPR_ ## x)
+	FOR_EACH_GUEST_SPR(SAVE_SPR);
+#undef SAVE_SPR
+}
+
+static void kvm_restore_sprs(struct kvm_vcpu *vcpu)
+{
+	unsigned long count = vcpu->arch.timer_control;
+	unsigned long underflow =
+		(count >> SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT) & 1;
+	unsigned long disabled =
+		(count >> SPR_AUX_TILE_TIMER_CONTROL__DISABLE_SHIFT) & 1;
+
+	if (!disabled) {
+		unsigned long delta = get_cycles() - vcpu->arch.vmexit_cycles;
+		count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+		underflow |= delta > count;
+		count -= delta;
+		count &= SPR_AUX_TILE_TIMER_CONTROL__COUNT_MASK;
+		count |= (underflow << SPR_AUX_TILE_TIMER_CONTROL__UNDERFLOW_SHIFT);
+	}
+	__insn_mtspr(SPR_AUX_TILE_TIMER_CONTROL, count);
+
+#define RESTORE_SPR(x) __insn_mtspr(SPR_ ## x, vcpu->arch.x)
+	FOR_EACH_GUEST_SPR(RESTORE_SPR);
+#undef RESTORE_SPR
+}
+
+/*
+ * When entering the guest, we need to eliminate any PL0 translations
+ * that were in use by qemu, since the guest's PL0 translations will
+ * be different.  We also flush PL1 translations in case there have
+ * been changes to the virtualization page table, etc.
+ *
+ * FIXME: Add a way to just flush PL0/PL1, or just flush below
+ * the host PAGE_OFFSET, or add vpid support, etc.
+ */
+static void kvm_guest_context_enter(struct kvm_vcpu *vcpu)
+{
+	HV_Context *ctx;
+	pgd_t *vpgdir;
+	pte_t *ptep;
+	int rc;
+
+	/* Install virtualization context */
+	vpgdir = vcpu->kvm->arch.vpgd;
+	BUG_ON(vpgdir == NULL);
+	ptep = virt_to_pte(NULL, (unsigned long)vpgdir);
+	rc = hv_install_virt_context(__pa(vpgdir), *ptep, 0, 0);
+	WARN_ON_ONCE(rc < 0);
+
+	/* Install guest context */
+	ctx = &vcpu->arch.guest_context;
+	rc = hv_install_guest_context(ctx->page_table, ctx->access,
+				      ctx->asid, ctx->flags);
+	WARN_ONCE(rc < 0, "install_guest_context(%#llx,%#llx,%#x,%#x): %d\n",
+		  ctx->page_table, ctx->access.val,
+		  ctx->asid, ctx->flags, rc);
+
+	hv_flush_all(0);
+}
+
+/*
+ * De-install the virtualization context so we take faults below the
+ * host Linux PL in the normal manner going forward.
+ *
+ * We flush all the TLB mappings as we exit the guest, since the
+ * guest has been using the ASIDs as it pleases, and may have installed
+ * incompatible mappings for qemu's process as well.  Note that we don't
+ * worry about host-PL interrupts that occur while the guest is running,
+ * on the assumption that such interrupts can't touch userspace
+ * addresses legally anyway.
+ *
+ * NOTE: we may want to add a hypervisor call to just flush mappings
+ * below PL2 and use that here instead.
+ */
+static void kvm_guest_context_exit(struct kvm_vcpu *vcpu)
+{
+	int rc;
+
+	/* Remember guest context */
+	vcpu->arch.guest_context = hv_inquire_guest_context();
+
+	/* Disable virtualization context */
+	rc = hv_install_virt_context(HV_CTX_NONE, hv_pte(0), 0, 0);
+	WARN_ON_ONCE(rc < 0);
+
+	/* Flush everything in the TLB. */
+	hv_flush_all(0);
+}
+
+static void kvm_inject_interrupts(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Capture current set of ipi_events.  We might race with
+	 * another thread adding an event, but if so we'll just miss
+	 * it on this go-around and see it next time.
+	 */
+	vcpu->arch.IPI_EVENT_1 |= __insn_exch(&vcpu->arch.ipi_events, 0);
+
+	/*
+	 * Note: We could set PC and EX1 for the guest os to jump
+	 * directly to the INT_MESSAGE_RCV_DWNCL handler if the interrupt
+	 * is unmasked and the guest is not at PL1 with ICS set.
+	 * But in fact it's about as fast to just set INTCTRL_1_STATUS
+	 * here and then run the short INTCTRL_1 handler in the guest.
+	 */
+	vcpu->arch.INTCTRL_1_STATUS = (vcpu->arch.pending_msgs != 0);
+}
+
+static void kvm_tile_run(struct kvm_vcpu *vcpu)
+{
+	struct thread_info *ti = current_thread_info();
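+	/* Save the host SYSTEM_SAVE_K_0; restored after the vmexit below. */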
+	unsigned long prev_k_0 = __insn_mfspr(SPR_SYSTEM_SAVE_K_0);
+
+	/*
+	 * Disable interrupts while we set up the guest state.
+	 * This way, if we race with another core trying to tell us
+	 * to fix up our guest state, we will take the kick only as
+	 * we actually try to enter the guest, and instead we will
+	 * vmexit and end up retrying.
+	 */
+	local_irq_disable();
+	kvm_guest_context_enter(vcpu);
+	clear_bit(KVM_REQ_KICK, &vcpu->requests);
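+	/* Allow kicks from other cpus to interrupt us and force a vmexit. */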
+	ti->vcpu = vcpu;
+	vcpu->cpu = get_cpu();
+	kvm_inject_interrupts(vcpu);
+	kvm_grant_mpls();
+	kvm_restore_sprs(vcpu);
+
+	/* Calling this function irets into the guest. */
+	kvm_vmresume(&vcpu->arch.regs, &vcpu->arch.host_sp);
+
+	/* We resume here due to a call to kvm_vmexit. */
+	__insn_mtspr(SPR_SYSTEM_SAVE_K_0, prev_k_0);
+
+	vcpu->cpu = -1;
+	put_cpu();
+	ti->vcpu = NULL;
+	set_bit(KVM_REQ_KICK, &vcpu->requests);
+	vcpu->run->ready_for_interrupt_injection = 1;
+	kvm_ungrant_mpls();
+	kvm_save_sprs(vcpu);
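+	/* Mask all PL1 interrupts while we are back in the host. */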
+	__insn_mtspr(SPR_INTERRUPT_MASK_1, -1UL);
+	kvm_guest_context_exit(vcpu);
+	local_irq_enable();
+}
+
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int r = 1;
+
+	while (r > 0) {
+		kvm_guest_enter();
+		kvm_tile_run(vcpu);
+		kvm_guest_exit();
+
+		r = kvm_handle_exit(vcpu);
+		/*
+		 * <0: error for userspace.
+		 * =0: QEMU to handle.
+		 * >0: host os can handle it fully.
+		 */
+		if (r <= 0)
+			break;
+
+		if (signal_pending(current)) {
+			vcpu->run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+			break;
+		}
+
+#ifdef CONFIG_HOMECACHE
+		if (current_thread_info()->homecache_cpu !=
+		    smp_processor_id()) {
+			/* Do homecache migration when returning to qemu. */
+			vcpu->run->exit_reason = KVM_EXIT_INTR;
+			r = -EINTR;
+			break;
+		}
+#endif
+
+		kvm_resched(vcpu);
+	}
+
+	return r;
+}
+
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	int r;
+	sigset_t sigsaved;
+
+	/* Secondary cpus must wait until they are told they can start. */
+	if (vcpu->arch.suspended) {
+		struct completion *c = &vcpu->kvm->arch.smp_start;
+		if (wait_for_completion_interruptible(c))
+			return -EINTR;
+		vcpu->arch.suspended = 0;
+	}
+
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
+
+	r = __vcpu_run(vcpu, kvm_run);
+
+	if (vcpu->sigset_active)
+		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+
+	return r;
+}
+
+int kvm_arch_init(void *opaque)
+{
+	return 0;
+}
+
+void kvm_arch_exit(void)
+{
+}
+
+int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	int i;
+	unsigned long resv_gfn_start;
+	struct kvm_memory_slot *s;
+	struct kvm *kvm = vcpu->kvm;
+
+	if (!kvm->arch.resv_gpa_start) {
+		resv_gfn_start = 0;
+
+		for (i = 0; i < KVM_USER_MEM_SLOTS; i++) {
+			s = &kvm->memslots->memslots[i];
+
+			if (!s->npages)
+				continue;
+
+			if ((s->base_gfn + s->npages) > resv_gfn_start)
+				resv_gfn_start = s->base_gfn + s->npages;
+		}
+
+		kvm->arch.resv_gpa_start = PFN_PHYS(resv_gfn_start);
+	}
+
+	/* Initialize to enter fake PA=VA mode in hypervisor. */
+	vcpu->arch.guest_context.page_table = HV_CTX_NONE;
+
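+	/* Each vcpu gets one page of the reserved GPA range for its IPIs. */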
+	vcpu->arch.ipi_gpa =
+		kvm->arch.resv_gpa_start + (vcpu->vcpu_id * PAGE_SIZE);
+	vcpu->arch.ipi_gpte =
+		pfn_pte(PFN_DOWN(vcpu->arch.ipi_gpa), PAGE_KERNEL);
+
+	/* Mark the core suspended if it is not the boot cpu. */
+	vcpu->arch.suspended = is_secondary_vcpu(vcpu);
+
+	return 0;
+}
+
+void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+}
+
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	/* Notify simulator that this task handles this vcpu. */
+	sim_set_vcpu(vcpu->vcpu_id);
+}
+
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+	sim_clear_vcpu();
+}
+
+struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
+{
+	/* FIXME: some archs set up a cache for these structs? */
+	struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
+	int rc;
+
+	if (!vcpu)
+		return ERR_PTR(-ENOMEM);
+
+	rc = kvm_vcpu_init(vcpu, kvm, id);
+	if (rc) {
+		kfree(vcpu);
+		return ERR_PTR(rc);
+	}
+
+	return vcpu;
+}
+
+int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	memset(&vcpu->arch.regs, 0, sizeof(struct pt_regs));
+	return 0;
+}
+
+int kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
+{
+	kvm_vcpu_uninit(vcpu);
+	kfree(vcpu);
+}
+
+void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+{
+	return kvm_arch_vcpu_destroy(vcpu);
+}
+
+int kvm_arch_hardware_enable(void *garbage)
+{
+	return 0;
+}
+
+void kvm_arch_hardware_disable(void *garbage)
+{
+}
+
+int kvm_arch_hardware_setup(void)
+{
+	return 0;
+}
+
+void kvm_arch_hardware_unsetup(void)
+{
+}
+
+void kvm_arch_check_processor_compat(void *rtn)
+{
+}
+
+int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
+{
+	if (type)
+		return -EINVAL;
+
+	init_completion(&kvm->arch.smp_start);
+	return 0;
+}
+
+void kvm_arch_destroy_vm(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		kvm_arch_vcpu_free(vcpu);
+
+	/* Seems to be unnecessary? */
+	mutex_lock(&kvm->lock);
+	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
+		kvm->vcpus[i] = NULL;
+
+	atomic_set(&kvm->online_vcpus, 0);
+	mutex_unlock(&kvm->lock);
+
+	/* FIXME: release all the pmds and ptes as well! */
+	if (kvm->arch.vpgd)
+		pgd_free(kvm->mm, kvm->arch.vpgd);
+}
+
+void kvm_arch_sync_events(struct kvm *kvm)
+{
+}
+
+int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
+{
+	return 0;
+}
+
+/* Called from guest hv glue via swint0 traps. */
+void kvm_do_hypervisor_call(struct pt_regs *regs, int fault_num)
+{
+	/* Hypercalls are only valid from PL1. */
+	if (EX1_PL(regs->ex1) != 0) {
+		kvm_trigger_vmexit(regs, KVM_EXIT_HYPERCALL);
+		/*NORETURN*/
+	}
+	do_trap(regs, fault_num, 0);
+}
+
+void kvm_do_vpgtable_miss(struct pt_regs *regs, int fault_num,
+			  unsigned long fault_addr, unsigned long write)
+{
+	struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+	BUG_ON(vcpu == NULL);
+	vcpu->arch.fault_addr = fault_addr;
+	kvm_trigger_vmexit(regs, KVM_EXIT_MMIO);
+	/*NORETURN*/
+}
+
+void kvm_do_vguest_fatal(struct pt_regs *regs, int fault_num)
+{
+	kvm_trigger_vmexit(regs, KVM_EXIT_SHUTDOWN);
+	/*NORETURN*/
+}
+
+void kvm_trigger_vmexit(struct pt_regs *regs, int exit_reason)
+{
+	struct kvm_vcpu *vcpu = current_thread_info()->vcpu;
+	vcpu->run->exit_reason = exit_reason;
+	vcpu->arch.regs = *regs;
+	vcpu->arch.regs.flags = PT_FLAGS_CALLER_SAVES | PT_FLAGS_RESTORE_REGS;
+	kvm_vmexit(vcpu->arch.host_sp);
+	/*NORETURN*/
+}
+
+static int __init kvm_tile_init(void)
+{
+	return kvm_init(NULL, sizeof(struct kvm_vcpu),
+			__alignof__(struct kvm_vcpu), THIS_MODULE);
+}
+
+static void __exit kvm_tile_exit(void)
+{
+	kvm_exit();
+}
+
+module_init(kvm_tile_init);
+module_exit(kvm_tile_exit);
diff --git a/arch/tile/lib/exports.c b/arch/tile/lib/exports.c
index 82733c8..1590282 100644
--- a/arch/tile/lib/exports.c
+++ b/arch/tile/lib/exports.c
@@ -50,18 +50,26 @@ EXPORT_SYMBOL(__copy_in_user_inatomic);
 
 /* hypervisor glue */
 #include <hv/hypervisor.h>
+EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_dev_close);
 EXPORT_SYMBOL(hv_dev_open);
+EXPORT_SYMBOL(hv_dev_poll);
+EXPORT_SYMBOL(hv_dev_poll_cancel);
 EXPORT_SYMBOL(hv_dev_pread);
-EXPORT_SYMBOL(hv_dev_pwrite);
 EXPORT_SYMBOL(hv_dev_preada);
+EXPORT_SYMBOL(hv_dev_pwrite);
 EXPORT_SYMBOL(hv_dev_pwritea);
-EXPORT_SYMBOL(hv_dev_poll);
-EXPORT_SYMBOL(hv_dev_poll_cancel);
-EXPORT_SYMBOL(hv_dev_close);
-EXPORT_SYMBOL(hv_sysconf);
-EXPORT_SYMBOL(hv_confstr);
+EXPORT_SYMBOL(hv_flush_all);
 EXPORT_SYMBOL(hv_get_rtc);
+#ifdef __tilegx__
+EXPORT_SYMBOL(hv_inquire_guest_context);
+EXPORT_SYMBOL(hv_install_guest_context);
+EXPORT_SYMBOL(hv_install_virt_context);
+#endif
+EXPORT_SYMBOL(hv_physaddr_read64);
+EXPORT_SYMBOL(hv_physaddr_write64);
 EXPORT_SYMBOL(hv_set_rtc);
+EXPORT_SYMBOL(hv_sysconf);
 
 /* libgcc.a */
 uint32_t __udivsi3(uint32_t dividend, uint32_t divisor);
diff --git a/arch/tile/mm/elf.c b/arch/tile/mm/elf.c
index 23f044e..86cff48 100644
--- a/arch/tile/mm/elf.c
+++ b/arch/tile/mm/elf.c
@@ -42,7 +42,9 @@ static int notify_exec(struct mm_struct *mm)
 	char *buf, *path;
 	struct vm_area_struct *vma;
 
+#ifndef CONFIG_KVM_GUEST   /* see notify_sim_task_change() */
 	if (!sim_is_simulator())
+#endif
 		return 1;
 
 	if (mm->exe_file == NULL)
diff --git a/arch/tile/mm/fault.c b/arch/tile/mm/fault.c
index 64eec3f..39c48cb 100644
--- a/arch/tile/mm/fault.c
+++ b/arch/tile/mm/fault.c
@@ -283,7 +283,7 @@ static int handle_page_fault(struct pt_regs *regs,
 	flags = (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
 		 (write ? FAULT_FLAG_WRITE : 0));
 
-	is_kernel_mode = (EX1_PL(regs->ex1) != USER_PL);
+	is_kernel_mode = !user_mode(regs);
 
 	tsk = validate_current();
 
@@ -824,7 +824,7 @@ void do_page_fault(struct pt_regs *regs, int fault_num,
 	}
 
 #if CHIP_HAS_TILE_DMA() || CHIP_HAS_SN_PROC()
-	if (EX1_PL(regs->ex1) != USER_PL) {
+	if (!user_mode(regs)) {
 		struct async_tlb *async;
 		switch (fault_num) {
 #if CHIP_HAS_TILE_DMA()
diff --git a/arch/tile/mm/init.c b/arch/tile/mm/init.c
index 3bfa127..c6d2160 100644
--- a/arch/tile/mm/init.c
+++ b/arch/tile/mm/init.c
@@ -234,7 +234,7 @@ static pgprot_t __init init_pgprot(ulong address)
 {
 	int cpu;
 	unsigned long page;
-	enum { CODE_DELTA = MEM_SV_INTRPT - PAGE_OFFSET };
+	enum { CODE_DELTA = MEM_SV_START - PAGE_OFFSET };
 
 #if CHIP_HAS_CBOX_HOME_MAP()
 	/* For kdata=huge, everything is just hash-for-home. */
@@ -538,7 +538,7 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base)
 		}
 	}
 
-	address = MEM_SV_INTRPT;
+	address = MEM_SV_START;
 	pmd = get_pmd(pgtables, address);
 	pfn = 0;  /* code starts at PA 0 */
 	if (ktext_small) {
@@ -1021,7 +1021,7 @@ static void free_init_pages(char *what, unsigned long begin, unsigned long end)
 
 void free_initmem(void)
 {
-	const unsigned long text_delta = MEM_SV_INTRPT - PAGE_OFFSET;
+	const unsigned long text_delta = MEM_SV_START - PAGE_OFFSET;
 
 	/*
 	 * Evict the dirty initdata on the boot cpu, evict the w1data
@@ -1040,7 +1040,7 @@ void free_initmem(void)
 
 	/*
 	 * Free the pages mapped from 0xc0000000 that correspond to code
-	 * pages from MEM_SV_INTRPT that we won't use again after init.
+	 * pages from MEM_SV_START that we won't use again after init.
 	 */
 	free_init_pages("unused kernel text",
 			(unsigned long)_sinittext - text_delta,
diff --git a/arch/tile/mm/pgtable.c b/arch/tile/mm/pgtable.c
index 3004433..d6948d4 100644
--- a/arch/tile/mm/pgtable.c
+++ b/arch/tile/mm/pgtable.c
@@ -486,25 +486,18 @@ void check_mm_caching(struct mm_struct *prev, struct mm_struct *next)
 
 #if CHIP_HAS_MMIO()
 
-/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
-void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
-			   pgprot_t home)
+void *generic_remap_prot(resource_size_t phys_addr, unsigned long size,
+		    unsigned long flags, pgprot_t prot)
 {
 	void *addr;
 	struct vm_struct *area;
 	unsigned long offset, last_addr;
-	pgprot_t pgprot;
 
 	/* Don't allow wraparound or zero size */
 	last_addr = phys_addr + size - 1;
 	if (!size || last_addr < phys_addr)
 		return NULL;
 
-	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
-	pgprot = PAGE_KERNEL;
-	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
-	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
-
 	/*
 	 * Mappings have to be page-aligned
 	 */
@@ -515,17 +508,35 @@ void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
 	/*
 	 * Ok, go for it..
 	 */
-	area = get_vm_area(size, VM_IOREMAP /* | other flags? */);
+	area = get_vm_area(size, flags);
 	if (!area)
 		return NULL;
 	area->phys_addr = phys_addr;
 	addr = area->addr;
 	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-			       phys_addr, pgprot)) {
+			       phys_addr, prot)) {
 		free_vm_area(area);
 		return NULL;
 	}
-	return (__force void __iomem *) (offset + (char *)addr);
+	return (void *) (offset + (char *)addr);
+}
+EXPORT_SYMBOL(generic_remap_prot);
+
+/* Map an arbitrary MMIO address, homed according to pgprot, into VA space. */
+void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
+			   pgprot_t home)
+{
+	pgprot_t pgprot;
+	unsigned long flags;
+
+	/* Create a read/write, MMIO VA mapping homed at the requested shim. */
+	pgprot = PAGE_KERNEL;
+	pgprot = hv_pte_set_mode(pgprot, HV_PTE_MODE_MMIO);
+	pgprot = hv_pte_set_lotar(pgprot, hv_pte_get_lotar(home));
+	flags = VM_IOREMAP; /* | other flags? */
+
+	return (__force void __iomem *) generic_remap_prot(phys_addr,
+							   size, flags, pgprot);
 }
 EXPORT_SYMBOL(ioremap_prot);
 
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index acccd08..d3879c5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -171,6 +171,7 @@ struct kvm_pit_config {
 #define KVM_EXIT_WATCHDOG         21
 #define KVM_EXIT_S390_TSCH        22
 #define KVM_EXIT_EPR              23
+#define KVM_EXIT_AGAIN            24
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -1012,6 +1013,8 @@ struct kvm_s390_ucas_mapping {
 #define KVM_KVMCLOCK_CTRL	  _IO(KVMIO,   0xad)
 #define KVM_ARM_VCPU_INIT	  _IOW(KVMIO,  0xae, struct kvm_vcpu_init)
 #define KVM_GET_REG_LIST	  _IOWR(KVMIO, 0xb0, struct kvm_reg_list)
+/* Reset some SPR registers for tilegx */
+#define KVM_TILE_RESET_SPR	  _IO(KVMIO,  0xa8)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 #define KVM_DEV_ASSIGN_PCI_2_3		(1 << 1)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1580dd4..1b8a1f1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1691,7 +1691,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	finish_wait(&vcpu->wq, &wait);
 }
 
-#ifndef CONFIG_S390
+#if !defined(CONFIG_S390) && !defined(CONFIG_TILE)
 /*
  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
  */
@@ -1714,7 +1714,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 	put_cpu();
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
-#endif /* !CONFIG_S390 */
+#endif
 
 void kvm_resched(struct kvm_vcpu *vcpu)
 {
@@ -1978,7 +1978,8 @@ static long kvm_vcpu_ioctl(struct file *filp,
 	if (vcpu->kvm->mm != current->mm)
 		return -EIO;
 
-#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS)
+#if defined(CONFIG_S390) || defined(CONFIG_PPC) || defined(CONFIG_MIPS) || \
+	defined(CONFIG_TILEGX)
 	/*
 	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
 	 * so vcpu_load() would break it.
-- 
1.8.3.1

