[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20210518004807.258503-1-sathyanarayanan.kuppuswamy@linux.intel.com>
Date: Mon, 17 May 2021 17:48:07 -0700
From: Kuppuswamy Sathyanarayanan
<sathyanarayanan.kuppuswamy@...ux.intel.com>
To: Peter Zijlstra <peterz@...radead.org>,
Andy Lutomirski <luto@...nel.org>,
Dave Hansen <dave.hansen@...el.com>
Cc: Tony Luck <tony.luck@...el.com>, Andi Kleen <ak@...ux.intel.com>,
Kirill Shutemov <kirill.shutemov@...ux.intel.com>,
Kuppuswamy Sathyanarayanan <knsathya@...nel.org>,
Dan Williams <dan.j.williams@...el.com>,
Raj Ashok <ashok.raj@...el.com>,
Sean Christopherson <seanjc@...gle.com>,
linux-kernel@...r.kernel.org,
Kuppuswamy Sathyanarayanan
<sathyanarayanan.kuppuswamy@...ux.intel.com>
Subject: [RFC v2-fix 1/1] x86/tdx: Handle in-kernel MMIO
From: "Kirill A. Shutemov" <kirill.shutemov@...ux.intel.com>
In traditional VMs, MMIO tends to be implemented by giving the
guest access to a mapping which causes a VMEXIT on access.
That is not possible in a TDX guest, so #VE is used to implement
MMIO support instead: in a TDX guest, MMIO triggers a #VE with the
EPT_VIOLATION exit reason.
For now we only handle a subset of instructions that the kernel
uses for MMIO operations. User-space access triggers SIGBUS.
The reasons for supporting #VE-based MMIO in a TDX guest are:
* MMIO is widely used and we'll have more drivers in the future.
* We don't want to annotate every TDX specific MMIO readl/writel etc.
* If we didn't annotate we would need to add an alternative to every
MMIO access in the kernel (even though 99.9% will never be used on
TDX) which would be a complete waste and incredible binary bloat
for nothing.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@...ux.intel.com>
Reviewed-by: Andi Kleen <ak@...ux.intel.com>
Signed-off-by: Kuppuswamy Sathyanarayanan <sathyanarayanan.kuppuswamy@...ux.intel.com>
---
Changes since RFC v2:
* Fixed commit log as per Dave's review.
arch/x86/kernel/tdx.c | 100 ++++++++++++++++++++++++++++++++++++++++++
1 file changed, 100 insertions(+)
diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
index b9e3010987e0..9330c7a9ad69 100644
--- a/arch/x86/kernel/tdx.c
+++ b/arch/x86/kernel/tdx.c
@@ -5,6 +5,8 @@
#include <asm/tdx.h>
#include <asm/vmx.h>
+#include <asm/insn.h>
+#include <linux/sched/signal.h> /* force_sig_fault() */
#include <linux/cpu.h>
#include <linux/protected_guest.h>
@@ -209,6 +211,101 @@ static void tdg_handle_io(struct pt_regs *regs, u32 exit_qual)
}
}
+/*
+ * Ask the host (via a TDVMCALL with the EPT_VIOLATION exit reason) to
+ * emulate a single MMIO access of @size bytes at guest physical
+ * address @addr.  For writes, @val carries the data to store; the
+ * return value is the data read back (only meaningful when !@write).
+ */
+static unsigned long tdg_mmio(int size, bool write, unsigned long addr,
+ unsigned long val)
+{
+ return tdx_hypercall_out_r11(EXIT_REASON_EPT_VIOLATION, size,
+ write, addr, val);
+}
+
+/*
+ * Return a pointer to the pt_regs slot for the GPR selected by the
+ * decoded instruction's ModRM.reg field (extended to r8-r15 by REX.R).
+ * The table order mirrors the hardware register encoding rAX=0 ... rDI=7.
+ * NOTE(review): no bounds/validity check on the decoded instruction is
+ * done here; callers must only pass instructions known to have ModRM.
+ */
+static inline void *get_reg_ptr(struct pt_regs *regs, struct insn *insn)
+{
+ static const int regoff[] = {
+ offsetof(struct pt_regs, ax),
+ offsetof(struct pt_regs, cx),
+ offsetof(struct pt_regs, dx),
+ offsetof(struct pt_regs, bx),
+ offsetof(struct pt_regs, sp),
+ offsetof(struct pt_regs, bp),
+ offsetof(struct pt_regs, si),
+ offsetof(struct pt_regs, di),
+ offsetof(struct pt_regs, r8),
+ offsetof(struct pt_regs, r9),
+ offsetof(struct pt_regs, r10),
+ offsetof(struct pt_regs, r11),
+ offsetof(struct pt_regs, r12),
+ offsetof(struct pt_regs, r13),
+ offsetof(struct pt_regs, r14),
+ offsetof(struct pt_regs, r15),
+ };
+ int regno;
+
+ /* ModRM.reg selects the low 3 bits; REX.R supplies bit 3. */
+ regno = X86_MODRM_REG(insn->modrm.value);
+ if (X86_REX_R(insn->rex_prefix.value))
+ regno += 8;
+
+ return (void *)regs + regoff[regno];
+}
+
+/*
+ * Emulate the in-kernel MMIO instruction that triggered an
+ * EPT_VIOLATION #VE.  Decodes the faulting instruction at regs->ip,
+ * forwards the access to the host via tdg_mmio(), and returns the
+ * instruction length so the caller can advance RIP past it (0 for the
+ * user-mode SIGBUS case, where RIP must not be advanced).
+ */
+static int tdg_handle_mmio(struct pt_regs *regs, struct ve_info *ve)
+{
+ int size;
+ bool write;
+ unsigned long *reg;
+ struct insn insn;
+ unsigned long val = 0;
+
+ /*
+ * User mode would mean the kernel exposed a device directly
+ * to ring3, which shouldn't happen except for things like
+ * DPDK.
+ */
+ if (user_mode(regs)) {
+ pr_err("Unexpected user-mode MMIO access.\n");
+ force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *) ve->gla);
+ return 0;
+ }
+
+ /*
+ * Decode the faulting instruction in place.
+ * NOTE(review): this reads kernel text directly at regs->ip and
+ * assumes it is mapped and decodable — confirm no _nofault copy
+ * is needed here.
+ */
+ kernel_insn_init(&insn, (void *) regs->ip, MAX_INSN_SIZE);
+ insn_get_length(&insn);
+ insn_get_opcode(&insn);
+
+ /* Exit-qualification bit 1 set => the access was a write (SDM). */
+ write = ve->exit_qual & 0x2;
+
+ /* Operand size from the decoder, overridden for the 8-bit MOVs. */
+ size = insn.opnd_bytes;
+ switch (insn.opcode.bytes[0]) {
+ /* MOV r/m8 r8 */
+ case 0x88:
+ /* MOV r8 r/m8 */
+ case 0x8A:
+ /* MOV r/m8 imm8 */
+ case 0xC6:
+ size = 1;
+ break;
+ }
+
+ /* Immediate-operand forms can only be MMIO stores. */
+ if (inat_has_immediate(insn.attr)) {
+ BUG_ON(!write);
+ val = insn.immediate.value;
+ tdg_mmio(size, write, ve->gpa, val);
+ return insn.length;
+ }
+
+ /* Remaining supported forms move between a GPR and memory. */
+ BUG_ON(!inat_has_modrm(insn.attr));
+
+ reg = get_reg_ptr(regs, &insn);
+
+ if (write) {
+ /* Copy only @size bytes so wider register contents don't leak. */
+ memcpy(&val, reg, size);
+ tdg_mmio(size, write, ve->gpa, val);
+ } else {
+ val = tdg_mmio(size, write, ve->gpa, val);
+ /*
+ * NOTE(review): the memset is redundant — the memcpy below
+ * overwrites the same @size bytes; and only the low @size
+ * bytes of the destination register are updated, so 32-bit
+ * loads do not zero-extend to 64 bits as hardware would.
+ */
+ memset(reg, 0, size);
+ memcpy(reg, &val, size);
+ }
+ return insn.length;
+}
+
unsigned long tdg_get_ve_info(struct ve_info *ve)
{
u64 ret;
@@ -258,6 +355,9 @@ int tdg_handle_virtualization_exception(struct pt_regs *regs,
case EXIT_REASON_IO_INSTRUCTION:
tdg_handle_io(regs, ve->exit_qual);
break;
+ case EXIT_REASON_EPT_VIOLATION:
+ ve->instr_len = tdg_handle_mmio(regs, ve);
+ break;
default:
pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
return -EFAULT;
--
2.25.1
Powered by blists - more mailing lists