Message-ID: <20251223054806.1611168-6-jon@nutanix.com>
Date: Mon, 22 Dec 2025 22:47:58 -0700
From: Jon Kohler <jon@...anix.com>
To: seanjc@...gle.com, pbonzini@...hat.com, tglx@...utronix.de,
        mingo@...hat.com, bp@...en8.de, dave.hansen@...ux.intel.com,
        x86@...nel.org, hpa@...or.com, kvm@...r.kernel.org,
        linux-kernel@...r.kernel.org, Jonathan Corbet <corbet@....net>,
        Jon Kohler <jon@...anix.com>,
        linux-doc@...r.kernel.org (open list:DOCUMENTATION)
Cc: ken@...elabs.ch, Alexander.Grest@...rosoft.com, chao.gao@...el.com,
        madvenka@...ux.microsoft.com, mic@...ikod.net, nsaenz@...zon.es,
        tao1.su@...ux.intel.com, xiaoyao.li@...el.com, zhao1.liu@...el.com
Subject: [PATCH 5/8] KVM: x86/mmu: bootstrap support for Intel MBEC

Extend the kvm_mmu_page_role access bitfield from 3 bits to 4, where
the 4th bit tracks user-executable pages under Intel mode-based
execute control (MBEC).

Extend SPTE generation and introduce a shadow_ux_mask value to account
for the user vs. kernel executable distinction under MBEC.
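
Condensed, the execute-bit selection in make_spte() becomes the
following (a sketch of the spte.c hunk below, not new code):

  if (pte_access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) {
          if (pte_access & ACC_EXEC_MASK)         /* supervisor execute */
                  spte |= shadow_x_mask;
          if (pte_access & ACC_USER_EXEC_MASK)    /* user execute (MBEC) */
                  spte |= shadow_ux_mask;
  } else {
          spte |= shadow_nx_mask;
  }

A follow-up guard then drops the user-execute bit whenever read access
is not granted, since read-clear plus bit-10-set is an EPT
misconfiguration.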

Extend kvm_mmu_page_role to include a new has_mbec field, so that a
given MMU can be flagged as MBEC-aware, which is to say one whose
access bitfield carries the additional user-execute bit.
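
Because has_mbec is part of the page role, shadow pages created with
and without MBEC semantics are never shared. Callers query the flag
through a small helper on the root role (as added in mmu.h below):

  static inline bool mmu_has_mbec(struct kvm_vcpu *vcpu)
  {
          return vcpu->arch.mmu->root_role.has_mbec;
  }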

Modify various functions to use the new access masks, ensuring
compatibility with MBEC.

Update mmu documentation to clarify the role of execute permissions when
MBEC is enabled.

Add a cpu_has_vmx_mode_based_ept_exec() capability helper and plumb it
into kvm_mmu_set_ept_masks().
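
In condensed form (mirroring the capabilities.h, vmx.c and spte.c hunks
below), the detection and plumbing end up as:

  /* vmx.c: pass MBEC support down when EPT is enabled. */
  if (enable_ept)
          kvm_mmu_set_ept_masks(enable_ept_ad_bits,
                                cpu_has_vmx_ept_execute_only(),
                                cpu_has_vmx_mode_based_ept_exec());

  /* spte.c: only advertise a user-execute mask when MBEC is present. */
  shadow_ux_mask = has_mbec ? VMX_EPT_USER_EXECUTABLE_MASK : 0ull;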

NOTE: Also roll back the change to ACC_ALL vs ACC_RWX in mmu.c, as VMs
do not boot properly; this likely needs to be reworked, open to ideas!

Past that, no functional change intended, as nothing sets has_mbec yet.

Signed-off-by: Jon Kohler <jon@...anix.com>
---
 Documentation/virt/kvm/x86/mmu.rst |  9 ++++-
 arch/x86/include/asm/kvm_host.h    | 11 ++---
 arch/x86/include/asm/vmx.h         |  4 ++
 arch/x86/kvm/mmu.h                 |  8 +++-
 arch/x86/kvm/mmu/mmu.c             | 24 +++++------
 arch/x86/kvm/mmu/mmutrace.h        | 23 +++++++----
 arch/x86/kvm/mmu/paging_tmpl.h     | 24 +++++++----
 arch/x86/kvm/mmu/spte.c            | 65 +++++++++++++++++++++++-------
 arch/x86/kvm/mmu/spte.h            | 57 +++++++++++++++++++++++---
 arch/x86/kvm/mmu/tdp_mmu.c         | 12 ++++--
 arch/x86/kvm/vmx/capabilities.h    |  6 +++
 arch/x86/kvm/vmx/vmx.c             |  3 +-
 12 files changed, 188 insertions(+), 58 deletions(-)

diff --git a/Documentation/virt/kvm/x86/mmu.rst b/Documentation/virt/kvm/x86/mmu.rst
index 2b3b6d442302..f63db1a2b3df 100644
--- a/Documentation/virt/kvm/x86/mmu.rst
+++ b/Documentation/virt/kvm/x86/mmu.rst
@@ -172,7 +172,14 @@ Shadow pages contain the following information:
     quadrant maps 1GB virtual address space.
   role.access:
     Inherited guest access permissions from the parent ptes in the form uwx.
-    Note execute permission is positive, not negative.
+    Note execute permission is positive, not negative. When Intel MBEC is
+    not enabled, permissions follow the uwx form. When Intel MBEC is enabled,
+    execute is split into two permissions, kernel executable and user
+    executable, with the split controlled by role.has_mbec.
+  role.has_mbec:
+    When role.has_mbec=1, Intel mode-based execute control is active, which
+    gives the guest the ability to split execute pages into two permissions,
+    kernel executable and user executable.
   role.invalid:
     The page is invalid and should not be used.  It is a root page that is
     currently pinned (by a cpu hardware register pointing to it); once it is
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 48598d017d6f..66afcff43ec5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -319,11 +319,11 @@ struct kvm_kernel_irq_routing_entry;
  * the number of unique SPs that can theoretically be created is 2^n, where n
  * is the number of bits that are used to compute the role.
  *
- * But, even though there are 20 bits in the mask below, not all combinations
+ * But, even though there are 22 bits in the mask below, not all combinations
  * of modes and flags are possible:
  *
  *   - invalid shadow pages are not accounted, mirror pages are not shadowed,
- *     so the bits are effectively 18.
+ *     so the bits are effectively 20.
  *
  *   - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging);
  *     execonly and ad_disabled are only used for nested EPT which has
@@ -338,7 +338,7 @@ struct kvm_kernel_irq_routing_entry;
  *     cr0_wp=0, therefore these three bits only give rise to 5 possibilities.
  *
  * Therefore, the maximum number of possible upper-level shadow pages for a
- * single gfn is a bit less than 2^13.
+ * single gfn is a bit less than 2^15.
  */
 union kvm_mmu_page_role {
 	u32 word;
@@ -347,7 +347,7 @@ union kvm_mmu_page_role {
 		unsigned has_4_byte_gpte:1;
 		unsigned quadrant:2;
 		unsigned direct:1;
-		unsigned access:3;
+		unsigned access:4;
 		unsigned invalid:1;
 		unsigned efer_nx:1;
 		unsigned cr0_wp:1;
@@ -357,7 +357,8 @@ union kvm_mmu_page_role {
 		unsigned guest_mode:1;
 		unsigned passthrough:1;
 		unsigned is_mirror:1;
-		unsigned :4;
+		unsigned has_mbec:1;
+		unsigned:2;
 
 		/*
 		 * This is left at the top of the word so that
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index de3abec84fe5..9a98c063148c 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -547,6 +547,7 @@ enum vmcs_field {
 #define VMX_EPT_IPAT_BIT    			(1ull << 6)
 #define VMX_EPT_ACCESS_BIT			(1ull << 8)
 #define VMX_EPT_DIRTY_BIT			(1ull << 9)
+#define VMX_EPT_USER_EXECUTABLE_MASK		(1ull << 10)
 #define VMX_EPT_SUPPRESS_VE_BIT			(1ull << 63)
 #define VMX_EPT_RWX_MASK                        (VMX_EPT_READABLE_MASK |       \
 						 VMX_EPT_WRITABLE_MASK |       \
@@ -605,9 +606,12 @@ enum vm_entry_failure_code {
 #define EPT_VIOLATION_GVA_TRANSLATED	BIT(8)
 
 #define EPT_VIOLATION_RWX_TO_PROT(__epte) (((__epte) & VMX_EPT_RWX_MASK) << 3)
+#define EPT_VIOLATION_USER_EXEC_TO_PROT(__epte) (((__epte) & VMX_EPT_USER_EXECUTABLE_MASK) >> 4)
 
 static_assert(EPT_VIOLATION_RWX_TO_PROT(VMX_EPT_RWX_MASK) ==
 	      (EPT_VIOLATION_PROT_READ | EPT_VIOLATION_PROT_WRITE | EPT_VIOLATION_PROT_EXEC));
+static_assert(EPT_VIOLATION_USER_EXEC_TO_PROT(VMX_EPT_USER_EXECUTABLE_MASK) ==
+	      (EPT_VIOLATION_PROT_USER_EXEC));
 
 /*
  * Exit Qualifications for NOTIFY VM EXIT
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index f63074048ec6..558a15ff82e6 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -81,7 +81,8 @@ u8 kvm_mmu_get_max_tdp_level(void);
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_value, u64 mmio_mask, u64 access_mask);
 void kvm_mmu_set_mmio_spte_value(struct kvm *kvm, u64 mmio_value);
 void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask);
-void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only);
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only,
+			   bool has_mbec);
 
 void kvm_init_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
@@ -174,6 +175,11 @@ static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
 	__kvm_mmu_refresh_passthrough_bits(vcpu, mmu);
 }
 
+static inline bool mmu_has_mbec(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.mmu->root_role.has_mbec;
+}
+
 /*
  * Check if a given access (described through the I/D, W/R and U/S bits of a
  * page fault error code pfec) causes a permission fault with the given PTE
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index b1a7c7cc0c44..b0eb8d4c5ef2 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2033,7 +2033,7 @@ static bool kvm_sync_page_check(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	 */
 	const union kvm_mmu_page_role sync_role_ign = {
 		.level = 0xf,
-		.access = 0x7,
+		.access = 0xf,
 		.quadrant = 0x3,
 		.passthrough = 0x1,
 	};
@@ -3080,7 +3080,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, struct kvm_memory_slot *slot,
 		ret = RET_PF_SPURIOUS;
 	} else {
 		flush |= mmu_spte_update(sptep, spte);
-		trace_kvm_mmu_set_spte(level, gfn, sptep);
+		trace_kvm_mmu_set_spte(vcpu, level, gfn, sptep);
 	}
 
 	if (wrprot && write_fault)
@@ -3452,7 +3452,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		if (it.level == fault->goal_level)
 			break;
 
-		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_RWX);
+		sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
 		if (sp == ERR_PTR(-EEXIST))
 			continue;
 
@@ -3465,7 +3465,7 @@ static int direct_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	if (WARN_ON_ONCE(it.level != fault->goal_level))
 		return -EFAULT;
 
-	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_RWX,
+	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
 			   base_gfn, fault->pfn, fault);
 	if (ret == RET_PF_SPURIOUS)
 		return ret;
@@ -3698,9 +3698,9 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		 * current CPU took the fault.
 		 *
 		 * Need not check the access of upper level table entries since
-		 * they are always ACC_RWX.
+		 * they are always ACC_ALL.
 		 */
-		if (is_access_allowed(fault, spte)) {
+		if (is_access_allowed(fault, spte, vcpu)) {
 			ret = RET_PF_SPURIOUS;
 			break;
 		}
@@ -3748,7 +3748,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 
 		/* Verify that the fault can be handled in the fast path */
 		if (new_spte == spte ||
-		    !is_access_allowed(fault, new_spte))
+		    !is_access_allowed(fault, new_spte, vcpu))
 			break;
 
 		/*
@@ -4804,7 +4804,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	if (r)
 		return r;
 
-	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_RWX);
+	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
 	if (r != RET_PF_CONTINUE)
 		return r;
 
@@ -4895,7 +4895,7 @@ static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
 	if (r)
 		return r;
 
-	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_RWX);
+	r = kvm_mmu_faultin_pfn(vcpu, fault, ACC_ALL);
 	if (r != RET_PF_CONTINUE)
 		return r;
 
@@ -5614,7 +5614,7 @@ static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
 {
 	union kvm_cpu_role role = {0};
 
-	role.base.access = ACC_RWX;
+	role.base.access = ACC_ALL;
 	role.base.smm = is_smm(vcpu);
 	role.base.guest_mode = is_guest_mode(vcpu);
 	role.ext.valid = 1;
@@ -5695,7 +5695,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
 {
 	union kvm_mmu_page_role role = {0};
 
-	role.access = ACC_RWX;
+	role.access = ACC_ALL;
 	role.cr0_wp = true;
 	role.efer_nx = true;
 	role.smm = cpu_role.base.smm;
@@ -5826,7 +5826,7 @@ kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
 	role.base.direct = false;
 	role.base.ad_disabled = !accessed_dirty;
 	role.base.guest_mode = true;
-	role.base.access = ACC_RWX;
+	role.base.access = ACC_ALL;
 
 	role.ext.word = 0;
 	role.ext.execonly = execonly;
diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h
index 764e3015d021..74d51370422a 100644
--- a/arch/x86/kvm/mmu/mmutrace.h
+++ b/arch/x86/kvm/mmu/mmutrace.h
@@ -22,10 +22,16 @@
 	__entry->root_count = sp->root_count;		\
 	__entry->unsync = sp->unsync;
 
+/*
+ * X == ACC_EXEC_MASK: executable in all modes without MBEC, supervisor
+ *                     execute only when MBEC is enabled
+ * x == ACC_USER_EXEC_MASK: user execute, meaningful only with MBEC
+ */
 #define KVM_MMU_PAGE_PRINTK() ({				        \
 	const char *saved_ptr = trace_seq_buffer_ptr(p);		\
 	static const char *access_str[] = {			        \
-		"---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"  \
+		"----", "---X", "-w--", "-w-X", "--u-", "--uX", "-wu-", "-wuX", \
+		"x---", "x--X", "xw--", "xw-X", "x-u-", "x-uX", "xwu-", "xwuX"	\
 	};							        \
 	union kvm_mmu_page_role role;				        \
 								        \
@@ -336,8 +342,8 @@ TRACE_EVENT(
 
 TRACE_EVENT(
 	kvm_mmu_set_spte,
-	TP_PROTO(int level, gfn_t gfn, u64 *sptep),
-	TP_ARGS(level, gfn, sptep),
+	TP_PROTO(struct kvm_vcpu *vcpu, int level, gfn_t gfn, u64 *sptep),
+	TP_ARGS(vcpu, level, gfn, sptep),
 
 	TP_STRUCT__entry(
 		__field(u64, gfn)
@@ -346,7 +352,8 @@ TRACE_EVENT(
 		__field(u8, level)
 		/* These depend on page entry type, so compute them now.  */
 		__field(bool, r)
-		__field(bool, x)
+		__field(bool, kx)
+		__field(bool, ux)
 		__field(signed char, u)
 	),
 
@@ -356,15 +363,17 @@ TRACE_EVENT(
 		__entry->sptep = virt_to_phys(sptep);
 		__entry->level = level;
 		__entry->r = shadow_present_mask || (__entry->spte & PT_PRESENT_MASK);
-		__entry->x = is_executable_pte(__entry->spte);
+		__entry->kx = is_executable_pte(__entry->spte, false, vcpu);
+		__entry->ux = is_executable_pte(__entry->spte, true, vcpu);
 		__entry->u = shadow_user_mask ? !!(__entry->spte & shadow_user_mask) : -1;
 	),
 
-	TP_printk("gfn %llx spte %llx (%s%s%s%s) level %d at %llx",
+	TP_printk("gfn %llx spte %llx (%s%s%s%s%s) level %d at %llx",
 		  __entry->gfn, __entry->spte,
 		  __entry->r ? "r" : "-",
 		  __entry->spte & PT_WRITABLE_MASK ? "w" : "-",
-		  __entry->x ? "x" : "-",
+		  __entry->kx ? "X" : "-",
+		  __entry->ux ? "x" : "-",
 		  __entry->u == -1 ? "" : (__entry->u ? "u" : "-"),
 		  __entry->level, __entry->sptep
 	)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index ed762bb4b007..664b424108ed 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -124,12 +124,17 @@ static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *acce
 	*access &= mask;
 }
 
-static inline int FNAME(is_present_gpte)(unsigned long pte)
+static inline int FNAME(is_present_gpte)(struct kvm_vcpu *vcpu,
+					 unsigned long pte)
 {
 #if PTTYPE != PTTYPE_EPT
 	return pte & PT_PRESENT_MASK;
 #else
-	return pte & 7;
+	/*
+	 * For EPT, an entry is present if any of bits 2:0 are set.
+	 * With mode-based execute control, bit 10 also indicates presence.
+	 */
+	return (pte & 7) || (mmu_has_mbec(vcpu) && (pte & (1ULL << 10)));
 #endif
 }
 
@@ -152,7 +157,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp, u64 *spte,
 				  u64 gpte)
 {
-	if (!FNAME(is_present_gpte)(gpte))
+	if (!FNAME(is_present_gpte)(vcpu, gpte))
 		goto no_present;
 
 	/* Prefetch only accessed entries (unless A/D bits are disabled). */
@@ -181,8 +186,9 @@ static inline unsigned FNAME(gpte_access)(u64 gpte)
 	unsigned access;
 #if PTTYPE == PTTYPE_EPT
 	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
-		((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
-		((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
+		 ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
+		 ((gpte & VMX_EPT_USER_EXECUTABLE_MASK) ? ACC_USER_EXEC_MASK : 0) |
+		 ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
 #else
 	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
 	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
@@ -332,7 +338,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	if (walker->level == PT32E_ROOT_LEVEL) {
 		pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!FNAME(is_present_gpte)(pte))
+		if (!FNAME(is_present_gpte)(vcpu, pte))
 			goto error;
 		--walker->level;
 	}
@@ -414,7 +420,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 		 */
 		pte_access = pt_access & (pte ^ walk_nx_mask);
 
-		if (unlikely(!FNAME(is_present_gpte)(pte)))
+		if (unlikely(!FNAME(is_present_gpte)(vcpu, pte)))
 			goto error;
 
 		if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) {
@@ -493,6 +499,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	 *         out of date if it is serving an EPT misconfiguration.
 	 * [5:3] - Calculated by the page walk of the guest EPT page tables
 	 * [7:8] - Derived from [7:8] of real exit_qualification
+	 * [10]  - Derived from real exit_qualification, useful only with MBEC.
 	 *
 	 * The other bits are set to 0.
 	 */
@@ -511,6 +518,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 		 * ACC_*_MASK flags!
 		 */
 		walker->fault.exit_qualification |= EPT_VIOLATION_RWX_TO_PROT(pte_access);
+		if (mmu_has_mbec(vcpu))
+			walker->fault.exit_qualification |=
+				EPT_VIOLATION_USER_EXEC_TO_PROT(pte_access);
 	}
 #endif
 	walker->fault.address = addr;
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 37647afde7d3..a4720eedcacb 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -30,6 +30,7 @@ u64 __read_mostly shadow_host_writable_mask;
 u64 __read_mostly shadow_mmu_writable_mask;
 u64 __read_mostly shadow_nx_mask;
 u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
+u64 __read_mostly shadow_ux_mask;
 u64 __read_mostly shadow_user_mask;
 u64 __read_mostly shadow_accessed_mask;
 u64 __read_mostly shadow_dirty_mask;
@@ -223,19 +224,38 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	 * would tie make_spte() further to vCPU/MMU state, and add complexity
 	 * just to optimize a mode that is anything but performance critical.
 	 */
-	if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) &&
+	if (level > PG_LEVEL_4K &&
+	    (pte_access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) &&
 	    is_nx_huge_page_enabled(vcpu->kvm)) {
 		pte_access &= ~ACC_EXEC_MASK;
+		if (mmu_has_mbec(vcpu))
+			pte_access &= ~ACC_USER_EXEC_MASK;
 	}
 
-	if (pte_access & ACC_EXEC_MASK)
-		spte |= shadow_x_mask;
-	else
+	if (pte_access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) {
+		if (pte_access & ACC_EXEC_MASK)
+			spte |= shadow_x_mask;
+
+		if (pte_access & ACC_USER_EXEC_MASK)
+			spte |= shadow_ux_mask;
+	} else {
 		spte |= shadow_nx_mask;
+	}
 
 	if (pte_access & ACC_USER_MASK)
 		spte |= shadow_user_mask;
 
+	/*
+	 * With MBEC enabled, EPT misconfigurations occur if bit 0 is clear
+	 * (read disabled) and bit 10 is set (user-executable). Prevent the
+	 * creation of such invalid SPTEs by clearing the user-executable bit
+	 * when read access is not permitted.
+	 */
+	if (mmu_has_mbec(vcpu) &&
+	    !(spte & VMX_EPT_READABLE_MASK) &&
+	    (spte & VMX_EPT_USER_EXECUTABLE_MASK))
+		spte &= ~VMX_EPT_USER_EXECUTABLE_MASK;
+
 	if (level > PG_LEVEL_4K)
 		spte |= PT_PAGE_SIZE_MASK;
 
@@ -311,17 +331,21 @@ static u64 modify_spte_protections(u64 spte, u64 set, u64 clear)
 	KVM_MMU_WARN_ON(set & clear);
 	spte = (spte | set) & ~clear;
 
+	/*
+	 * With MBEC enabled, ensure we don't create invalid SPTEs where
+	 * read access is disabled but user-executable access is enabled.
+	 */
+	if (shadow_ux_mask &&
+	    !(spte & VMX_EPT_READABLE_MASK) &&
+	    (spte & VMX_EPT_USER_EXECUTABLE_MASK))
+		spte &= ~VMX_EPT_USER_EXECUTABLE_MASK;
+
 	if (is_access_track)
 		spte = mark_spte_for_access_track(spte);
 
 	return spte;
 }
 
-static u64 make_spte_executable(u64 spte)
-{
-	return modify_spte_protections(spte, shadow_x_mask, shadow_nx_mask);
-}
-
 static u64 make_spte_nonexecutable(u64 spte)
 {
 	return modify_spte_protections(spte, shadow_nx_mask, shadow_x_mask);
@@ -356,8 +380,14 @@ u64 make_small_spte(struct kvm *kvm, u64 huge_spte,
 		 * the page executable as the NX hugepage mitigation no longer
 		 * applies.
 		 */
-		if ((role.access & ACC_EXEC_MASK) && is_nx_huge_page_enabled(kvm))
-			child_spte = make_spte_executable(child_spte);
+		if ((role.access & (ACC_EXEC_MASK | ACC_USER_EXEC_MASK)) &&
+		    is_nx_huge_page_enabled(kvm)) {
+			if (role.access & ACC_EXEC_MASK)
+				child_spte |= shadow_x_mask;
+
+			if (role.access & ACC_USER_EXEC_MASK)
+				child_spte |= shadow_ux_mask;
+		}
 	}
 
 	return child_spte;
@@ -389,7 +419,8 @@ u64 make_nonleaf_spte(u64 *child_pt, bool ad_disabled)
 	u64 spte = SPTE_MMU_PRESENT_MASK;
 
 	spte |= __pa(child_pt) | shadow_present_mask | PT_WRITABLE_MASK |
-		shadow_user_mask | shadow_x_mask | shadow_me_value;
+		shadow_user_mask | shadow_ux_mask | shadow_x_mask |
+		shadow_me_value;
 
 	if (ad_disabled)
 		spte |= SPTE_TDP_AD_DISABLED;
@@ -489,7 +520,8 @@ void kvm_mmu_set_me_spte_mask(u64 me_value, u64 me_mask)
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_me_spte_mask);
 
-void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
+void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only,
+			   bool has_mbec)
 {
 	kvm_ad_enabled		= has_ad_bits;
 
@@ -501,7 +533,7 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
 	/* VMX_EPT_SUPPRESS_VE_BIT is needed for W or X violation. */
 	shadow_present_mask	=
 		(has_exec_only ? 0ull : VMX_EPT_READABLE_MASK) | VMX_EPT_SUPPRESS_VE_BIT;
-
+	shadow_ux_mask		= has_mbec ? VMX_EPT_USER_EXECUTABLE_MASK : 0ull;
 	shadow_acc_track_mask	= VMX_EPT_RWX_MASK;
 	shadow_host_writable_mask = EPT_SPTE_HOST_WRITABLE;
 	shadow_mmu_writable_mask  = EPT_SPTE_MMU_WRITABLE;
@@ -509,9 +541,11 @@ void kvm_mmu_set_ept_masks(bool has_ad_bits, bool has_exec_only)
 	/*
 	 * EPT Misconfigurations are generated if the value of bits 2:0
 	 * of an EPT paging-structure entry is 110b (write/execute).
+	 * With MBEC, the additional case of bit 0 clear and bit 10 set
+	 * (read disabled but user-executable) is prevented in make_spte().
 	 */
 	kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE,
-				   VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT, 0);
+				   (VMX_EPT_RWX_MASK | VMX_EPT_SUPPRESS_VE_BIT), 0);
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_set_ept_masks);
 
@@ -551,6 +585,7 @@ void kvm_mmu_reset_all_pte_masks(void)
 	shadow_dirty_mask	= PT_DIRTY_MASK;
 	shadow_nx_mask		= PT64_NX_MASK;
 	shadow_x_mask		= 0;
+	shadow_ux_mask		= 0;
 	shadow_present_mask	= PT_PRESENT_MASK;
 
 	shadow_acc_track_mask	= 0;
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 101a2f61ec96..74fb1fe60d89 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -45,8 +45,9 @@ static_assert(SPTE_TDP_AD_ENABLED == 0);
 #define ACC_EXEC_MASK    1
 #define ACC_WRITE_MASK   PT_WRITABLE_MASK
 #define ACC_USER_MASK    PT_USER_MASK
+#define ACC_USER_EXEC_MASK (1ULL << 3)
 #define ACC_RWX          (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
-#define ACC_ALL          ACC_RWX
+#define ACC_ALL          (ACC_RWX | ACC_USER_EXEC_MASK)
 
 /* The mask for the R/X bits in EPT PTEs */
 #define SPTE_EPT_READABLE_MASK			0x1ull
@@ -180,6 +181,7 @@ extern u64 __read_mostly shadow_mmu_writable_mask;
 extern u64 __read_mostly shadow_nx_mask;
 extern u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
 extern u64 __read_mostly shadow_user_mask;
+extern u64 __read_mostly shadow_ux_mask;
 extern u64 __read_mostly shadow_accessed_mask;
 extern u64 __read_mostly shadow_dirty_mask;
 extern u64 __read_mostly shadow_mmio_value;
@@ -344,9 +346,53 @@ static inline bool is_last_spte(u64 pte, int level)
 	return (level == PG_LEVEL_4K) || is_large_pte(pte);
 }
 
-static inline bool is_executable_pte(u64 spte)
+static inline bool is_executable_pte(u64 spte, bool is_user_access,
+				     struct kvm_vcpu *vcpu)
 {
-	return (spte & (shadow_x_mask | shadow_nx_mask)) == shadow_x_mask;
+	if (spte & shadow_nx_mask)
+		return false;
+
+	if (!mmu_has_mbec(vcpu))
+		return (spte & shadow_x_mask) == shadow_x_mask;
+
+	/*
+	 * Warn if AMD systems (where shadow_x_mask == 0) reach this point;
+	 * the fallback allows execution for user accesses and supervisor
+	 * pages, but until GMET is implemented this should be unreachable.
+	 */
+	if (WARN_ON_ONCE(!shadow_x_mask))
+		return is_user_access || !(spte & shadow_user_mask);
+
+	return spte & (is_user_access ? shadow_ux_mask : shadow_x_mask);
+}
+
+static inline bool is_executable_pte_fault(u64 spte,
+					   struct kvm_page_fault *fault,
+					   struct kvm_vcpu *vcpu)
+{
+	if (spte & shadow_nx_mask)
+		return false;
+
+	if (!mmu_has_mbec(vcpu))
+		return (spte & shadow_x_mask) == shadow_x_mask;
+
+	/*
+	 * Warn if AMD systems (where shadow_x_mask == 0) reach this point;
+	 * the fallback allows execution for user accesses and supervisor
+	 * pages, but until GMET is implemented this should be unreachable.
+	 */
+	if (WARN_ON_ONCE(!shadow_x_mask))
+		return fault->user || !(spte & shadow_user_mask);
+
+	/*
+	 * For TDP MMU, the fault->user bit indicates a read access,
+	 * not the guest's CPL. For execute faults, check both execute
+	 * permissions since we don't know the actual CPL.
+	 */
+	if (fault->is_tdp)
+		return spte & (shadow_x_mask | shadow_ux_mask);
+
+	return spte & (fault->user ? shadow_ux_mask : shadow_x_mask);
 }
 
 static inline kvm_pfn_t spte_to_pfn(u64 pte)
@@ -479,10 +525,11 @@ static inline bool is_mmu_writable_spte(u64 spte)
  * SPTE protections.  Note, the caller is responsible for checking that the
  * SPTE is a shadow-present, leaf SPTE (either before or after).
  */
-static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte,
+				     struct kvm_vcpu *vcpu)
 {
 	if (fault->exec)
-		return is_executable_pte(spte);
+		return is_executable_pte_fault(spte, fault, vcpu);
 
 	if (fault->write)
 		return is_writable_pte(spte);
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 98221ed34c51..46988a11dc51 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1183,16 +1183,20 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 		return RET_PF_RETRY;
 
 	if (is_shadow_present_pte(iter->old_spte) &&
-	    (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+	    (fault->prefetch ||
+	     is_access_allowed(fault, iter->old_spte, vcpu)) &&
 	    is_last_spte(iter->old_spte, iter->level)) {
 		WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
 		return RET_PF_SPURIOUS;
 	}
 
 	if (unlikely(!fault->slot))
-		new_spte = make_mmio_spte(vcpu, iter->gfn, ACC_RWX);
+		new_spte = make_mmio_spte(vcpu, iter->gfn,
+					  ACC_ALL);
 	else
-		wrprot = make_spte(vcpu, sp, fault->slot, ACC_RWX, iter->gfn,
+		wrprot = make_spte(vcpu, sp, fault->slot,
+				   ACC_ALL,
+				   iter->gfn,
 				   fault->pfn, iter->old_spte, fault->prefetch,
 				   false, fault->map_writable, &new_spte);
 
@@ -1220,7 +1224,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 				     new_spte);
 		ret = RET_PF_EMULATE;
 	} else {
-		trace_kvm_mmu_set_spte(iter->level, iter->gfn,
+		trace_kvm_mmu_set_spte(vcpu, iter->level, iter->gfn,
 				       rcu_dereference(iter->sptep));
 	}
 
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 02aadb9d730e..8107c8cb1d4b 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -270,6 +270,12 @@ static inline bool cpu_has_vmx_tsc_scaling(void)
 		SECONDARY_EXEC_TSC_SCALING;
 }
 
+static inline bool cpu_has_vmx_mode_based_ept_exec(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_MODE_BASED_EPT_EXEC;
+}
+
 static inline bool cpu_has_vmx_bus_lock_detection(void)
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index b6e370213769..520ccca27502 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8607,7 +8607,8 @@ __init int vmx_hardware_setup(void)
 
 	if (enable_ept)
 		kvm_mmu_set_ept_masks(enable_ept_ad_bits,
-				      cpu_has_vmx_ept_execute_only());
+				      cpu_has_vmx_ept_execute_only(),
+				      cpu_has_vmx_mode_based_ept_exec());
 	else
 		vt_x86_ops.get_mt_mask = NULL;
 
-- 
2.43.0

