Message-ID: <080351cb-6c3d-4540-953d-6205f1ff0745@suse.com>
Date: Tue, 22 Apr 2025 13:12:29 +0200
From: Jürgen Groß <jgross@...e.com>
To: "Xin Li (Intel)" <xin@...or.com>, linux-kernel@...r.kernel.org,
 kvm@...r.kernel.org, linux-perf-users@...r.kernel.org,
 linux-hyperv@...r.kernel.org, virtualization@...ts.linux.dev,
 linux-pm@...r.kernel.org, linux-edac@...r.kernel.org,
 xen-devel@...ts.xenproject.org, linux-acpi@...r.kernel.org,
 linux-hwmon@...r.kernel.org, netdev@...r.kernel.org,
 platform-driver-x86@...r.kernel.org
Cc: tglx@...utronix.de, mingo@...hat.com, bp@...en8.de,
 dave.hansen@...ux.intel.com, x86@...nel.org, hpa@...or.com, acme@...nel.org,
 andrew.cooper3@...rix.com, peterz@...radead.org, namhyung@...nel.org,
 mark.rutland@....com, alexander.shishkin@...ux.intel.com, jolsa@...nel.org,
 irogers@...gle.com, adrian.hunter@...el.com, kan.liang@...ux.intel.com,
 wei.liu@...nel.org, ajay.kaher@...adcom.com,
 bcm-kernel-feedback-list@...adcom.com, tony.luck@...el.com,
 pbonzini@...hat.com, vkuznets@...hat.com, seanjc@...gle.com,
 luto@...nel.org, boris.ostrovsky@...cle.com, kys@...rosoft.com,
 haiyangz@...rosoft.com, decui@...rosoft.com
Subject: Re: [RFC PATCH v2 22/34] x86/msr: Utilize the alternatives mechanism
 to read MSR

On 22.04.25 10:22, Xin Li (Intel) wrote:
> To eliminate the indirect call overhead introduced by the pv_ops API,
> utilize the alternatives mechanism to read MSR:
> 
>      1) When built with !CONFIG_XEN_PV, X86_FEATURE_XENPV becomes a
>         disabled feature, preventing the Xen code from being built
>         and ensuring the native code is executed unconditionally.
> 
>      2) When built with CONFIG_XEN_PV:
> 
>         2.1) If not running on the Xen hypervisor (!X86_FEATURE_XENPV),
>              the kernel runtime binary is patched to unconditionally
>              jump to the native MSR read code.
> 
>         2.2) If running on the Xen hypervisor (X86_FEATURE_XENPV), the
>              kernel runtime binary is patched to unconditionally jump
>              to the Xen MSR read code.
> 
> The alternatives mechanism is also used to choose the new immediate
> form MSR read instruction when it's available.
> 
> Consequently, remove the pv_ops MSR read APIs and the Xen callbacks.
> 
> Suggested-by: H. Peter Anvin (Intel) <hpa@...or.com>
> Signed-off-by: Xin Li (Intel) <xin@...or.com>
> ---
>   arch/x86/include/asm/msr.h            | 277 +++++++++++++++++++-------
>   arch/x86/include/asm/paravirt.h       |  40 ----
>   arch/x86/include/asm/paravirt_types.h |   9 -
>   arch/x86/kernel/paravirt.c            |   2 -
>   arch/x86/xen/enlighten_pv.c           |  48 ++---
>   arch/x86/xen/xen-asm.S                |  49 +++++
>   arch/x86/xen/xen-ops.h                |   7 +
>   7 files changed, 279 insertions(+), 153 deletions(-)
> 
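For reference: apply_alternatives() rewrites the kernel text once
during early boot, based on the CPU feature bits known at that point,
so the feature check itself costs nothing at run time. A minimal
sketch of the idea, with made-up symbol names (not the actual code of
this patch):

	#include <asm/alternative.h>

	/*
	 * The kernel boots with the native call in place; if
	 * X86_FEATURE_XENPV is set, apply_alternatives() patches in a
	 * call to the Xen variant instead, leaving no indirect pv_ops
	 * call at run time.
	 */
	asm volatile(ALTERNATIVE("call hypothetical_native_msr_read",
				 "call hypothetical_xen_msr_read",
				 X86_FEATURE_XENPV));
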
> diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
> index bd3bdb3c3d23..5271cb002b23 100644
> --- a/arch/x86/include/asm/msr.h
> +++ b/arch/x86/include/asm/msr.h
> @@ -75,6 +75,7 @@ static inline void do_trace_rdpmc(u32 msr, u64 val, int failed) {}
>   #endif
>   
>   #ifdef CONFIG_XEN_PV
> +extern void asm_xen_read_msr(void);
>   extern void asm_xen_write_msr(void);
>   extern u64 xen_read_pmc(int counter);
>   #endif
> @@ -88,6 +89,8 @@ extern u64 xen_read_pmc(int counter);
>   
>   /* The GNU Assembler (Gas) with Binutils 2.41 adds the .insn directive support */
>   #if defined(CONFIG_AS_IS_GNU) && CONFIG_AS_VERSION >= 24100
> +#define ASM_RDMSR_IMM			\
> +	" .insn VEX.128.F2.M7.W0 0xf6 /0, %[msr]%{:u32}, %[val]\n\t"
>   #define ASM_WRMSRNS_IMM			\
>   	" .insn VEX.128.F3.M7.W0 0xf6 /0, %[val], %[msr]%{:u32}\n\t"
>   #else
> @@ -97,10 +100,17 @@ extern u64 xen_read_pmc(int counter);
>    * The register operand is encoded as %rax because all uses of the immediate
>    * form MSR access instructions reference %rax as the register operand.
>    */
> +#define ASM_RDMSR_IMM			\
> +	" .byte 0xc4,0xe7,0x7b,0xf6,0xc0; .long %c[msr]"
>   #define ASM_WRMSRNS_IMM			\
>   	" .byte 0xc4,0xe7,0x7a,0xf6,0xc0; .long %c[msr]"
>   #endif
>   
> +#define RDMSR_AND_SAVE_RESULT		\
> +	"rdmsr\n\t"			\
> +	"shl $0x20, %%rdx\n\t"		\
> +	"or %%rdx, %%rax\n\t"
> +
>   #define PREPARE_RDX_FOR_WRMSR		\
>   	"mov %%rax, %%rdx\n\t"		\
>   	"shr $0x20, %%rdx\n\t"
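RDMSR returns the 64-bit value split across EDX:EAX; the shl/or pair
above just folds the two halves into a single register. The same
computation in C, with illustrative variable names:

	u64 val = ((u64)edx << 32) | eax;	/* edx:eax as left by RDMSR */
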
> @@ -127,35 +137,135 @@ static __always_inline bool is_msr_imm_insn(void *ip)
>   #endif
>   }
>   
> -static __always_inline u64 __rdmsr(u32 msr)
> +/*
> + * There are two sets of APIs for MSR accesses: native APIs and generic APIs.
> + * Native MSR APIs execute MSR instructions directly, regardless of whether the
> + * CPU is paravirtualized or native.  Generic MSR APIs determine the appropriate
> + * MSR access method at runtime, allowing them to be used generically on both
> + * paravirtualized and native CPUs.
> + *
> + * When the compiler can determine the MSR number at compile time, the APIs
> + * with the suffix _constant() are used to enable the immediate form MSR
> + * instructions when available.  The APIs with the suffix _variable() are
> + * used when the MSR number is not known until run time.
> + *
> + * Below is a diagram illustrating the derivation of the MSR read APIs:
> + *
> + *      __native_rdmsrq_variable()    __native_rdmsrq_constant()
> + *                         \           /
> + *                          \         /
> + *                         __native_rdmsrq()   -----------------------
> + *                            /     \                                |
> + *                           /       \                               |
> + *               native_rdmsrq()    native_read_msr_safe()           |
> + *                   /    \                                          |
> + *                  /      \                                         |
> + *      native_rdmsr()    native_read_msr()                          |
> + *                                                                   |
> + *                                                                   |
> + *                                                                   |
> + *                    __xenpv_rdmsrq()                               |
> + *                         |                                         |
> + *                         |                                         |
> + *                      __rdmsrq()   <--------------------------------
> + *                       /    \
> + *                      /      \
> + *                 rdmsrq()   rdmsrq_safe()
> + *                    /          \
> + *                   /            \
> + *                rdmsr()        rdmsr_safe()
> + */
> +
> +static __always_inline bool __native_rdmsrq_variable(u32 msr, u64 *val, int type)
>   {
> -	DECLARE_ARGS(val, low, high);
> +#ifdef CONFIG_X86_64
> +	BUILD_BUG_ON(__builtin_constant_p(msr));
>   
> -	asm volatile("1: rdmsr\n"
> -		     "2:\n"
> -		     _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR)
> -		     : EAX_EDX_RET(val, low, high) : "c" (msr));
> +	asm_inline volatile goto(
> +		"1:\n"
> +		RDMSR_AND_SAVE_RESULT
> +		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For RDMSR */
>   
> -	return EAX_EDX_VAL(val, low, high);
> +		: [val] "=a" (*val)
> +		: "c" (msr), [type] "i" (type)
> +		: "rdx"
> +		: badmsr);
> +#else
> +	asm_inline volatile goto(
> +		"1: rdmsr\n\t"
> +		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For RDMSR */
> +
> +		: "=A" (*val)
> +		: "c" (msr), [type] "i" (type)
> +		:
> +		: badmsr);
> +#endif
> +
> +	return false;
> +
> +badmsr:
> +	*val = 0;
> +
> +	return true;
>   }
>   
> -#define native_rdmsr(msr, val1, val2)			\
> -do {							\
> -	u64 __val = __rdmsr((msr));			\
> -	(void)((val1) = (u32)__val);			\
> -	(void)((val2) = (u32)(__val >> 32));		\
> -} while (0)
> +#ifdef CONFIG_X86_64
> +static __always_inline bool __native_rdmsrq_constant(u32 msr, u64 *val, int type)
> +{
> +	BUILD_BUG_ON(!__builtin_constant_p(msr));
> +
> +	asm_inline volatile goto(
> +		"1:\n"
> +		ALTERNATIVE("mov %[msr], %%ecx\n\t"
> +			    "2:\n"
> +			    RDMSR_AND_SAVE_RESULT,
> +			    ASM_RDMSR_IMM,
> +			    X86_FEATURE_MSR_IMM)
> +		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For RDMSR immediate */
> +		_ASM_EXTABLE_TYPE(2b, %l[badmsr], %c[type])	/* For RDMSR */
> +
> +		: [val] "=a" (*val)
> +		: [msr] "i" (msr), [type] "i" (type)
> +		: "ecx", "rdx"
> +		: badmsr);
> +
> +	return false;
> +
> +badmsr:
> +	*val = 0;
> +
> +	return true;
> +}
> +#endif
> +
> +static __always_inline bool __native_rdmsrq(u32 msr, u64 *val, int type)
> +{
> +#ifdef CONFIG_X86_64
> +	if (__builtin_constant_p(msr))
> +		return __native_rdmsrq_constant(msr, val, type);
> +#endif
> +
> +	return __native_rdmsrq_variable(msr, val, type);
> +}
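So with the dispatch above, a call site whose MSR number is a
compile-time constant can be patched to the immediate-form
instruction, while a run-time MSR number always uses the classic
%ecx-based RDMSR. Two hypothetical call sites for illustration:

	u64 a = native_rdmsrq(MSR_IA32_APERF);	/* constant: _constant() path,
						   RDMSR immediate if available */
	u64 b = native_rdmsrq(msr);		/* run-time value: _variable() path */
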
>   
>   static __always_inline u64 native_rdmsrq(u32 msr)
>   {
> -	return __rdmsr(msr);
> +	u64 val = 0;
> +
> +	__native_rdmsrq(msr, &val, EX_TYPE_RDMSR);
> +	return val;
>   }
>   
> +#define native_rdmsr(msr, low, high)			\
> +do {							\
> +	u64 __val = native_rdmsrq(msr);			\
> +	(void)((low) = (u32)__val);			\
> +	(void)((high) = (u32)(__val >> 32));		\
> +} while (0)
> +
>   static inline u64 native_read_msr(u32 msr)
>   {
> -	u64 val;
> -
> -	val = __rdmsr(msr);
> +	u64 val = native_rdmsrq(msr);
>   
>   	if (tracepoint_enabled(read_msr))
>   		do_trace_read_msr(msr, val, 0);
> @@ -163,36 +273,91 @@ static inline u64 native_read_msr(u32 msr)
>   	return val;
>   }
>   
> -static inline int native_read_msr_safe(u32 msr, u64 *p)
> +static inline int native_read_msr_safe(u32 msr, u64 *val)
>   {
>   	int err;
> -	DECLARE_ARGS(val, low, high);
>   
> -	asm volatile("1: rdmsr ; xor %[err],%[err]\n"
> -		     "2:\n\t"
> -		     _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err])
> -		     : [err] "=r" (err), EAX_EDX_RET(val, low, high)
> -		     : "c" (msr));
> -	if (tracepoint_enabled(read_msr))
> -		do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), err);
> +	err = __native_rdmsrq(msr, val, EX_TYPE_RDMSR_SAFE) ? -EIO : 0;
>   
> -	*p = EAX_EDX_VAL(val, low, high);
> +	if (tracepoint_enabled(read_msr))
> +		do_trace_read_msr(msr, *val, err);
>   
>   	return err;
>   }
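With the changed prototype a caller gets the full 64-bit value plus an
error code in one go; a hypothetical call site:

	u64 val;
	int err = native_read_msr_safe(MSR_PLATFORM_INFO, &val);

	if (err)	/* the RDMSR faulted; val has been set to 0 */
		return err;
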
>   
> +#ifdef CONFIG_XEN_PV
> +/* No plan to support immediate form MSR instructions in Xen */
> +static __always_inline bool __xenpv_rdmsrq(u32 msr, u64 *val, int type)
> +{
> +	asm_inline volatile goto(
> +		"1: call asm_xen_read_msr\n\t"
> +		_ASM_EXTABLE_TYPE(1b, %l[badmsr], %c[type])	/* For CALL */
> +
> +		: [val] "=a" (*val), ASM_CALL_CONSTRAINT
> +		: "c" (msr), [type] "i" (type)
> +		: "rdx"
> +		: badmsr);
> +
> +	return false;
> +
> +badmsr:
> +	*val = 0;
> +
> +	return true;
> +}
> +#endif
> +
> +static __always_inline bool __rdmsrq(u32 msr, u64 *val, int type)
> +{
> +	bool ret;
> +
> +#ifdef CONFIG_XEN_PV
> +	if (cpu_feature_enabled(X86_FEATURE_XENPV))
> +		return __xenpv_rdmsrq(msr, val, type);

I don't think this will work for the Xen PV case.

X86_FEATURE_XENPV is set only after the first MSR has already been read.

This can be fixed by setting the feature bit earlier, but it shows that
the paravirt interface has its benefits in cases like this.
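
Roughly, the ordering during PV boot is (simplified sketch):

	xen_start_kernel()
		...	/* MSRs may already be read here; with this
			   patch __rdmsrq() still sees
			   !X86_FEATURE_XENPV and would execute a
			   native RDMSR in the PV guest */
		xen_init_capabilities()
			setup_force_cpu_cap(X86_FEATURE_XENPV);
			/* only from here on does
			   cpu_feature_enabled() select
			   __xenpv_rdmsrq() */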


Juergen
