This came out a lot more complicated than I would have liked. Instead of simply enabling all of the XSAVE features that we both know about and the CPU supports, we have to be careful to not overflow our buffer in 'init_fpstate.xsave'. To do this, we enable each XSAVE feature and then ask the CPU how large that makes the buffer. If we would overflow the buffer we allocated, we turn off the feature. This means that no matter what the CPU does, we will not corrupt random memory like we do before this patch. It also means that we can fall back in a way which cripples the system the least. --- b/arch/x86/include/asm/fpu/types.h | 20 ++++- b/arch/x86/kernel/fpu/xstate.c | 140 +++++++++++++++++++++++++++++++------ 2 files changed, 134 insertions(+), 26 deletions(-) diff -puN arch/x86/kernel/fpu/xstate.c~xsaves-are-my-friend arch/x86/kernel/fpu/xstate.c --- a/arch/x86/kernel/fpu/xstate.c~xsaves-are-my-friend 2015-07-15 15:44:58.280300715 -0700 +++ b/arch/x86/kernel/fpu/xstate.c 2015-07-15 17:12:28.744659856 -0700 @@ -161,11 +161,10 @@ void fpstate_sanitize_xstate(struct fpu */ void fpu__init_cpu_xstate(void) { - if (!cpu_has_xsave || !xfeatures_mask) + if (!cpu_has_xsave) return; cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); } /* @@ -291,25 +290,129 @@ static void __init setup_init_fpu_buf(vo } /* - * Calculate total size of enabled xstates in XCR0/xfeatures_mask. + * This returns the size that the 'xsave*' instructions will + * write to the xsave buffer. This is dynamic and will change + * as we enable more feature bits in XCR0. 
*/ -static void __init init_xstate_size(void) +unsigned int kernel_xstate_buffer_size(void) { unsigned int eax, ebx, ecx, edx; - int i; if (!cpu_has_xsaves) { cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; - return; + return ebx; + } else { + cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx); + return ebx; } +} + +u64 cpu_supported_xfeatures_mask(void) +{ + unsigned int eax, ebx, ecx, edx; + u64 ret; + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); + ret = eax + ((u64)edx << 32); + + return ret; +} + +/* + * We enable the features in sets. Some implementations are + * buggy and do not handle it if only one of two related + * xfeatures is enabled. + */ +u64 xfeature_sets[] = { + XSTATE_YMM, + XSTATE_BNDREGS | XSTATE_BNDCSR, + XSTATE_AVX512, +}; + +/* + * Enable each supported feature in XCR0, determine the required + * size of the save buffer with the feature enabled, and determine + * if it fits in the buffer we allocated. + * + * This function will initialize these global variables: + * 1. xfeatures_mask + * 2. xstate_size + */ +static void __init enable_supported_xfeatures(void) +{ + int i; + int xbufsz = sizeof(init_fpstate.xsave); + u64 cpu_support_mask = cpu_supported_xfeatures_mask(); - xstate_size = FXSAVE_SIZE + XSAVE_HDR_SIZE; - for (i = 2; i < 64; i++) { - if (test_bit(i, (unsigned long *)&xfeatures_mask)) { - cpuid_count(XSTATE_CPUID, i, &eax, &ebx, &ecx, &edx); - xstate_size += eax; + /* + * We are past sanity_check_xfeatures_supported() so we + * know these states are supported. 
+ */ + xfeatures_mask |= XSTATE_FPSSE; + + for (i = 0; i < ARRAY_SIZE(xfeature_sets); i++) { + u64 feature_mask = xfeature_sets[i]; + int tmp_xstate_size; + + /* Support only the state known to the OS: */ + if ((XCNTXT_MASK & feature_mask) != feature_mask) + continue; + /* Only try the states supported by the CPU */ + if ((cpu_support_mask & feature_mask) != feature_mask) + continue; + + /* Enable the potential feature in XCR0 */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask | feature_mask); + /* Ask the CPU how large of an XSAVE buffer we need now */ + tmp_xstate_size = kernel_xstate_buffer_size(); + + if (tmp_xstate_size > xbufsz) { + /* reset XCR0 to its previous value */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + pr_warn("XSAVE feature 0x%02llx overflows init buf, " + "disabling (%d > %d)\n", feature_mask, + tmp_xstate_size, xbufsz); + continue; } + /* "Commit" the feature to the permanent mask */ + xfeatures_mask |= feature_mask; + xstate_size = tmp_xstate_size; + } + pr_info("x86/fpu: enabled %ld xfeatures (0x%llx), in %d byte buffer\n", + hweight64(xfeatures_mask), xfeatures_mask, xstate_size); + /* XCR0 should be fully initialized now */ +} + +/* + * The kernel generally only checks CPUID bits to tell if it + * should be using a given feature. Some features are unusable + * unless we are able to enable XSAVE state management for them. + * So, we clear the CPUID bits for any features that we did not + * enable. + */ +void clear_unsupported_xfeature_cpuid_flags(void) +{ + /* + * SDM: An operating system must enable its YMM state + * management to support AVX... otherwise, an attempt + * to execute an instruction causes a #UD exception. 
+ */ + if (!(xfeatures_mask & XSTATE_YMM)) { + setup_clear_cpu_cap(X86_FEATURE_AVX); + setup_clear_cpu_cap(X86_FEATURE_AVX2); + } + /* + * SDM: An OS must enable its ZMM and opmask state + * management to support Intel AVX-512 + */ + if (!(xfeatures_mask & XSTATE_OPMASK) || + !(xfeatures_mask & XSTATE_ZMM_Hi256) || + !(xfeatures_mask & XSTATE_Hi16_ZMM)) { + setup_clear_cpu_cap(X86_FEATURE_AVX512F); + } + if (!(xfeatures_mask & XSTATE_BNDREGS) || + !(xfeatures_mask & XSTATE_BNDCSR)) { + setup_clear_cpu_cap(X86_FEATURE_MPX); } } @@ -319,7 +422,6 @@ static void __init init_xstate_size(void */ void __init fpu__init_system_xstate(void) { - unsigned int eax, ebx, ecx, edx; static int on_boot_cpu = 1; WARN_ON_FPU(!on_boot_cpu); @@ -335,22 +437,16 @@ void __init fpu__init_system_xstate(void return; } - cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); - - if ((xfeatures_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + if ((cpu_supported_xfeatures_mask() & XSTATE_FPSSE) != XSTATE_FPSSE) { pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); BUG(); } - /* Support only the state known to the OS: */ - xfeatures_mask = xfeatures_mask & XCNTXT_MASK; - /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); - /* Recompute the context size for enabled features: */ - init_xstate_size(); + enable_supported_xfeatures(); + clear_unsupported_xfeature_cpuid_flags(); update_regset_xstate_info(xstate_size, xfeatures_mask); fpu__init_prepare_fx_sw_frame(); diff -puN arch/x86/include/asm/fpu/types.h~xsaves-are-my-friend arch/x86/include/asm/fpu/types.h --- a/arch/x86/include/asm/fpu/types.h~xsaves-are-my-friend 2015-07-15 15:44:58.285300941 -0700 +++ b/arch/x86/include/asm/fpu/types.h 2015-07-15 16:54:46.641091924 -0700 @@ -159,10 +159,22 @@ struct xstate_header { u64 reserved[6]; } __attribute__((packed)); -/* New processor state extensions should be added 
here: */ -#define XSTATE_RESERVE (sizeof(struct ymmh_struct) + \ - sizeof(struct lwp_struct) + \ - sizeof(struct mpx_struct) ) +/* + * The largest xsave buffer known today is 2752 bytes on a system + * implementing AVX-512. This includes the 512-byte i387 state + * and 64-byte header. We add a small amount of padding in case + * an implementation adds some padding or wastes some space. + * + * Note, if we overflow this, we will disable some XSTATE + * features. + * + * Also, note that the real size we need is enumerated by + * cpuid leaves and can not be known at compile time. + */ +#define XSTATE_RESERVE (2752 + 256 - \ + sizeof(struct xstate_header) - \ + sizeof(struct fxregs_state)) + /* * This is our most modern FPU state format, as saved by the XSAVE * and restored by the XRSTOR instructions. _