[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <Pine.LNX.4.64.0706071850180.16563@p34.internal.lan>
Date: Thu, 7 Jun 2007 18:50:24 -0400 (EDT)
From: Justin Piszcz <jpiszcz@...idpixels.com>
To: Jesse Barnes <jesse.barnes@...el.com>
cc: Andi Kleen <andi@...stfloor.org>, linux-kernel@...r.kernel.org,
"Eric W. Biederman" <ebiederm@...ssion.com>
Subject: Re: [PATCH] trim memory not covered by WB MTRRs
Will see if it still patches against -rc4.
On Thu, 7 Jun 2007, Jesse Barnes wrote:
> On some machines, buggy BIOSes don't properly setup WB MTRRs to
> cover all available RAM, meaning the last few megs (or even gigs)
> of memory will be marked uncached. Since Linux tends to allocate
> from high memory addresses first, this causes the machine to be
> unusably slow as soon as the kernel starts really using memory
> (i.e. right around init time).
>
> This patch works around the problem by scanning the MTRRs at
> boot and figuring out whether the current end_pfn value (setup
> by early e820 code) goes beyond the highest WB MTRR range, and
> if so, trimming it to match. A fairly obnoxious KERN_WARNING
> is printed too, letting the user know that not all of their
> memory is available due to a likely BIOS bug.
>
> Something similar could be done on i386 if needed, but the boot
> ordering would be slightly different, since the MTRR code on i386
> depends on the boot_cpu_data structure being setup.
>
> This patch incorporates the feedback from Eric and Andi:
> - use MAX_VAR_RANGES instead of NUM_VAR_RANGES
> - move array declaration to header file as an extern
> - add command line disable option "disable_mtrr_trim"
> - don't run the trim code if the MTRR default type is cacheable
> - don't run the trim code on non-Intel machines
>
> Justin, feel free to test again if you have time and add your
> "Tested-by" signoff.
>
> Andi, as for large pages, do you think this is ok as is, or should
> I trim a larger granularity? If so, what granularity?
>
> Signed-off-by: Jesse Barnes <jesse.barnes@...el.com>
>
> Thanks,
> Jesse
>
> diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
> index 5d0283c..cb728a8 100644
> --- a/Documentation/kernel-parameters.txt
> +++ b/Documentation/kernel-parameters.txt
> @@ -553,6 +553,12 @@ and is between 256 and 4096 characters. It is defined in the file
> See drivers/char/README.epca and
> Documentation/digiepca.txt.
>
> + disable_mtrr_trim [X86-64]
> + By default the kernel will trim any uncacheable
> + memory out of your available memory pool based on
> + MTRR settings. This parameter disables that behavior,
> + possibly causing your machine to run very slowly.
> +
> dmascc= [HW,AX25,SERIAL] AX.25 Z80SCC driver with DMA
> support available.
> Format: <io_dev0>[,<io_dev1>[,..<io_dev32>]]
> diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c
> index c4ebb51..8eb3085 100644
> --- a/arch/i386/kernel/cpu/mtrr/generic.c
> +++ b/arch/i386/kernel/cpu/mtrr/generic.c
> @@ -13,7 +13,7 @@
> #include "mtrr.h"
>
> struct mtrr_state {
> - struct mtrr_var_range *var_ranges;
> + struct mtrr_var_range var_ranges[MAX_VAR_RANGES];
> mtrr_type fixed_ranges[NUM_FIXED_RANGES];
> unsigned char enabled;
> unsigned char have_fixed;
> @@ -84,12 +84,6 @@ void get_mtrr_state(void)
> struct mtrr_var_range *vrs;
> unsigned lo, dummy;
>
> - if (!mtrr_state.var_ranges) {
> - mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range),
> - GFP_KERNEL);
> - if (!mtrr_state.var_ranges)
> - return;
> - }
> vrs = mtrr_state.var_ranges;
>
> rdmsr(MTRRcap_MSR, lo, dummy);
> diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c
> index c7d8f17..0e34a67 100644
> --- a/arch/i386/kernel/cpu/mtrr/if.c
> +++ b/arch/i386/kernel/cpu/mtrr/if.c
> @@ -11,10 +11,6 @@
> #include <asm/mtrr.h>
> #include "mtrr.h"
>
> -/* RED-PEN: this is accessed without any locking */
> -extern unsigned int *usage_table;
> -
> -
> #define FILE_FCOUNT(f) (((struct seq_file *)((f)->private_data))->private)
>
> static const char *const mtrr_strings[MTRR_NUM_TYPES] =
> diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c
> index 7202b98..ef552ba 100644
> --- a/arch/i386/kernel/cpu/mtrr/main.c
> +++ b/arch/i386/kernel/cpu/mtrr/main.c
> @@ -38,8 +38,8 @@
> #include <linux/cpu.h>
> #include <linux/mutex.h>
>
> +#include <asm/e820.h>
> #include <asm/mtrr.h>
> -
> #include <asm/uaccess.h>
> #include <asm/processor.h>
> #include <asm/msr.h>
> @@ -47,7 +47,7 @@
>
> u32 num_var_ranges = 0;
>
> -unsigned int *usage_table;
> +unsigned int usage_table[MAX_VAR_RANGES];
> static DEFINE_MUTEX(mtrr_mutex);
>
> u64 size_or_mask, size_and_mask;
> @@ -121,11 +121,6 @@ static void __init init_table(void)
> int i, max;
>
> max = num_var_ranges;
> - if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL))
> - == NULL) {
> - printk(KERN_ERR "mtrr: could not allocate\n");
> - return;
> - }
> for (i = 0; i < max; i++)
> usage_table[i] = 1;
> }
> @@ -589,16 +584,11 @@ struct mtrr_value {
> unsigned long lsize;
> };
>
> -static struct mtrr_value * mtrr_state;
> +static struct mtrr_value mtrr_state[MAX_VAR_RANGES];
>
> static int mtrr_save(struct sys_device * sysdev, pm_message_t state)
> {
> int i;
> - int size = num_var_ranges * sizeof(struct mtrr_value);
> -
> - mtrr_state = kzalloc(size,GFP_ATOMIC);
> - if (!mtrr_state)
> - return -ENOMEM;
>
> for (i = 0; i < num_var_ranges; i++) {
> mtrr_if->get(i,
> @@ -620,7 +610,6 @@ static int mtrr_restore(struct sys_device * sysdev)
> mtrr_state[i].lsize,
> mtrr_state[i].ltype);
> }
> - kfree(mtrr_state);
> return 0;
> }
>
> @@ -631,6 +620,57 @@ static struct sysdev_driver mtrr_sysdev_driver = {
> .resume = mtrr_restore,
> };
>
> +static int disable_mtrr_trim;
> +
> +static int __init disable_mtrr_trim_setup(char *str)
> +{
> + disable_mtrr_trim = 1;
> + return 0;
> +}
> +early_param("disable_mtrr_trim", disable_mtrr_trim_setup);
> +
> +/**
> + * mtrr_trim_uncached_memory - trim RAM not covered by MTRRs
> + *
> + * Some buggy BIOSes don't setup the MTRRs properly for systems with certain
> + * memory configurations. This routine checks to make sure the MTRRs having
> + * a write back type cover all of the memory the kernel is intending to use.
> + * If not, it'll trim any memory off the end by adjusting end_pfn, removing
> + * it from the kernel's allocation pools, warning the user with an obnoxious
> + * message.
> + */
> +void __init mtrr_trim_uncached_memory(void)
> +{
> + unsigned long i, base, size, highest_addr = 0, def, dummy;
> + mtrr_type type;
> +
> + /* Make sure we only trim uncachable memory on Intel machines */
> + rdmsr(MTRRdefType_MSR, def, dummy);
> + def &= 0xff;
> + if (!use_intel() || disable_mtrr_trim || def != MTRR_TYPE_UNCACHABLE)
> + return;
> +
> + /* Find highest cached pfn */
> + for (i = 0; i < num_var_ranges; i++) {
> + mtrr_if->get(i, &base, &size, &type);
> + if (type != MTRR_TYPE_WRBACK)
> + continue;
> + base <<= PAGE_SHIFT;
> + size <<= PAGE_SHIFT;
> + if (highest_addr < base + size)
> + highest_addr = base + size;
> + }
> +
> + if ((highest_addr >> PAGE_SHIFT) != end_pfn) {
> + printk(KERN_WARNING "***************\n");
> + printk(KERN_WARNING "**** WARNING: likely BIOS bug\n");
> + printk(KERN_WARNING "**** MTRRs don't cover all of "
> + "memory, trimmed %ld pages\n", end_pfn -
> + (highest_addr >> PAGE_SHIFT));
> + printk(KERN_WARNING "***************\n");
> + end_pfn = highest_addr >> PAGE_SHIFT;
> + }
> +}
>
> /**
> * mtrr_bp_init - initialize mtrrs on the boot CPU
> diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h
> index 289dfe6..627b339 100644
> --- a/arch/i386/kernel/cpu/mtrr/mtrr.h
> +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h
> @@ -14,6 +14,7 @@
> #define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
>
> #define NUM_FIXED_RANGES 88
> +#define MAX_VAR_RANGES 256
> #define MTRRfix64K_00000_MSR 0x250
> #define MTRRfix16K_80000_MSR 0x258
> #define MTRRfix16K_A0000_MSR 0x259
> @@ -34,6 +35,8 @@
> an 8 bit field: */
> typedef u8 mtrr_type;
>
> +extern unsigned int usage_table[MAX_VAR_RANGES];
> +
> struct mtrr_ops {
> u32 vendor;
> u32 use_intel_if;
> diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c
> index c3c6b91..c138eac 100644
> --- a/arch/x86_64/kernel/bugs.c
> +++ b/arch/x86_64/kernel/bugs.c
> @@ -14,7 +14,6 @@
> void __init check_bugs(void)
> {
> identify_cpu(&boot_cpu_data);
> - mtrr_bp_init();
> #if !defined(CONFIG_SMP)
> printk("CPU: ");
> print_cpu_info(&boot_cpu_data);
> diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
> index eb6524f..409b63c 100644
> --- a/arch/x86_64/kernel/setup.c
> +++ b/arch/x86_64/kernel/setup.c
> @@ -266,6 +266,10 @@ void __init setup_arch(char **cmdline_p)
> * we are rounding upwards:
> */
> end_pfn = e820_end_of_ram();
> + /* Trim memory not covered by WB MTRRs */
> + mtrr_bp_init();
> + mtrr_trim_uncached_memory();
> +
> num_physpages = end_pfn;
>
> check_efer();
> diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h
> index b557c48..cc62bd8 100644
> --- a/include/asm-x86_64/mtrr.h
> +++ b/include/asm-x86_64/mtrr.h
> @@ -78,6 +78,7 @@ extern int mtrr_add_page (unsigned long base, unsigned long size,
> unsigned int type, char increment);
> extern int mtrr_del (int reg, unsigned long base, unsigned long size);
> extern int mtrr_del_page (int reg, unsigned long base, unsigned long size);
> +extern void mtrr_trim_uncached_memory(void);
> # else
> static __inline__ int mtrr_add (unsigned long base, unsigned long size,
> unsigned int type, char increment)
>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists