lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <alpine.DEB.2.02.1211131707380.28049@kaball.uk.xensource.com>
Date:	Tue, 13 Nov 2012 17:26:20 +0000
From:	Stefano Stabellini <stefano.stabellini@...citrix.com>
To:	Yinghai Lu <yinghai@...nel.org>
CC:	Thomas Gleixner <tglx@...utronix.de>, Ingo Molnar <mingo@...e.hu>,
	"H. Peter Anvin" <hpa@...or.com>, Jacob Shin <jacob.shin@....com>,
	Andrew Morton <akpm@...ux-foundation.org>,
	Stefano Stabellini <Stefano.Stabellini@...citrix.com>,
	Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH 21/46] x86, mm: setup page table in top-down

On Mon, 12 Nov 2012, Yinghai Lu wrote:
> Get pgt_buf early from BRK, and use it to map PMD_SIZE from top at first.
> Then use mapped pages to map more ranges below, and keep looping until
> all pages get mapped.
> 
> alloc_low_page will use page from BRK at first, after that buffer is used
> up, will use memblock to find and reserve pages for page table usage.
> 
> Introduce min_pfn_mapped to make sure find new pages from mapped ranges,
> that will be updated when lower pages get mapped.
> 
> Also add step_size to make sure that don't try to map too big range with
> limited mapped pages initially, and increase the step_size when we have
> more mapped pages on hand.
> 
> At last we can get rid of calculation and find early pgt related code.
> 
> -v2: update to after fix_xen change,
>      also use MACRO for initial pgt_buf size and add comments with it.
> -v3: skip big reserved range in memblock.reserved near end.
> -v4: don't need fix_xen change now.
> 
> Suggested-by: "H. Peter Anvin" <hpa@...or.com>
> Signed-off-by: Yinghai Lu <yinghai@...nel.org>

The changes to alloc_low_page and early_alloc_pgt_buf look OK to me.

The changes to init_mem_mapping are a bit iffy, but they aren't too
unreasonable.
Overall the patch is OK, even though I would certainly appreciate more
comments and better variable names (real_end?); see below.


>  arch/x86/include/asm/page_types.h |    1 +
>  arch/x86/include/asm/pgtable.h    |    1 +
>  arch/x86/kernel/setup.c           |    3 +
>  arch/x86/mm/init.c                |  210 +++++++++++--------------------------
>  arch/x86/mm/init_32.c             |   17 +++-
>  arch/x86/mm/init_64.c             |   17 +++-
>  6 files changed, 94 insertions(+), 155 deletions(-)
> 
> diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
> index 54c9787..9f6f3e6 100644
> --- a/arch/x86/include/asm/page_types.h
> +++ b/arch/x86/include/asm/page_types.h
> @@ -45,6 +45,7 @@ extern int devmem_is_allowed(unsigned long pagenr);
> 
>  extern unsigned long max_low_pfn_mapped;
>  extern unsigned long max_pfn_mapped;
> +extern unsigned long min_pfn_mapped;
> 
>  static inline phys_addr_t get_max_mapped(void)
>  {
> diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
> index dd1a888..6991a3e 100644
> --- a/arch/x86/include/asm/pgtable.h
> +++ b/arch/x86/include/asm/pgtable.h
> @@ -603,6 +603,7 @@ static inline int pgd_none(pgd_t pgd)
> 
>  extern int direct_gbpages;
>  void init_mem_mapping(void);
> +void early_alloc_pgt_buf(void);
> 
>  /* local pte updates need not use xchg for locking */
>  static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 94f922a..f7634092 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -124,6 +124,7 @@
>   */
>  unsigned long max_low_pfn_mapped;
>  unsigned long max_pfn_mapped;
> +unsigned long min_pfn_mapped;
> 
>  #ifdef CONFIG_DMI
>  RESERVE_BRK(dmi_alloc, 65536);
> @@ -900,6 +901,8 @@ void __init setup_arch(char **cmdline_p)
> 
>         reserve_ibft_region();
> 
> +       early_alloc_pgt_buf();
> +
>         /*
>          * Need to conclude brk, before memblock_x86_fill()
>          *  it could use memblock_find_in_range, could overlap with
> diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
> index 47a1ba2..76a6e82 100644
> --- a/arch/x86/mm/init.c
> +++ b/arch/x86/mm/init.c
> @@ -21,6 +21,21 @@ unsigned long __initdata pgt_buf_start;
>  unsigned long __meminitdata pgt_buf_end;
>  unsigned long __meminitdata pgt_buf_top;
> 
> +/* need 4 4k for initial PMD_SIZE, 4k for 0-ISA_END_ADDRESS */
> +#define INIT_PGT_BUF_SIZE      (5 * PAGE_SIZE)
> +RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
> +void  __init early_alloc_pgt_buf(void)
> +{
> +       unsigned long tables = INIT_PGT_BUF_SIZE;
> +       phys_addr_t base;
> +
> +       base = __pa(extend_brk(tables, PAGE_SIZE));
> +
> +       pgt_buf_start = base >> PAGE_SHIFT;
> +       pgt_buf_end = pgt_buf_start;
> +       pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
> +}
> +
>  int after_bootmem;
> 
>  int direct_gbpages
> @@ -228,105 +243,6 @@ static int __meminit split_mem_range(struct map_range *mr, int nr_range,
>         return nr_range;
>  }
> 
> -/*
> - * First calculate space needed for kernel direct mapping page tables to cover
> - * mr[0].start to mr[nr_range - 1].end, while accounting for possible 2M and 1GB
> - * pages. Then find enough contiguous space for those page tables.
> - */
> -static unsigned long __init calculate_table_space_size(unsigned long start, unsigned long end)
> -{
> -       int i;
> -       unsigned long puds = 0, pmds = 0, ptes = 0, tables;
> -       struct map_range mr[NR_RANGE_MR];
> -       int nr_range;
> -
> -       memset(mr, 0, sizeof(mr));
> -       nr_range = 0;
> -       nr_range = split_mem_range(mr, nr_range, start, end);
> -
> -       for (i = 0; i < nr_range; i++) {
> -               unsigned long range, extra;
> -
> -               range = mr[i].end - mr[i].start;
> -               puds += (range + PUD_SIZE - 1) >> PUD_SHIFT;
> -
> -               if (mr[i].page_size_mask & (1 << PG_LEVEL_1G)) {
> -                       extra = range - ((range >> PUD_SHIFT) << PUD_SHIFT);
> -                       pmds += (extra + PMD_SIZE - 1) >> PMD_SHIFT;
> -               } else {
> -                       pmds += (range + PMD_SIZE - 1) >> PMD_SHIFT;
> -               }
> -
> -               if (mr[i].page_size_mask & (1 << PG_LEVEL_2M)) {
> -                       extra = range - ((range >> PMD_SHIFT) << PMD_SHIFT);
> -#ifdef CONFIG_X86_32
> -                       extra += PMD_SIZE;
> -#endif
> -                       ptes += (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
> -               } else {
> -                       ptes += (range + PAGE_SIZE - 1) >> PAGE_SHIFT;
> -               }
> -       }
> -
> -       tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
> -       tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
> -       tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
> -
> -#ifdef CONFIG_X86_32
> -       /* for fixmap */
> -       tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
> -#endif
> -
> -       return tables;
> -}
> -
> -static unsigned long __init calculate_all_table_space_size(void)
> -{
> -       unsigned long start_pfn, end_pfn;
> -       unsigned long tables;
> -       int i;
> -
> -       /* the ISA range is always mapped regardless of memory holes */
> -       tables = calculate_table_space_size(0, ISA_END_ADDRESS);
> -
> -       for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> -               u64 start = start_pfn << PAGE_SHIFT;
> -               u64 end = end_pfn << PAGE_SHIFT;
> -
> -               if (end <= ISA_END_ADDRESS)
> -                       continue;
> -
> -               if (start < ISA_END_ADDRESS)
> -                       start = ISA_END_ADDRESS;
> -#ifdef CONFIG_X86_32
> -               /* on 32 bit, we only map up to max_low_pfn */
> -               if ((start >> PAGE_SHIFT) >= max_low_pfn)
> -                       continue;
> -
> -               if ((end >> PAGE_SHIFT) > max_low_pfn)
> -                       end = max_low_pfn << PAGE_SHIFT;
> -#endif
> -               tables += calculate_table_space_size(start, end);
> -       }
> -
> -       return tables;
> -}
> -
> -static void __init find_early_table_space(unsigned long start,
> -                                         unsigned long good_end,
> -                                         unsigned long tables)
> -{
> -       phys_addr_t base;
> -
> -       base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
> -       if (!base)
> -               panic("Cannot find space for the kernel page tables");
> -
> -       pgt_buf_start = base >> PAGE_SHIFT;
> -       pgt_buf_end = pgt_buf_start;
> -       pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
> -}
> -
>  static struct range pfn_mapped[E820_X_MAX];
>  static int nr_pfn_mapped;
> 
> @@ -392,17 +308,14 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
>  }
> 
>  /*
> - * Iterate through E820 memory map and create direct mappings for only E820_RAM
> - * regions. We cannot simply create direct mappings for all pfns from
> - * [0 to max_low_pfn) and [4GB to max_pfn) because of possible memory holes in
> - * high addresses that cannot be marked as UC by fixed/variable range MTRRs.
> - * Depending on the alignment of E820 ranges, this may possibly result in using
> - * smaller size (i.e. 4K instead of 2M or 1G) page tables.
> + * this one could take range with hole in it.
>   */

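This comment in particular is not very satisfactory: it replaces the
explanation of why only E820_RAM regions are mapped with a single line
that does not say what the function does.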


> -static void __init init_range_memory_mapping(unsigned long range_start,
> +static unsigned long __init init_range_memory_mapping(
> +                                          unsigned long range_start,
>                                            unsigned long range_end)
>  {
>         unsigned long start_pfn, end_pfn;
> +       unsigned long mapped_ram_size = 0;
>         int i;
> 
>         for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
> @@ -422,71 +335,70 @@ static void __init init_range_memory_mapping(unsigned long range_start,
>                         end = range_end;
> 
>                 init_memory_mapping(start, end);
> +
> +               mapped_ram_size += end - start;
>         }
> +
> +       return mapped_ram_size;
>  }
> 
> +/* (PUD_SHIFT-PMD_SHIFT)/2 */
> +#define STEP_SIZE_SHIFT 5
>  void __init init_mem_mapping(void)
>  {
> -       unsigned long tables, good_end, end;
> +       unsigned long end, real_end, start, last_start;
> +       unsigned long step_size;
> +       unsigned long addr;
> +       unsigned long mapped_ram_size = 0;
> +       unsigned long new_mapped_ram_size;
> 
>         probe_page_size_mask();
> 
> -       /*
> -        * Find space for the kernel direct mapping tables.
> -        *
> -        * Later we should allocate these tables in the local node of the
> -        * memory mapped. Unfortunately this is done currently before the
> -        * nodes are discovered.
> -        */
>  #ifdef CONFIG_X86_64
>         end = max_pfn << PAGE_SHIFT;
> -       good_end = end;
>  #else
>         end = max_low_pfn << PAGE_SHIFT;
> -       good_end = max_pfn_mapped << PAGE_SHIFT;
>  #endif
> -       tables = calculate_all_table_space_size();
> -       find_early_table_space(0, good_end, tables);
> -       printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] prealloc\n",
> -               end - 1, pgt_buf_start << PAGE_SHIFT,
> -               (pgt_buf_top << PAGE_SHIFT) - 1);
> 
> -       max_pfn_mapped = 0; /* will get exact value next */
>         /* the ISA range is always mapped regardless of memory holes */
>         init_memory_mapping(0, ISA_END_ADDRESS);
> -       init_range_memory_mapping(ISA_END_ADDRESS, end);
> +
> +       /* xen has big range in reserved near end of ram, skip it at first */
> +       addr = memblock_find_in_range(ISA_END_ADDRESS, end, PMD_SIZE,
> +                        PAGE_SIZE);
> +       real_end = addr + PMD_SIZE;
> +
> +       /* step_size need to be small so pgt_buf from BRK could cover it */
> +       step_size = PMD_SIZE;
> +       max_pfn_mapped = 0; /* will get exact value next */
> +       min_pfn_mapped = real_end >> PAGE_SHIFT;
> +       last_start = start = real_end;
> +       while (last_start > ISA_END_ADDRESS) {
> +               if (last_start > step_size) {
> +                       start = round_down(last_start - 1, step_size);
> +                       if (start < ISA_END_ADDRESS)
> +                               start = ISA_END_ADDRESS;
> +               } else
> +                       start = ISA_END_ADDRESS;
> +               new_mapped_ram_size = init_range_memory_mapping(start,
> +                                                       last_start);
> +               last_start = start;
> +               min_pfn_mapped = last_start >> PAGE_SHIFT;
> +               /* only increase step_size after big range get mapped */
> +               if (new_mapped_ram_size > mapped_ram_size)
> +                       step_size <<= STEP_SIZE_SHIFT;
> +               mapped_ram_size += new_mapped_ram_size;
> +       }
> +
> +       if (real_end < end)
> +               init_range_memory_mapping(real_end, end);
> +
>  #ifdef CONFIG_X86_64
>         if (max_pfn > max_low_pfn) {
>                 /* can we preseve max_low_pfn ?*/
>                 max_low_pfn = max_pfn;
>         }
>  #endif
> -       /*
> -        * Reserve the kernel pagetable pages we used (pgt_buf_start -
> -        * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
> -        * so that they can be reused for other purposes.
> -        *
> -        * On native it just means calling memblock_reserve, on Xen it also
> -        * means marking RW the pagetable pages that we allocated before
> -        * but that haven't been used.
> -        *
> -        * In fact on xen we mark RO the whole range pgt_buf_start -
> -        * pgt_buf_top, because we have to make sure that when
> -        * init_memory_mapping reaches the pagetable pages area, it maps
> -        * RO all the pagetable pages, including the ones that are beyond
> -        * pgt_buf_end at that time.
> -        */
> -       if (pgt_buf_end > pgt_buf_start) {
> -               printk(KERN_DEBUG "kernel direct mapping tables up to %#lx @ [mem %#010lx-%#010lx] final\n",
> -                       end - 1, pgt_buf_start << PAGE_SHIFT,
> -                       (pgt_buf_end << PAGE_SHIFT) - 1);
> -               x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
> -                               PFN_PHYS(pgt_buf_end));
> -       }
> -
> -       /* stop the wrong using */
> -       pgt_buf_top = 0;
> -
>         early_memtest(0, max_pfn_mapped << PAGE_SHIFT);
>  }

You should say why we don't need to call pagetable_reserve anymore: is
it because alloc_low_page is now going to reserve each page that it
allocates?


> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
> index 27f7fc6..7bb1106 100644
> --- a/arch/x86/mm/init_32.c
> +++ b/arch/x86/mm/init_32.c
> @@ -61,11 +61,22 @@ bool __read_mostly __vmalloc_start_set = false;
> 
>  static __init void *alloc_low_page(void)
>  {
> -       unsigned long pfn = pgt_buf_end++;
> +       unsigned long pfn;
>         void *adr;
> 
> -       if (pfn >= pgt_buf_top)
> -               panic("alloc_low_page: ran out of memory");
> +       if ((pgt_buf_end + 1) >= pgt_buf_top) {
> +               unsigned long ret;
> +               if (min_pfn_mapped >= max_pfn_mapped)
> +                       panic("alloc_low_page: ran out of memory");
> +               ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
> +                                       max_pfn_mapped << PAGE_SHIFT,
> +                                       PAGE_SIZE, PAGE_SIZE);
> +               if (!ret)
> +                       panic("alloc_low_page: can not alloc memory");
> +               memblock_reserve(ret, PAGE_SIZE);
> +               pfn = ret >> PAGE_SHIFT;
> +       } else
> +               pfn = pgt_buf_end++;
> 
>         adr = __va(pfn * PAGE_SIZE);
>         clear_page(adr);
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index fa28e3e..eefaea6 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -316,7 +316,7 @@ void __init cleanup_highmap(void)
> 
>  static __ref void *alloc_low_page(unsigned long *phys)
>  {
> -       unsigned long pfn = pgt_buf_end++;
> +       unsigned long pfn;
>         void *adr;
> 
>         if (after_bootmem) {
> @@ -326,8 +326,19 @@ static __ref void *alloc_low_page(unsigned long *phys)
>                 return adr;
>         }
> 
> -       if (pfn >= pgt_buf_top)
> -               panic("alloc_low_page: ran out of memory");
> +       if ((pgt_buf_end + 1) >= pgt_buf_top) {
> +               unsigned long ret;
> +               if (min_pfn_mapped >= max_pfn_mapped)
> +                       panic("alloc_low_page: ran out of memory");
> +               ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
> +                                       max_pfn_mapped << PAGE_SHIFT,
> +                                       PAGE_SIZE, PAGE_SIZE);
> +               if (!ret)
> +                       panic("alloc_low_page: can not alloc memory");
> +               memblock_reserve(ret, PAGE_SIZE);
> +               pfn = ret >> PAGE_SHIFT;
> +       } else
> +               pfn = pgt_buf_end++;
> 
>         adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
>         clear_page(adr);
> --
> 1.7.7
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ