[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <20120630052404.GH3975@localhost.localdomain>
Date: Sat, 30 Jun 2012 01:24:05 -0400
From: Konrad Rzeszutek Wilk <konrad@...nok.org>
To: Andrea Arcangeli <aarcange@...hat.com>
Cc: linux-kernel@...r.kernel.org, linux-mm@...ck.org,
Hillf Danton <dhillf@...il.com>, Dan Smith <danms@...ibm.com>,
Peter Zijlstra <a.p.zijlstra@...llo.nl>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Andrew Morton <akpm@...ux-foundation.org>,
Thomas Gleixner <tglx@...utronix.de>,
Ingo Molnar <mingo@...e.hu>, Paul Turner <pjt@...gle.com>,
Suresh Siddha <suresh.b.siddha@...el.com>,
Mike Galbraith <efault@....de>,
"Paul E. McKenney" <paulmck@...ux.vnet.ibm.com>,
Lai Jiangshan <laijs@...fujitsu.com>,
Bharata B Rao <bharata.rao@...il.com>,
Lee Schermerhorn <Lee.Schermerhorn@...com>,
Rik van Riel <riel@...hat.com>,
Johannes Weiner <hannes@...xchg.org>,
Srivatsa Vaddagiri <vatsa@...ux.vnet.ibm.com>,
Christoph Lameter <cl@...ux.com>,
Alex Shi <alex.shi@...el.com>,
Mauricio Faria de Oliveira <mauricfo@...ux.vnet.ibm.com>,
Konrad Rzeszutek Wilk <konrad.wilk@...cle.com>,
Don Morris <don.morris@...com>,
Benjamin Herrenschmidt <benh@...nel.crashing.org>
Subject: Re: [PATCH 36/40] autonuma: page_autonuma
On Thu, Jun 28, 2012 at 02:56:16PM +0200, Andrea Arcangeli wrote:
> Move the AutoNUMA per page information from the "struct page" to a
> separate page_autonuma data structure allocated in the memsection
> (with sparsemem) or in the pgdat (with flatmem).
>
> This is done to avoid growing the size of the "struct page" and the
> page_autonuma data is only allocated if the kernel has been booted on
> real NUMA hardware (and noautonuma has not been passed as a parameter
> to the kernel).
>
> Signed-off-by: Andrea Arcangeli <aarcange@...hat.com>
> ---
> include/linux/autonuma.h | 18 +++-
> include/linux/autonuma_flags.h | 6 +
> include/linux/autonuma_types.h | 55 ++++++++++
> include/linux/mm_types.h | 26 -----
> include/linux/mmzone.h | 14 +++-
> include/linux/page_autonuma.h | 53 +++++++++
> init/main.c | 2 +
> mm/Makefile | 2 +-
> mm/autonuma.c | 98 ++++++++++-------
> mm/huge_memory.c | 26 +++--
> mm/page_alloc.c | 21 +---
> mm/page_autonuma.c | 234 ++++++++++++++++++++++++++++++++++++++++
> mm/sparse.c | 126 ++++++++++++++++++++-
> 13 files changed, 577 insertions(+), 104 deletions(-)
> create mode 100644 include/linux/page_autonuma.h
> create mode 100644 mm/page_autonuma.c
>
> diff --git a/include/linux/autonuma.h b/include/linux/autonuma.h
> index 85ca5eb..67af86a 100644
> --- a/include/linux/autonuma.h
> +++ b/include/linux/autonuma.h
> @@ -7,15 +7,26 @@
>
> extern void autonuma_enter(struct mm_struct *mm);
> extern void autonuma_exit(struct mm_struct *mm);
> -extern void __autonuma_migrate_page_remove(struct page *page);
> +extern void __autonuma_migrate_page_remove(struct page *,
> + struct page_autonuma *);
> extern void autonuma_migrate_split_huge_page(struct page *page,
> struct page *page_tail);
> extern void autonuma_setup_new_exec(struct task_struct *p);
> +extern struct page_autonuma *lookup_page_autonuma(struct page *page);
>
> static inline void autonuma_migrate_page_remove(struct page *page)
> {
> - if (ACCESS_ONCE(page->autonuma_migrate_nid) >= 0)
> - __autonuma_migrate_page_remove(page);
> + struct page_autonuma *page_autonuma = lookup_page_autonuma(page);
> + if (ACCESS_ONCE(page_autonuma->autonuma_migrate_nid) >= 0)
> + __autonuma_migrate_page_remove(page, page_autonuma);
> +}
> +
> +static inline void autonuma_free_page(struct page *page)
> +{
> + if (!autonuma_impossible()) {
I think you would be better off using a different name.
Perhaps 'if (autonuma_on())'?
> + autonuma_migrate_page_remove(page);
> + lookup_page_autonuma(page)->autonuma_last_nid = -1;
> + }
> }
>
> #define autonuma_printk(format, args...) \
> @@ -29,6 +40,7 @@ static inline void autonuma_migrate_page_remove(struct page *page) {}
> static inline void autonuma_migrate_split_huge_page(struct page *page,
> struct page *page_tail) {}
> static inline void autonuma_setup_new_exec(struct task_struct *p) {}
> +static inline void autonuma_free_page(struct page *page) {}
>
> #endif /* CONFIG_AUTONUMA */
>
> diff --git a/include/linux/autonuma_flags.h b/include/linux/autonuma_flags.h
> index 5e29a75..035d993 100644
> --- a/include/linux/autonuma_flags.h
> +++ b/include/linux/autonuma_flags.h
> @@ -15,6 +15,12 @@ enum autonuma_flag {
>
> extern unsigned long autonuma_flags;
>
> +static inline bool autonuma_impossible(void)
> +{
> + return num_possible_nodes() <= 1 ||
> + test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags);
> +}
> +
> static inline bool autonuma_enabled(void)
> {
> return !!test_bit(AUTONUMA_FLAG, &autonuma_flags);
> diff --git a/include/linux/autonuma_types.h b/include/linux/autonuma_types.h
> index 9e697e3..1e860f6 100644
> --- a/include/linux/autonuma_types.h
> +++ b/include/linux/autonuma_types.h
> @@ -39,6 +39,61 @@ struct task_autonuma {
> unsigned long task_numa_fault[0];
> };
>
> +/*
> + * Per page (or per-pageblock) structure dynamically allocated only if
> + * autonuma is not impossible.
not impossible? So possible?
> + */
> +struct page_autonuma {
> + /*
> + * To modify autonuma_last_nid locklessly, the architecture
> + * needs SMP atomic granularity < sizeof(long). Not all archs
> + * have that, notably some ancient alpha (but none of those
> + * should run in NUMA systems). Archs without that require
> + * autonuma_last_nid to be a long.
> + */
> +#if BITS_PER_LONG > 32
> + /*
> + * autonuma_migrate_nid is -1 if the page_autonuma structure
> + * is not linked into any
> + * pgdat->autonuma_migrate_head. Otherwise it means the
> + * page_autonuma structure is linked into the
> + * &NODE_DATA(autonuma_migrate_nid)->autonuma_migrate_head[page_nid].
> + * page_nid is the nid that the page (referenced by the
> + * page_autonuma structure) belongs to.
> + */
> + int autonuma_migrate_nid;
> + /*
> + * autonuma_last_nid records which is the NUMA nid that tried
> + * to access this page at the last NUMA hinting page fault.
> + * If it changed, AutoNUMA will not try to migrate the page to
> + * the nid where the thread is running on and to the contrary,
> + * it will make different threads thrashing on the same pages,
> + * converge on the same NUMA node (if possible).
> + */
> + int autonuma_last_nid;
> +#else
> +#if MAX_NUMNODES >= 32768
> +#error "too many nodes"
> +#endif
> + short autonuma_migrate_nid;
> + short autonuma_last_nid;
> +#endif
> + /*
> + * This is the list node that links the page (referenced by
> + * the page_autonuma structure) in the
> + * &NODE_DATA(dst_nid)->autonuma_migrate_head[page_nid] lru.
> + */
> + struct list_head autonuma_migrate_node;
> +
> + /*
> + * To find the page starting from the autonuma_migrate_node we
> + * need a backlink.
> + *
> + * FIXME: drop it;
> + */
> + struct page *page;
> +};
> +
> extern int alloc_task_autonuma(struct task_struct *tsk,
> struct task_struct *orig,
> int node);
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index d1248cf..f0c6379 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -136,32 +136,6 @@ struct page {
> struct page *first_page; /* Compound tail pages */
> };
>
> -#ifdef CONFIG_AUTONUMA
> - /*
> - * FIXME: move to pgdat section along with the memcg and allocate
> - * at runtime only in presence of a numa system.
> - */
> - /*
> - * To modify autonuma_last_nid lockless the architecture,
> - * needs SMP atomic granularity < sizeof(long), not all archs
> - * have that, notably some ancient alpha (but none of those
> - * should run in NUMA systems). Archs without that requires
> - * autonuma_last_nid to be a long.
> - */
> -#if BITS_PER_LONG > 32
> - int autonuma_migrate_nid;
> - int autonuma_last_nid;
> -#else
> -#if MAX_NUMNODES >= 32768
> -#error "too many nodes"
> -#endif
> - /* FIXME: remember to check the updates are atomic */
> - short autonuma_migrate_nid;
> - short autonuma_last_nid;
> -#endif
> - struct list_head autonuma_migrate_node;
> -#endif
> -
> /*
> * On machines where all RAM is mapped into kernel address space,
> * we can simply calculate the virtual address. On machines with
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index d53b26a..e66da74 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -698,10 +698,13 @@ typedef struct pglist_data {
> int kswapd_max_order;
> enum zone_type classzone_idx;
> #ifdef CONFIG_AUTONUMA
> - spinlock_t autonuma_lock;
> +#if !defined(CONFIG_SPARSEMEM)
> + struct page_autonuma *node_page_autonuma;
> +#endif
> struct list_head autonuma_migrate_head[MAX_NUMNODES];
> unsigned long autonuma_nr_migrate_pages;
> wait_queue_head_t autonuma_knuma_migrated_wait;
> + spinlock_t autonuma_lock;
> #endif
> } pg_data_t;
>
> @@ -1064,6 +1067,15 @@ struct mem_section {
> * section. (see memcontrol.h/page_cgroup.h about this.)
> */
> struct page_cgroup *page_cgroup;
> +#endif
> +#ifdef CONFIG_AUTONUMA
> + /*
> + * With SPARSEMEM, pgdat doesn't have a page_autonuma pointer; the
> + * per-section pointer below is used instead.
> + */
> + struct page_autonuma *section_page_autonuma;
> +#endif
> +#if defined(CONFIG_CGROUP_MEM_RES_CTLR) ^ defined(CONFIG_AUTONUMA)
> unsigned long pad;
> #endif
> };
> diff --git a/include/linux/page_autonuma.h b/include/linux/page_autonuma.h
> new file mode 100644
> index 0000000..d748aa2
> --- /dev/null
> +++ b/include/linux/page_autonuma.h
> @@ -0,0 +1,53 @@
> +#ifndef _LINUX_PAGE_AUTONUMA_H
> +#define _LINUX_PAGE_AUTONUMA_H
> +
> +#if defined(CONFIG_AUTONUMA) && !defined(CONFIG_SPARSEMEM)
> +extern void __init page_autonuma_init_flatmem(void);
> +#else
> +static inline void __init page_autonuma_init_flatmem(void) {}
> +#endif
> +
> +#ifdef CONFIG_AUTONUMA
> +
> +#include <linux/autonuma_flags.h>
> +
> +extern void __meminit page_autonuma_map_init(struct page *page,
> + struct page_autonuma *page_autonuma,
> + int nr_pages);
> +
> +#ifdef CONFIG_SPARSEMEM
> +#define PAGE_AUTONUMA_SIZE (sizeof(struct page_autonuma))
> +#define SECTION_PAGE_AUTONUMA_SIZE (PAGE_AUTONUMA_SIZE * \
> + PAGES_PER_SECTION)
> +#endif
> +
> +extern void __meminit pgdat_autonuma_init(struct pglist_data *);
> +
> +#else /* CONFIG_AUTONUMA */
> +
> +#ifdef CONFIG_SPARSEMEM
> +struct page_autonuma;
> +#define PAGE_AUTONUMA_SIZE 0
> +#define SECTION_PAGE_AUTONUMA_SIZE 0
> +
> +#define autonuma_impossible() true
> +
> +#endif
> +
> +static inline void pgdat_autonuma_init(struct pglist_data *pgdat) {}
> +
> +#endif /* CONFIG_AUTONUMA */
> +
> +#ifdef CONFIG_SPARSEMEM
> +extern struct page_autonuma * __meminit __kmalloc_section_page_autonuma(int nid,
> + unsigned long nr_pages);
> +extern void __kfree_section_page_autonuma(struct page_autonuma *page_autonuma,
> + unsigned long nr_pages);
> +extern void __init sparse_early_page_autonuma_alloc_node(struct page_autonuma **page_autonuma_map,
> + unsigned long pnum_begin,
> + unsigned long pnum_end,
> + unsigned long map_count,
> + int nodeid);
> +#endif
> +
> +#endif /* _LINUX_PAGE_AUTONUMA_H */
> diff --git a/init/main.c b/init/main.c
> index b5cc0a7..070a377 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -68,6 +68,7 @@
> #include <linux/shmem_fs.h>
> #include <linux/slab.h>
> #include <linux/perf_event.h>
> +#include <linux/page_autonuma.h>
>
> #include <asm/io.h>
> #include <asm/bugs.h>
> @@ -455,6 +456,7 @@ static void __init mm_init(void)
> * bigger than MAX_ORDER unless SPARSEMEM.
> */
> page_cgroup_init_flatmem();
> + page_autonuma_init_flatmem();
> mem_init();
> kmem_cache_init();
> percpu_init_late();
> diff --git a/mm/Makefile b/mm/Makefile
> index 15900fd..a4d8354 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -33,7 +33,7 @@ obj-$(CONFIG_FRONTSWAP) += frontswap.o
> obj-$(CONFIG_HAS_DMA) += dmapool.o
> obj-$(CONFIG_HUGETLBFS) += hugetlb.o
> obj-$(CONFIG_NUMA) += mempolicy.o
> -obj-$(CONFIG_AUTONUMA) += autonuma.o
> +obj-$(CONFIG_AUTONUMA) += autonuma.o page_autonuma.o
> obj-$(CONFIG_SPARSEMEM) += sparse.o
> obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
> obj-$(CONFIG_SLOB) += slob.o
> diff --git a/mm/autonuma.c b/mm/autonuma.c
> index f44272b..ec4d492 100644
> --- a/mm/autonuma.c
> +++ b/mm/autonuma.c
> @@ -51,12 +51,6 @@ static struct knumad_scan {
> .mm_head = LIST_HEAD_INIT(knumad_scan.mm_head),
> };
>
> -static inline bool autonuma_impossible(void)
> -{
> - return num_possible_nodes() <= 1 ||
> - test_bit(AUTONUMA_IMPOSSIBLE_FLAG, &autonuma_flags);
> -}
> -
> static inline void autonuma_migrate_lock(int nid)
> {
> spin_lock(&NODE_DATA(nid)->autonuma_lock);
> @@ -82,54 +76,63 @@ void autonuma_migrate_split_huge_page(struct page *page,
> struct page *page_tail)
> {
> int nid, last_nid;
> + struct page_autonuma *page_autonuma, *page_tail_autonuma;
>
> - nid = page->autonuma_migrate_nid;
> + if (autonuma_impossible())
Wouldn't it be better to just call it 'autonuma_off()'?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists