[PATCH] mm: alloc_page_mpol

Introduce alloc_page_mpol(), to get rid of those mpol pseudo-vmas from
shmem.c, which caused shmem_getpage() to show up in deep stack reports.

Not-yet-Signed-off-by: Hugh Dickins
---
 include/linux/gfp.h       |    6 +++
 include/linux/mempolicy.h |   10 ++++++
 include/linux/swap.h      |    9 ++---
 mm/memory.c               |    5 +--
 mm/mempolicy.c            |   75 ++++++++++++++++++++++++++--------------------
 mm/shmem.c                |   66 ++++------------------------------------
 mm/swap_state.c           |   10 +++---
 mm/swapfile.c             |    4 +-
 8 files changed, 80 insertions(+), 105 deletions(-)

--- 2.6.30-rc8/include/linux/gfp.h	2009-04-08 18:26:14.000000000 +0100
+++ linux/include/linux/gfp.h	2009-06-07 13:56:58.000000000 +0100
@@ -7,6 +7,7 @@
 #include <linux/topology.h>
 
 struct vm_area_struct;
+struct mempolicy;
 
 /*
  * GFP bitmasks..
@@ -213,10 +214,13 @@ alloc_pages(gfp_t gfp_mask, unsigned int
 }
 extern struct page *alloc_page_vma(gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr);
+extern struct page *alloc_page_mpol(gfp_t gfp_mask,
+			struct mempolicy *mpol, pgoff_t pgoff);
 #else
 #define alloc_pages(gfp_mask, order) \
 		alloc_pages_node(numa_node_id(), gfp_mask, order)
-#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0)
+#define alloc_page_vma(gfp_mask, vma, addr)	alloc_pages(gfp_mask, 0)
+#define alloc_page_mpol(gfp_mask, mpol, pgoff)	alloc_pages(gfp_mask, 0)
 #endif
 
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
--- 2.6.30-rc8/include/linux/mempolicy.h	2008-10-09 23:13:53.000000000 +0100
+++ linux/include/linux/mempolicy.h	2009-06-07 13:56:58.000000000 +0100
@@ -62,6 +62,7 @@ enum {
 #include <linux/pagemap.h>
 
 struct mm_struct;
+struct vm_area_struct;
 
 #ifdef CONFIG_NUMA
 
@@ -147,6 +148,9 @@ static inline struct mempolicy *mpol_dup
 	return pol;
 }
 
+extern struct mempolicy *get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr);
+
 #define vma_policy(vma) ((vma)->vm_policy)
 #define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol))
 
@@ -294,6 +298,12 @@ mpol_shared_policy_lookup(struct shared_
 {
 	return NULL;
 }
+
+static inline struct mempolicy *get_vma_policy(struct task_struct *task,
+		struct vm_area_struct *vma, unsigned long addr)
+{
+	return NULL;
+}
 
 #define vma_policy(vma) NULL
 #define vma_set_policy(vma, pol) do {} while(0)
--- 2.6.30-rc8/include/linux/swap.h	2009-06-03 10:13:27.000000000 +0100
+++ linux/include/linux/swap.h	2009-06-07 13:56:58.000000000 +0100
@@ -12,9 +12,8 @@
 #include <asm/atomic.h>
 #include <asm/page.h>
 
-struct notifier_block;
-
 struct bio;
+struct mempolicy;
 
 #define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
 #define SWAP_FLAG_PRIO_MASK	0x7fff
@@ -290,9 +289,9 @@ extern void free_page_and_swap_cache(str
 extern void free_pages_and_swap_cache(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t);
 extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
-			struct vm_area_struct *vma, unsigned long addr);
+			struct mempolicy *mpol, pgoff_t pgoff);
 extern struct page *swapin_readahead(swp_entry_t, gfp_t,
-			struct vm_area_struct *vma, unsigned long addr);
+			struct mempolicy *mpol, pgoff_t pgoff);
 
 /* linux/mm/swapfile.c */
 extern long nr_swap_pages;
@@ -377,7 +376,7 @@ static inline void swap_free(swp_entry_t
 }
 
 static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
-			struct vm_area_struct *vma, unsigned long addr)
+			struct mempolicy *mpol, pgoff_t pgoff)
 {
 	return NULL;
 }
--- 2.6.30-rc8/mm/memory.c	2009-05-09 09:06:44.000000000 +0100
+++ linux/mm/memory.c	2009-06-07 13:56:58.000000000 +0100
@@ -2467,8 +2467,9 @@ static int do_swap_page(struct mm_struct
 	page = lookup_swap_cache(entry);
 	if (!page) {
 		grab_swap_token(); /* Contend for token _before_ read-in */
-		page = swapin_readahead(entry,
-					GFP_HIGHUSER_MOVABLE, vma, address);
+		page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+				get_vma_policy(current, vma, address),
+				linear_page_index(vma, address));
 		if (!page) {
 			/*
 			 * Back out if somebody else faulted in this pte
--- 2.6.30-rc8/mm/mempolicy.c	2009-03-23 23:12:14.000000000 +0000
+++ linux/mm/mempolicy.c	2009-06-07 13:56:58.000000000 +0100
@@ -1304,7 +1304,7 @@ asmlinkage long compat_sys_mbind(compat_
  * freeing by another task.  It is the caller's responsibility to free the
  * extra reference for shared policies.
  */
-static struct mempolicy *get_vma_policy(struct task_struct *task,
+struct mempolicy *get_vma_policy(struct task_struct *task,
 		struct vm_area_struct *vma, unsigned long addr)
 {
 	struct mempolicy *pol = task->mempolicy;
@@ -1425,9 +1425,8 @@ unsigned slab_node(struct mempolicy *pol
 	}
 }
 
-/* Do static interleaving for a VMA with known offset. */
-static unsigned offset_il_node(struct mempolicy *pol,
-		struct vm_area_struct *vma, unsigned long off)
+/* Determine a node number for interleave */
+static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t pgoff)
 {
 	unsigned nnodes = nodes_weight(pol->v.nodes);
 	unsigned target;
@@ -1436,7 +1435,7 @@ static unsigned offset_il_node(struct me
 
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)off % nnodes;
+	target = (unsigned int)pgoff % nnodes;
 	c = 0;
 	do {
 		nid = next_node(nid, pol->v.nodes);
@@ -1445,28 +1444,6 @@ static unsigned offset_il_node(struct me
 	return nid;
 }
 
-/* Determine a node number for interleave */
-static inline unsigned interleave_nid(struct mempolicy *pol,
-		 struct vm_area_struct *vma, unsigned long addr, int shift)
-{
-	if (vma) {
-		unsigned long off;
-
-		/*
-		 * for small pages, there is no difference between
-		 * shift and PAGE_SHIFT, so the bit-shift is safe.
-		 * for huge pages, since vm_pgoff is in units of small
-		 * pages, we need to shift off the always 0 bits to get
-		 * a useful offset.
-		 */
-		BUG_ON(shift < PAGE_SHIFT);
-		off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
-		off += (addr - vma->vm_start) >> shift;
-		return offset_il_node(pol, vma, off);
-	} else
-		return interleave_nodes(pol);
-}
-
 #ifdef CONFIG_HUGETLBFS
 /*
  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
@@ -1491,8 +1468,9 @@ struct zonelist *huge_zonelist(struct vm
 	*nodemask = NULL;	/* assume !MPOL_BIND */
 
 	if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
-		zl = node_zonelist(interleave_nid(*mpol, vma, addr,
-				huge_page_shift(hstate_vma(vma))), gfp_flags);
+		pgoff_t pgoff = linear_page_index(vma, addr);
+		pgoff >>= huge_page_shift(hstate_vma(vma));
+		zl = node_zonelist(interleave_nid(*mpol, pgoff), gfp_flags);
 	} else {
 		zl = policy_zonelist(gfp_flags, *mpol);
 		if ((*mpol)->mode == MPOL_BIND)
@@ -1550,7 +1528,40 @@ alloc_page_vma(gfp_t gfp, struct vm_area
 	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
 		unsigned nid;
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
+		if (vma)
+			nid = interleave_nid(pol, linear_page_index(vma, addr));
+		else
+			nid = interleave_nodes(pol);
+		mpol_cond_put(pol);
+		return alloc_page_interleave(gfp, 0, nid);
+	}
+	zl = policy_zonelist(gfp, pol);
+	if (unlikely(mpol_needs_cond_ref(pol))) {
+		/*
+		 * slow path: ref counted shared policy
+		 */
+		struct page *page = __alloc_pages_nodemask(gfp, 0,
+					zl, policy_nodemask(gfp, pol));
+		__mpol_put(pol);
+		return page;
+	}
+	/*
+	 * fast path: default or task policy
+	 */
+	return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+}
+
+struct page *
+alloc_page_mpol(gfp_t gfp, struct mempolicy *pol, pgoff_t pgoff)
+{
+	struct zonelist *zl;
+
+	cpuset_update_task_memory_state();
+
+	if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+		unsigned int nid;
+
+		nid = interleave_nid(pol, pgoff);
 		mpol_cond_put(pol);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
@@ -1757,11 +1768,11 @@ static void sp_insert(struct shared_poli
 struct mempolicy *
 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
 {
-	struct mempolicy *pol = NULL;
+	struct mempolicy *pol = &default_policy;
 	struct sp_node *sn;
 
 	if (!sp->root.rb_node)
-		return NULL;
+		return pol;
 	spin_lock(&sp->lock);
 	sn = sp_lookup(sp, idx, idx+1);
 	if (sn) {
--- 2.6.30-rc8/mm/shmem.c	2009-05-09 09:06:44.000000000 +0100
+++ linux/mm/shmem.c	2009-06-07 13:56:58.000000000 +0100
@@ -1106,8 +1106,7 @@ redirty:
 	return 0;
 }
 
-#ifdef CONFIG_NUMA
-#ifdef CONFIG_TMPFS
+#if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS)
 static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
 {
 	char buffer[64];
@@ -1131,64 +1130,11 @@ static struct mempolicy *shmem_get_sbmpo
 	}
 	return mpol;
 }
-#endif /* CONFIG_TMPFS */
-
-static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
-			struct shmem_inode_info *info, unsigned long idx)
-{
-	struct mempolicy mpol, *spol;
-	struct vm_area_struct pvma;
-	struct page *page;
-
-	spol = mpol_cond_copy(&mpol,
-				mpol_shared_policy_lookup(&info->policy, idx));
-
-	/* Create a pseudo vma that just contains the policy */
-	pvma.vm_start = 0;
-	pvma.vm_pgoff = idx;
-	pvma.vm_ops = NULL;
-	pvma.vm_policy = spol;
-	page = swapin_readahead(entry, gfp, &pvma, 0);
-	return page;
-}
-
-static struct page *shmem_alloc_page(gfp_t gfp,
-			struct shmem_inode_info *info, unsigned long idx)
-{
-	struct vm_area_struct pvma;
-
-	/* Create a pseudo vma that just contains the policy */
-	pvma.vm_start = 0;
-	pvma.vm_pgoff = idx;
-	pvma.vm_ops = NULL;
-	pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
-
-	/*
-	 * alloc_page_vma() will drop the shared policy reference
-	 */
-	return alloc_page_vma(gfp, &pvma, 0);
-}
-#else /* !CONFIG_NUMA */
-#ifdef CONFIG_TMPFS
+#else
 static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
 {
 }
-#endif /* CONFIG_TMPFS */
-
-static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
-			struct shmem_inode_info *info, unsigned long idx)
-{
-	return swapin_readahead(entry, gfp, NULL, 0);
-}
-
-static inline struct page *shmem_alloc_page(gfp_t gfp,
-			struct shmem_inode_info *info, unsigned long idx)
-{
-	return alloc_page(gfp);
-}
-#endif /* CONFIG_NUMA */
 
-#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
 static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
 {
 	return NULL;
@@ -1268,7 +1214,9 @@ repeat:
 				*type |= VM_FAULT_MAJOR;
 			}
 			spin_unlock(&info->lock);
-			swappage = shmem_swapin(swap, gfp, info, idx);
+			swappage = swapin_readahead(swap, gfp,
+				mpol_shared_policy_lookup(&info->policy, idx),
+				idx);
 			if (!swappage) {
 				spin_lock(&info->lock);
 				entry = shmem_swp_alloc(info, idx, sgp);
@@ -1395,7 +1343,9 @@ repeat:
 		int ret;
 
 		spin_unlock(&info->lock);
-		filepage = shmem_alloc_page(gfp, info, idx);
+		filepage = alloc_page_mpol(gfp,
+			mpol_shared_policy_lookup(&info->policy, idx),
+			idx);
 		if (!filepage) {
 			shmem_unacct_blocks(info->flags, 1);
 			shmem_free_blocks(inode, 1);
--- 2.6.30-rc8/mm/swap_state.c	2009-06-03 10:13:27.000000000 +0100
+++ linux/mm/swap_state.c	2009-06-07 13:56:58.000000000 +0100
@@ -266,7 +266,7 @@ struct page * lookup_swap_cache(swp_entr
  * the swap entry is no longer in use.
  */
 struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
-			struct vm_area_struct *vma, unsigned long addr)
+			struct mempolicy *mpol, pgoff_t pgoff)
 {
 	struct page *found_page, *new_page = NULL;
 	int err;
@@ -285,7 +285,7 @@ struct page *read_swap_cache_async(swp_e
 		 * Get a new page to read into from swap.
 		 */
 		if (!new_page) {
-			new_page = alloc_page_vma(gfp_mask, vma, addr);
+			new_page = alloc_page_mpol(gfp_mask, mpol, pgoff);
 			if (!new_page)
 				break;		/* Out of memory */
 		}
@@ -345,7 +345,7 @@ struct page *read_swap_cache_async(swp_e
  * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
 struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
-			struct vm_area_struct *vma, unsigned long addr)
+			struct mempolicy *mpol, pgoff_t pgoff)
 {
 	int nr_pages;
 	struct page *page;
@@ -363,11 +363,11 @@ struct page *swapin_readahead(swp_entry_
 	for (end_offset = offset + nr_pages; offset < end_offset; offset++) {
 		/* Ok, do the async read-ahead now */
 		page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
-						gfp_mask, vma, addr);
+						gfp_mask, mpol, pgoff);
 		if (!page)
 			break;
 		page_cache_release(page);
 	}
 	lru_add_drain();	/* Push any new pages onto the LRU now */
-	return read_swap_cache_async(entry, gfp_mask, vma, addr);
+	return read_swap_cache_async(entry, gfp_mask, mpol, pgoff);
 }
--- 2.6.30-rc8/mm/swapfile.c	2009-03-23 23:12:14.000000000 +0000
+++ linux/mm/swapfile.c	2009-06-07 13:56:58.000000000 +0100
@@ -951,8 +951,8 @@ static int try_to_unuse(unsigned int typ
 		 */
 		swap_map = &si->swap_map[i];
 		entry = swp_entry(type, i);
-		page = read_swap_cache_async(entry,
-					GFP_HIGHUSER_MOVABLE, NULL, 0);
+		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
+				get_vma_policy(current, NULL, 0), 0);
 		if (!page) {
 			/*
 			 * Either swap_duplicate() failed because entry