[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <alpine.DEB.2.00.1211071003150.27451@chino.kir.corp.google.com>
Date: Wed, 7 Nov 2012 10:07:14 -0800 (PST)
From: David Rientjes <rientjes@...gle.com>
To: Sha Zhengju <handai.szj@...il.com>
cc: linux-mm@...ck.org, cgroups@...r.kernel.org, mhocko@...e.cz,
kamezawa.hiroyu@...fujitsu.com, akpm@...ux-foundation.org,
linux-kernel@...r.kernel.org, Sha Zhengju <handai.szj@...bao.com>
Subject: Re: [PATCH 2/2] oom: rework dump_tasks to optimize memcg-oom
situation
On Wed, 7 Nov 2012, Sha Zhengju wrote:
> From: Sha Zhengju <handai.szj@...bao.com>
>
> If memcg oom happening, don't scan all system tasks to dump memory state of
> eligible tasks, instead we iterates only over the process attached to the oom
> memcg and avoid the rcu lock.
>
Avoiding the rcu lock isn't actually that impressive here, the cgroup
iterator will use it's own lock for that memcg.
> Signed-off-by: Sha Zhengju <handai.szj@...bao.com>
> Cc: Michal Hocko <mhocko@...e.cz>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@...fujitsu.com>
> Cc: David Rientjes <rientjes@...gle.com>
> Cc: Andrew Morton <akpm@...ux-foundation.org>
> ---
> include/linux/memcontrol.h | 7 +++++
> include/linux/oom.h | 2 +
> mm/memcontrol.c | 14 +++++++++++
> mm/oom_kill.c | 55 ++++++++++++++++++++++++++-----------------
> 4 files changed, 56 insertions(+), 22 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index c91e3c1..4322ca8 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -122,6 +122,8 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
> void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
> extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
> struct task_struct *p);
> +extern void dump_tasks_memcg(const struct mem_cgroup *memcg,
> + const nodemask_t *nodemask);
Shouldn't need the nodemask parameter, just have dump_tasks_memcg() pass
NULL to dump_per_task(), we won't be isolating to tasks with mempolicies
restricted to a particular set of nodes since we're in the memcg oom path
here, not the global page allocator oom path.
> extern void mem_cgroup_replace_page_cache(struct page *oldpage,
> struct page *newpage);
>
> @@ -337,6 +339,11 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
> {
> }
>
> +static inline void
> +dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +{
> +}
> +
> static inline void mem_cgroup_begin_update_page_stat(struct page *page,
> bool *locked, unsigned long *flags)
> {
> diff --git a/include/linux/oom.h b/include/linux/oom.h
> index 20b5c46..9ba3344 100644
> --- a/include/linux/oom.h
> +++ b/include/linux/oom.h
> @@ -57,6 +57,8 @@ extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
> unsigned long totalpages, const nodemask_t *nodemask,
> bool force_kill);
>
> +extern inline void dump_per_task(struct task_struct *p,
> + const nodemask_t *nodemask);
This is a global symbol, so dump_per_task() doesn't make a lot of sense:
it would need to be prefixed with "oom_" so perhaps oom_dump_task() is
better?
> extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
> int order, nodemask_t *mask, bool force_kill);
> extern int register_oom_notifier(struct notifier_block *nb);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2df5e72..fe648f8 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -1665,6 +1665,20 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
> return min(limit, memsw);
> }
>
> +void dump_tasks_memcg(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
> +{
> + struct cgroup_iter it;
> + struct task_struct *task;
> + struct cgroup *cgroup = memcg->css.cgroup;
> +
> + cgroup_iter_start(cgroup, &it);
> + while ((task = cgroup_iter_next(cgroup, &it))) {
> + dump_per_task(task, nodemask);
> + }
> +
> + cgroup_iter_end(cgroup, &it);
> +}
> +
> static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
> int order)
> {
> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 4b8a6dd..aaf6237 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -367,6 +367,32 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
> return chosen;
> }
>
> +inline void dump_per_task(struct task_struct *p, const nodemask_t *nodemask)
No inline.
> +{
> + struct task_struct *task;
> +
> + if (oom_unkillable_task(p, NULL, nodemask))
> + return;
> +
> + task = find_lock_task_mm(p);
> + if (!task) {
> + /*
> + * This is a kthread or all of p's threads have already
> + * detached their mm's. There's no need to report
> + * them; they can't be oom killed anyway.
> + */
> + return;
> + }
> +
> + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
> + task->pid, from_kuid(&init_user_ns, task_uid(task)),
> + task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> + task->mm->nr_ptes,
> + get_mm_counter(task->mm, MM_SWAPENTS),
> + task->signal->oom_score_adj, task->comm);
> + task_unlock(task);
> +}
> +
> /**
> * dump_tasks - dump current memory state of all system tasks
> * @memcg: current's memory controller, if constrained
> @@ -381,32 +407,17 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
> static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask)
> {
> struct task_struct *p;
> - struct task_struct *task;
>
> pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
> - rcu_read_lock();
> - for_each_process(p) {
> - if (oom_unkillable_task(p, memcg, nodemask))
> - continue;
> -
> - task = find_lock_task_mm(p);
> - if (!task) {
> - /*
> - * This is a kthread or all of p's threads have already
> - * detached their mm's. There's no need to report
> - * them; they can't be oom killed anyway.
> - */
> - continue;
> - }
>
> - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n",
> - task->pid, from_kuid(&init_user_ns, task_uid(task)),
> - task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
> - task->mm->nr_ptes,
> - get_mm_counter(task->mm, MM_SWAPENTS),
> - task->signal->oom_score_adj, task->comm);
> - task_unlock(task);
> + if (memcg) {
> + dump_tasks_memcg(memcg, nodemask);
> + return;
> }
> +
> + rcu_read_lock();
> + for_each_process(p)
> + dump_per_task(p, nodemask);
> rcu_read_unlock();
> }
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists