Message-ID: <20141008140327.GE4592@dhcp22.suse.cz>
Date: Wed, 8 Oct 2014 16:03:27 +0200
From: Michal Hocko <mhocko@...e.cz>
To: Johannes Weiner <hannes@...xchg.org>
Cc: linux-mm@...ck.org, Vladimir Davydov <vdavydov@...allels.com>,
Greg Thelen <gthelen@...gle.com>, Tejun Heo <tj@...nel.org>,
cgroups@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: Re: [patch 3/3] mm: memcontrol: continue cache reclaim from offlined
groups
On Sat 20-09-14 16:00:35, Johannes Weiner wrote:
> On cgroup deletion, outstanding page cache charges are moved to the
> parent group so that they're not lost and can be reclaimed during
> pressure on/inside said parent. But this reparenting is fairly tricky
> and its synchronous nature has led to several lock-ups in the past.
>
> Since css iterators now also include offlined css, memcg iterators can
This has been the case since 3.16, and the closest match seems to be
c2931b70a32c ("cgroup: iterate cgroup_subsys_states directly"), right?
> be changed to include offlined children during reclaim of a group, and
> leftover cache can just stay put.
>
> There is a slight change of behavior in that charges of deleted groups
> no longer show up as local charges in the parent. But they are still
> included in the parent's hierarchical statistics.
This also addresses http://marc.info/?l=linux-mm&m=140027762114865&w=2
once force_empty is gone.
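
To spell out the mechanical core for the archives: reclaim now pins
children with css_tryget() rather than css_tryget_online(), so groups
that are offline but not yet freed stay visible to the iterator and
their leftover cache keeps getting scanned. A minimal sketch of the
resulting walk (simplified from mem_cgroup_iter(); the helper name is
made up for illustration and error paths are left out):

	/* Visit the next child under @root, offline csses included. */
	static struct mem_cgroup *reclaim_next(struct mem_cgroup *root,
					       struct cgroup_subsys_state *pos)
	{
		struct cgroup_subsys_state *css = pos;
		struct mem_cgroup *memcg = NULL;

		rcu_read_lock();
		while ((css = css_next_descendant_pre(css, &root->css))) {
			/*
			 * css_tryget() succeeds for offline csses as
			 * well, unlike css_tryget_online(), so deleted
			 * children with leftover page cache are still
			 * visited during reclaim.
			 */
			if (css == &root->css || css_tryget(css)) {
				memcg = mem_cgroup_from_css(css);
				break;
			}
		}
		rcu_read_unlock();
		return memcg;
	}
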
> Signed-off-by: Johannes Weiner <hannes@...xchg.org>
I would prefer it if the drain_stock code cleanup were in a separate
patch, to make this one smaller and easier to review.
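
For reference, this is roughly what the consolidated helper looks like
after the patch, reconstructed from the hunks below; the per-cpu
scheduling loop sits in context the diff does not show and is
unchanged, so it is elided here:

	static void drain_all_stock(struct mem_cgroup *root_memcg)
	{
		int cpu, curcpu;

		/* Asynchronous only now: back off rather than pile up work. */
		if (!mutex_trylock(&percpu_charge_mutex))
			return;
		/* Notify other cpus that system-wide "drain" is running */
		get_online_cpus();
		curcpu = get_cpu();
		/*
		 * ... for_each_online_cpu(cpu): schedule drain work on
		 * every cpu that caches charges for root_memcg's
		 * subtree (elided, unchanged by this patch) ...
		 */
		put_cpu();
		put_online_cpus();
		mutex_unlock(&percpu_charge_mutex);
	}
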
Anyway
Acked-by: Michal Hocko <mhocko@...e.cz>
> ---
> mm/memcontrol.c | 260 ++------------------------------------------------------
> 1 file changed, 5 insertions(+), 255 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 019a44ac25d6..48531433a2fc 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -736,8 +736,6 @@ static void disarm_static_keys(struct mem_cgroup *memcg)
> disarm_kmem_keys(memcg);
> }
>
> -static void drain_all_stock_async(struct mem_cgroup *memcg);
> -
> static struct mem_cgroup_per_zone *
> mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
> {
> @@ -1208,7 +1206,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
> goto out_unlock;
> continue;
> }
> - if (css == &root->css || css_tryget_online(css)) {
> + if (css == &root->css || css_tryget(css)) {
> memcg = mem_cgroup_from_css(css);
> break;
> }
> @@ -2349,10 +2347,12 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> * of the hierarchy under it. sync flag says whether we should block
> * until the work is done.
> */
> -static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
> +static void drain_all_stock(struct mem_cgroup *root_memcg)
> {
> int cpu, curcpu;
>
> + if (!mutex_trylock(&percpu_charge_mutex))
> + return;
> /* Notify other cpus that system-wide "drain" is running */
> get_online_cpus();
> curcpu = get_cpu();
> @@ -2373,41 +2373,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync)
> }
> }
> put_cpu();
> -
> - if (!sync)
> - goto out;
> -
> - for_each_online_cpu(cpu) {
> - struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
> - if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags))
> - flush_work(&stock->work);
> - }
> -out:
> put_online_cpus();
> -}
> -
> -/*
> - * Tries to drain stocked charges in other cpus. This function is asynchronous
> - * and just put a work per cpu for draining localy on each cpu. Caller can
> - * expects some charges will be back later but cannot wait for it.
> - */
> -static void drain_all_stock_async(struct mem_cgroup *root_memcg)
> -{
> - /*
> - * If someone calls draining, avoid adding more kworker runs.
> - */
> - if (!mutex_trylock(&percpu_charge_mutex))
> - return;
> - drain_all_stock(root_memcg, false);
> - mutex_unlock(&percpu_charge_mutex);
> -}
> -
> -/* This is a synchronous drain interface. */
> -static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
> -{
> - /* called when force_empty is called */
> - mutex_lock(&percpu_charge_mutex);
> - drain_all_stock(root_memcg, true);
> mutex_unlock(&percpu_charge_mutex);
> }
>
> @@ -2515,7 +2481,7 @@ retry:
> goto retry;
>
> if (!drained) {
> - drain_all_stock_async(mem_over_limit);
> + drain_all_stock(mem_over_limit);
> drained = true;
> goto retry;
> }
> @@ -3366,79 +3332,6 @@ out:
> return ret;
> }
>
> -/**
> - * mem_cgroup_move_parent - moves page to the parent group
> - * @page: the page to move
> - * @pc: page_cgroup of the page
> - * @child: page's cgroup
> - *
> - * move charges to its parent or the root cgroup if the group has no
> - * parent (aka use_hierarchy==0).
> - * Although this might fail (get_page_unless_zero, isolate_lru_page or
> - * mem_cgroup_move_account fails) the failure is always temporary and
> - * it signals a race with a page removal/uncharge or migration. In the
> - * first case the page is on the way out and it will vanish from the LRU
> - * on the next attempt and the call should be retried later.
> - * Isolation from the LRU fails only if page has been isolated from
> - * the LRU since we looked at it and that usually means either global
> - * reclaim or migration going on. The page will either get back to the
> - * LRU or vanish.
> - * Finaly mem_cgroup_move_account fails only if the page got uncharged
> - * (!PageCgroupUsed) or moved to a different group. The page will
> - * disappear in the next attempt.
> - */
> -static int mem_cgroup_move_parent(struct page *page,
> - struct page_cgroup *pc,
> - struct mem_cgroup *child)
> -{
> - struct mem_cgroup *parent;
> - unsigned int nr_pages;
> - unsigned long uninitialized_var(flags);
> - int ret;
> -
> - VM_BUG_ON(mem_cgroup_is_root(child));
> -
> - ret = -EBUSY;
> - if (!get_page_unless_zero(page))
> - goto out;
> - if (isolate_lru_page(page))
> - goto put;
> -
> - nr_pages = hpage_nr_pages(page);
> -
> - parent = parent_mem_cgroup(child);
> - /*
> - * If no parent, move charges to root cgroup.
> - */
> - if (!parent)
> - parent = root_mem_cgroup;
> -
> - if (nr_pages > 1) {
> - VM_BUG_ON_PAGE(!PageTransHuge(page), page);
> - flags = compound_lock_irqsave(page);
> - }
> -
> - ret = mem_cgroup_move_account(page, nr_pages,
> - pc, child, parent);
> - if (!ret) {
> - if (!mem_cgroup_is_root(parent))
> - css_get_many(&parent->css, nr_pages);
> - /* Take charge off the local counters */
> - page_counter_cancel(&child->memory, nr_pages);
> - if (do_swap_account)
> - page_counter_cancel(&child->memsw, nr_pages);
> - css_put_many(&child->css, nr_pages);
> - }
> -
> - if (nr_pages > 1)
> - compound_unlock_irqrestore(page, flags);
> - putback_lru_page(page);
> -put:
> - put_page(page);
> -out:
> - return ret;
> -}
> -
> #ifdef CONFIG_MEMCG_SWAP
> static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
> bool charge)
> @@ -3732,105 +3625,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> return nr_reclaimed;
> }
>
> -/**
> - * mem_cgroup_force_empty_list - clears LRU of a group
> - * @memcg: group to clear
> - * @node: NUMA node
> - * @zid: zone id
> - * @lru: lru to to clear
> - *
> - * Traverse a specified page_cgroup list and try to drop them all. This doesn't
> - * reclaim the pages page themselves - pages are moved to the parent (or root)
> - * group.
> - */
> -static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg,
> - int node, int zid, enum lru_list lru)
> -{
> - struct lruvec *lruvec;
> - unsigned long flags;
> - struct list_head *list;
> - struct page *busy;
> - struct zone *zone;
> -
> - zone = &NODE_DATA(node)->node_zones[zid];
> - lruvec = mem_cgroup_zone_lruvec(zone, memcg);
> - list = &lruvec->lists[lru];
> -
> - busy = NULL;
> - do {
> - struct page_cgroup *pc;
> - struct page *page;
> -
> - spin_lock_irqsave(&zone->lru_lock, flags);
> - if (list_empty(list)) {
> - spin_unlock_irqrestore(&zone->lru_lock, flags);
> - break;
> - }
> - page = list_entry(list->prev, struct page, lru);
> - if (busy == page) {
> - list_move(&page->lru, list);
> - busy = NULL;
> - spin_unlock_irqrestore(&zone->lru_lock, flags);
> - continue;
> - }
> - spin_unlock_irqrestore(&zone->lru_lock, flags);
> -
> - pc = lookup_page_cgroup(page);
> -
> - if (mem_cgroup_move_parent(page, pc, memcg)) {
> - /* found lock contention or "pc" is obsolete. */
> - busy = page;
> - } else
> - busy = NULL;
> - cond_resched();
> - } while (!list_empty(list));
> -}
> -
> -/*
> - * make mem_cgroup's charge to be 0 if there is no task by moving
> - * all the charges and pages to the parent.
> - * This enables deleting this mem_cgroup.
> - *
> - * Caller is responsible for holding css reference on the memcg.
> - */
> -static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg)
> -{
> - int node, zid;
> -
> - do {
> - /* This is for making all *used* pages to be on LRU. */
> - lru_add_drain_all();
> - drain_all_stock_sync(memcg);
> - mem_cgroup_start_move(memcg);
> - for_each_node_state(node, N_MEMORY) {
> - for (zid = 0; zid < MAX_NR_ZONES; zid++) {
> - enum lru_list lru;
> - for_each_lru(lru) {
> - mem_cgroup_force_empty_list(memcg,
> - node, zid, lru);
> - }
> - }
> - }
> - mem_cgroup_end_move(memcg);
> - memcg_oom_recover(memcg);
> - cond_resched();
> -
> - /*
> - * Kernel memory may not necessarily be trackable to a specific
> - * process. So they are not migrated, and therefore we can't
> - * expect their value to drop to 0 here.
> - * Having res filled up with kmem only is enough.
> - *
> - * This is a safety check because mem_cgroup_force_empty_list
> - * could have raced with mem_cgroup_replace_page_cache callers
> - * so the lru seemed empty but the page could have been added
> - * right after the check. RES_USAGE should be safe as we always
> - * charge before adding to the LRU.
> - */
> - } while (atomic_long_read(&memcg->memory.count) -
> - atomic_long_read(&memcg->kmem.count) > 0);
> -}
> -
> /*
> * Test whether @memcg has children, dead or alive. Note that this
> * function doesn't care whether @memcg has use_hierarchy enabled and
> @@ -5359,7 +5153,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> struct mem_cgroup_event *event, *tmp;
> - struct cgroup_subsys_state *iter;
>
> /*
> * Unregister events and notify userspace.
> @@ -5373,13 +5166,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> }
> spin_unlock(&memcg->event_list_lock);
>
> - /*
> - * This requires that offlining is serialized. Right now that is
> - * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
> - */
> - css_for_each_descendant_post(iter, css)
> - mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
> -
> memcg_unregister_all_caches(memcg);
> vmpressure_cleanup(&memcg->vmpressure);
> }
> @@ -5387,42 +5173,6 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
> static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
> - /*
> - * XXX: css_offline() would be where we should reparent all
> - * memory to prepare the cgroup for destruction. However,
> - * memcg does not do css_tryget_online() and page_counter charging
> - * under the same RCU lock region, which means that charging
> - * could race with offlining. Offlining only happens to
> - * cgroups with no tasks in them but charges can show up
> - * without any tasks from the swapin path when the target
> - * memcg is looked up from the swapout record and not from the
> - * current task as it usually is. A race like this can leak
> - * charges and put pages with stale cgroup pointers into
> - * circulation:
> - *
> - * #0 #1
> - * lookup_swap_cgroup_id()
> - * rcu_read_lock()
> - * mem_cgroup_lookup()
> - * css_tryget_online()
> - * rcu_read_unlock()
> - * disable css_tryget_online()
> - * call_rcu()
> - * offline_css()
> - * reparent_charges()
> - * page_counter_charge()
> - * css_put()
> - * css_free()
> - * pc->mem_cgroup = dead memcg
> - * add page to lru
> - *
> - * The bulk of the charges are still moved in offline_css() to
> - * avoid pinning a lot of pages in case a long-term reference
> - * like a swapout record is deferring the css_free() to long
> - * after offlining. But this makes sure we catch any charges
> - * made after offlining:
> - */
> - mem_cgroup_reparent_charges(memcg);
>
> memcg_destroy_kmem(memcg);
> __mem_cgroup_free(memcg);
> --
> 2.1.0
>
--
Michal Hocko
SUSE Labs