Message-ID: <20191029235103.GA12385@tower.DHCP.thefacebook.com>
Date: Tue, 29 Oct 2019 23:51:07 +0000
From: Roman Gushchin <guro@...com>
To: Shakeel Butt <shakeelb@...gle.com>
CC: Greg Thelen <gthelen@...gle.com>,
Johannes Weiner <hannes@...xchg.org>,
Michal Hocko <mhocko@...nel.org>,
Andrew Morton <akpm@...ux-foundation.org>,
"linux-mm@...ck.org" <linux-mm@...ck.org>,
"cgroups@...r.kernel.org" <cgroups@...r.kernel.org>,
"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>,
"syzbot+13f93c99c06988391efe@...kaller.appspotmail.com"
<syzbot+13f93c99c06988391efe@...kaller.appspotmail.com>
Subject: Re: [PATCH] mm: vmscan: memcontrol: remove
mem_cgroup_select_victim_node()
On Tue, Oct 29, 2019 at 04:47:53PM -0700, Shakeel Butt wrote:
> Since commit 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration
> between reclaimers"), memcg reclaim no longer bails out early based on
> sc->nr_reclaimed and traverses all the nodes. All the reclaimable pages
> of the memcg on all the nodes are scanned relative to the reclaim
> priority, so there is no need to maintain state about which node to
> start memcg reclaim from. Also, KCSAN complains about data races in the
> code that maintains this state.
>
> This patch effectively reverts commit 889976dbcb12 ("memcg: reclaim
> memory from nodes in round-robin order") and commit 453a9bf347f1
> ("memcg: fix numa scan information update to be triggered by memory
> event").
>
> Signed-off-by: Shakeel Butt <shakeelb@...gle.com>
> Reported-by: <syzbot+13f93c99c06988391efe@...kaller.appspotmail.com>
Acked-by: Roman Gushchin <guro@...com>
Thanks!
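
One note for anyone reading along: starting from the local node does not
restrict reclaim, because a node's fallback zonelist already spans all
nodes. A minimal sketch of what the new one-liner in
try_to_free_mem_cgroup_pages() boils down to (memcg_reclaim_zonelist() is
a made-up helper name, purely for illustration):

    #include <linux/gfp.h>
    #include <linux/topology.h>

    /*
     * Hypothetical helper, only to spell out the new behaviour: with the
     * round-robin victim node gone, memcg reclaim simply starts from the
     * current node's fallback zonelist, which still covers every node.
     */
    static inline struct zonelist *memcg_reclaim_zonelist(gfp_t gfp_mask)
    {
            /*
             * node_zonelist() picks ZONELIST_FALLBACK unless
             * __GFP_THISNODE is set, so all nodes remain reachable
             * as the reclaim priority drops.
             */
            return node_zonelist(numa_node_id(), gfp_mask);
    }
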
> ---
> include/linux/memcontrol.h | 8 ---
> mm/memcontrol.c | 112 -------------------------------------
> mm/vmscan.c | 11 +---
> 3 files changed, 1 insertion(+), 130 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e82928deea88..239e752a7817 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -80,7 +80,6 @@ struct mem_cgroup_id {
> enum mem_cgroup_events_target {
> MEM_CGROUP_TARGET_THRESH,
> MEM_CGROUP_TARGET_SOFTLIMIT,
> - MEM_CGROUP_TARGET_NUMAINFO,
> MEM_CGROUP_NTARGETS,
> };
>
> @@ -312,13 +311,6 @@ struct mem_cgroup {
> struct list_head kmem_caches;
> #endif
>
> - int last_scanned_node;
> -#if MAX_NUMNODES > 1
> - nodemask_t scan_nodes;
> - atomic_t numainfo_events;
> - atomic_t numainfo_updating;
> -#endif
> -
> #ifdef CONFIG_CGROUP_WRITEBACK
> struct list_head cgwb_list;
> struct wb_domain cgwb_domain;
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index ea085877c548..aaa19bf5cf0f 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -100,7 +100,6 @@ static bool do_memsw_account(void)
>
> #define THRESHOLDS_EVENTS_TARGET 128
> #define SOFTLIMIT_EVENTS_TARGET 1024
> -#define NUMAINFO_EVENTS_TARGET 1024
>
> /*
> * Cgroups above their limits are maintained in a RB-Tree, independent of
> @@ -869,9 +868,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
> case MEM_CGROUP_TARGET_SOFTLIMIT:
> next = val + SOFTLIMIT_EVENTS_TARGET;
> break;
> - case MEM_CGROUP_TARGET_NUMAINFO:
> - next = val + NUMAINFO_EVENTS_TARGET;
> - break;
> default:
> break;
> }
> @@ -891,21 +887,12 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
> if (unlikely(mem_cgroup_event_ratelimit(memcg,
> MEM_CGROUP_TARGET_THRESH))) {
> bool do_softlimit;
> - bool do_numainfo __maybe_unused;
>
> do_softlimit = mem_cgroup_event_ratelimit(memcg,
> MEM_CGROUP_TARGET_SOFTLIMIT);
> -#if MAX_NUMNODES > 1
> - do_numainfo = mem_cgroup_event_ratelimit(memcg,
> - MEM_CGROUP_TARGET_NUMAINFO);
> -#endif
> mem_cgroup_threshold(memcg);
> if (unlikely(do_softlimit))
> mem_cgroup_update_tree(memcg, page);
> -#if MAX_NUMNODES > 1
> - if (unlikely(do_numainfo))
> - atomic_inc(&memcg->numainfo_events);
> -#endif
> }
> }
>
> @@ -1590,104 +1577,6 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
> return ret;
> }
>
> -#if MAX_NUMNODES > 1
> -
> -/**
> - * test_mem_cgroup_node_reclaimable
> - * @memcg: the target memcg
> - * @nid: the node ID to be checked.
> - * @noswap : specify true here if the user wants flle only information.
> - *
> - * This function returns whether the specified memcg contains any
> - * reclaimable pages on a node. Returns true if there are any reclaimable
> - * pages in the node.
> - */
> -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
> - int nid, bool noswap)
> -{
> - struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
> -
> - if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
> - lruvec_page_state(lruvec, NR_ACTIVE_FILE))
> - return true;
> - if (noswap || !total_swap_pages)
> - return false;
> - if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
> - lruvec_page_state(lruvec, NR_ACTIVE_ANON))
> - return true;
> - return false;
> -
> -}
> -
> -/*
> - * Always updating the nodemask is not very good - even if we have an empty
> - * list or the wrong list here, we can start from some node and traverse all
> - * nodes based on the zonelist. So update the list loosely once per 10 secs.
> - *
> - */
> -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
> -{
> - int nid;
> - /*
> - * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
> - * pagein/pageout changes since the last update.
> - */
> - if (!atomic_read(&memcg->numainfo_events))
> - return;
> - if (atomic_inc_return(&memcg->numainfo_updating) > 1)
> - return;
> -
> - /* make a nodemask where this memcg uses memory from */
> - memcg->scan_nodes = node_states[N_MEMORY];
> -
> - for_each_node_mask(nid, node_states[N_MEMORY]) {
> -
> - if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
> - node_clear(nid, memcg->scan_nodes);
> - }
> -
> - atomic_set(&memcg->numainfo_events, 0);
> - atomic_set(&memcg->numainfo_updating, 0);
> -}
> -
> -/*
> - * Selecting a node where we start reclaim from. Because what we need is just
> - * reducing usage counter, start from anywhere is O,K. Considering
> - * memory reclaim from current node, there are pros. and cons.
> - *
> - * Freeing memory from current node means freeing memory from a node which
> - * we'll use or we've used. So, it may make LRU bad. And if several threads
> - * hit limits, it will see a contention on a node. But freeing from remote
> - * node means more costs for memory reclaim because of memory latency.
> - *
> - * Now, we use round-robin. Better algorithm is welcomed.
> - */
> -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
> -{
> - int node;
> -
> - mem_cgroup_may_update_nodemask(memcg);
> - node = memcg->last_scanned_node;
> -
> - node = next_node_in(node, memcg->scan_nodes);
> - /*
> - * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
> - * last time it really checked all the LRUs due to rate limiting.
> - * Fallback to the current node in that case for simplicity.
> - */
> - if (unlikely(node == MAX_NUMNODES))
> - node = numa_node_id();
> -
> - memcg->last_scanned_node = node;
> - return node;
> -}
> -#else
> -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
> -{
> - return 0;
> -}
> -#endif
> -
> static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
> pg_data_t *pgdat,
> gfp_t gfp_mask,
> @@ -5056,7 +4945,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
> goto fail;
>
> INIT_WORK(&memcg->high_work, high_work_func);
> - memcg->last_scanned_node = MAX_NUMNODES;
> INIT_LIST_HEAD(&memcg->oom_notify);
> mutex_init(&memcg->thresholds_lock);
> spin_lock_init(&memcg->move_lock);
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 1154b3a2b637..cb4dc52cfb88 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -3344,10 +3344,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> gfp_t gfp_mask,
> bool may_swap)
> {
> - struct zonelist *zonelist;
> unsigned long nr_reclaimed;
> unsigned long pflags;
> - int nid;
> unsigned int noreclaim_flag;
> struct scan_control sc = {
> .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
> @@ -3360,16 +3358,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> .may_unmap = 1,
> .may_swap = may_swap,
> };
> + struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
>
> set_task_reclaim_state(current, &sc.reclaim_state);
> - /*
> - * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
> - * take care of from where we get pages. So the node where we start the
> - * scan does not need to be the current node.
> - */
> - nid = mem_cgroup_select_victim_node(memcg);
> -
> - zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
>
> trace_mm_vmscan_memcg_reclaim_begin(
> cgroup_ino(memcg->css.cgroup),
> --
> 2.24.0.rc0.303.g954a862665-goog
>