Message-ID: <20240702084423.1717904-2-link@vivo.com>
Date: Tue, 2 Jul 2024 16:44:04 +0800
From: Huan Yang <link@...o.com>
To: Johannes Weiner <hannes@...xchg.org>,
Michal Hocko <mhocko@...nel.org>,
Roman Gushchin <roman.gushchin@...ux.dev>,
Shakeel Butt <shakeel.butt@...ux.dev>,
Muchun Song <muchun.song@...ux.dev>,
Andrew Morton <akpm@...ux-foundation.org>,
"Matthew Wilcox (Oracle)" <willy@...radead.org>,
David Hildenbrand <david@...hat.com>,
Ryan Roberts <ryan.roberts@....com>,
Chris Li <chrisl@...nel.org>,
Dan Schatzberg <schatzberg.dan@...il.com>,
Huan Yang <link@...o.com>,
Kairui Song <kasong@...cent.com>,
cgroups@...r.kernel.org,
linux-mm@...ck.org,
linux-kernel@...r.kernel.org,
Christian Brauner <brauner@...nel.org>
Cc: opensource.kernel@...o.com
Subject: [RFC PATCH 1/4] mm: memcg: pmc framework
pmc - per memcg cache

This patch adds a pmc (per-memcg cache) feature to every memcg except
the root memcg. Users can enable pmc on a target memcg so that all
tasks in that memcg share a cache pool: order-0 page allocations and
frees are served from this cache pool with higher priority than the
public per-CPU pagesets.
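
A minimal usage sketch (assuming cgroup v2 is mounted at
/sys/fs/cgroup; the group name "mygroup" is only an example):

  # enable the cache for a memcg
  echo enable=y > /sys/fs/cgroup/mygroup/memory.cache

  # show per-node settings and per-zone cache/hit statistics
  cat /sys/fs/cgroup/mygroup/memory.cache

  # disable the cache and release all held pages back to the buddy
  echo enable=n > /sys/fs/cgroup/mygroup/memory.cache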
Signed-off-by: Huan Yang <link@...o.com>
---
include/linux/memcontrol.h | 41 +++++++
include/linux/mmzone.h | 25 ++++
include/linux/swap.h | 1 +
mm/memcontrol.c | 237 +++++++++++++++++++++++++++++++++++++
mm/page_alloc.c | 146 +++++++++++++++++++++++
5 files changed, 450 insertions(+)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8f332b4ae84c..5ec4c64bc515 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -130,6 +130,7 @@ struct mem_cgroup_per_node {
bool on_tree;
struct mem_cgroup *memcg; /* Back pointer, we cannot */
/* use container_of */
+ struct mem_cgroup_per_node_cache *cachep;
};
struct mem_cgroup_threshold {
@@ -336,6 +337,8 @@ struct mem_cgroup {
struct lru_gen_mm_list mm_list;
#endif
+ bool cache_enabled;
+
struct mem_cgroup_per_node *nodeinfo[];
};
@@ -557,6 +560,8 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *ob
return memcg;
}
+extern struct static_key_true pmc_key;
+
#ifdef CONFIG_MEMCG_KMEM
/*
* folio_memcg_kmem - Check if the folio has the memcg_kmem flag set.
@@ -1185,6 +1190,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
gfp_t gfp_mask,
unsigned long *total_scanned);
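+
+/* True while no memcg has its page cache enabled (pmc_key is on). */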
+static inline bool pmc_disabled(void)
+{
+ return static_branch_likely(&pmc_key);
+}
+
+static inline bool mem_cgroup_cache_disabled(struct mem_cgroup *memcg)
+{
+ return !READ_ONCE(memcg->cache_enabled);
+}
+
+static inline struct mem_cgroup_per_node_cache *
+mem_cgroup_get_node_cachep(struct mem_cgroup *memcg, int nid)
+{
+ struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+
+ return nodeinfo->cachep;
+}
+
#else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0
@@ -1648,6 +1672,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
{
return 0;
}
+
+static inline bool pmc_disabled(void)
+{
+ return true;
+}
+
+static inline bool mem_cgroup_cache_disabled(struct mem_cgroup *memcg)
+{
+ return true;
+}
+
+static inline struct mem_cgroup_per_node_cache *
+mem_cgroup_get_node_cachep(struct mem_cgroup *memcg, int nid)
+{
+ return NULL;
+}
#endif /* CONFIG_MEMCG */
/*
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c11b7cde81ef..773b89e214c9 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -603,6 +603,31 @@ static inline void lru_gen_soft_reclaim(struct mem_cgroup *memcg, int nid)
#endif /* CONFIG_LRU_GEN */
+struct mem_cgroup_zone_cache {
+ /* cached pages; currently only order-0 pages are held */
+ struct list_head pages;
+ spinlock_t pages_lock;
+ atomic_t nr_pages; /* pages currently held in this zone's cache */
+ atomic_t nr_alloced; /* pages ever served from this cache (hits) */
+};
+
+struct mem_cgroup_per_node_cache {
+ /* per zone cache */
+ struct mem_cgroup_zone_cache zone_cachep[MAX_NR_ZONES];
+ struct mem_cgroup *memcg;
+
+ /* Max number of pages to hold, in pages; default 100MB. */
+#define DEFAULT_PMC_HOLD_LIMIT ((100 << 20) >> PAGE_SHIFT)
+ unsigned int hold_limit;
+
+#define DEFAULT_PMC_GAP_WATERMARK ((50 << 20) >> PAGE_SHIFT)
+ /*
+ * Pages are only cached while the zone's free pages stay above
+ * high watermark + allow_watermark; in pages, default 50MB.
+ */
+ unsigned int allow_watermark;
+};
+
struct lruvec {
struct list_head lists[NR_LRU_LISTS];
/* per lruvec lru_lock for memcg */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 11c53692f65f..d7b5e0a8317c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -420,6 +420,7 @@ extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
long remove_mapping(struct address_space *mapping, struct folio *folio);
+extern int mem_cgroup_release_cache(struct mem_cgroup_per_node_cache *nodep);
#ifdef CONFIG_NUMA
extern int node_reclaim_mode;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1b3c3394a2ba..404fcb96bf68 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -95,6 +95,15 @@ static bool cgroup_memory_nokmem __ro_after_init;
/* BPF memory accounting disabled? */
static bool cgroup_memory_nobpf __ro_after_init;
+/*
+ * Number of memcgs that have the cache enabled. While it is zero the
+ * static branch stays enabled, so no task's alloc/free path enters PMC.
+ * Once any memcg enables its cache, the static branch is disabled.
+ */
+static atomic_t pmc_nr_enabled;
+DEFINE_STATIC_KEY_TRUE(pmc_key);
+
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
@@ -5738,6 +5747,8 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
lru_gen_release_memcg(memcg);
}
+static int __disable_mem_cgroup_cache(struct mem_cgroup *memcg);
+
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
@@ -5762,6 +5773,8 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
cancel_work_sync(&memcg->high_work);
mem_cgroup_remove_from_trees(memcg);
free_shrinker_info(memcg);
+ if (READ_ONCE(memcg->cache_enabled))
+ __disable_mem_cgroup_cache(memcg);
mem_cgroup_free(memcg);
}
@@ -7088,6 +7101,223 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
return nbytes;
}
+static int __enable_mem_cgroup_cache(struct mem_cgroup *memcg)
+{
+ int nid, idx;
+
+ if (!mem_cgroup_cache_disabled(memcg))
+ return -EINVAL;
+
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node_cache *p = kvzalloc_node(
+ sizeof(struct mem_cgroup_per_node_cache),
+ GFP_KERNEL, nid);
+
+ if (unlikely(!p))
+ goto fail;
+
+ nodeinfo->cachep = p;
+ }
+
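+ /* All per-node allocations succeeded; now initialize each node. */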
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+ pg_data_t *pgdat = NODE_DATA(nid);
+ struct mem_cgroup_per_node_cache *p = nodeinfo->cachep;
+
+ for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+ struct zone *z = &pgdat->node_zones[idx];
+ struct mem_cgroup_zone_cache *zc;
+
+ if (!populated_zone(z))
+ continue;
+
+ zc = &p->zone_cachep[idx];
+
+ INIT_LIST_HEAD(&zc->pages);
+ spin_lock_init(&zc->pages_lock);
+ }
+
+ p->memcg = memcg;
+ p->hold_limit = DEFAULT_PMC_HOLD_LIMIT;
+ p->allow_watermark = DEFAULT_PMC_GAP_WATERMARK;
+ }
+
+ if (static_branch_likely(&pmc_key))
+ static_branch_disable(&pmc_key);
+
+ /* Ensure cache initialization is visible before going online. */
+ smp_wmb();
+ WRITE_ONCE(memcg->cache_enabled, true);
+ atomic_inc(&pmc_nr_enabled);
+
+ return 0;
+
+fail:
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+
+ if (nodeinfo->cachep) {
+ kvfree(nodeinfo->cachep);
+ nodeinfo->cachep = NULL;
+ }
+ }
+
+ return -ENOMEM;
+}
+
+static int __disable_mem_cgroup_cache(struct mem_cgroup *memcg)
+{
+ int nid;
+
+ if (unlikely(mem_cgroup_cache_disabled(memcg)))
+ return -EINVAL;
+
+ /* Go offline first so no new pages are cached while draining. */
+ WRITE_ONCE(memcg->cache_enabled, false);
+
+ for_each_node(nid) {
+ struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node_cache *p;
+
+ p = nodeinfo->cachep;
+
+ mem_cgroup_release_cache(p);
+
+ kvfree(p);
+ nodeinfo->cachep = NULL;
+ }
+
+ if (atomic_dec_and_test(&pmc_nr_enabled))
+ static_branch_enable(&pmc_key);
+
+ return 0;
+}
+
+static int mem_cgroup_cache_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg;
+ int nid;
+
+ if (static_branch_likely(&pmc_key))
+ return -EINVAL;
+
+ memcg = mem_cgroup_from_seq(m);
+ if (!READ_ONCE(memcg->cache_enabled))
+ return -EINVAL;
+
+ seq_printf(m, "%4s %16s %16s\n", "NODE", "WATERMARK", "HOLD_LIMIT");
+ for_each_online_node(nid) {
+ struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node_cache *p;
+
+ p = nodeinfo->cachep;
+ if (!p)
+ continue;
+
+ seq_printf(m, "%4d %14uKB %14uKB\n", nid,
+ (READ_ONCE(p->allow_watermark) << (PAGE_SHIFT - 10)),
+ (READ_ONCE(p->hold_limit) << (PAGE_SHIFT - 10)));
+ }
+
+ seq_puts(m, "===========\n");
+ seq_printf(m, "%4s %16s %16s %16s\n", "NODE", "ZONE", "CACHE", "HIT");
+
+ for_each_online_node(nid) {
+ struct mem_cgroup_per_node *nodeinfo = memcg->nodeinfo[nid];
+ struct mem_cgroup_per_node_cache *p;
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int idx;
+
+ p = nodeinfo->cachep;
+ if (!p)
+ continue;
+
+ for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+ struct mem_cgroup_zone_cache *zc;
+ struct zone *z = &pgdat->node_zones[idx];
+
+ if (!populated_zone(z))
+ continue;
+
+ zc = &p->zone_cachep[idx];
+ seq_printf(m, "%4d %16s %14dKB %14dKB\n", nid, z->name,
+ (atomic_read(&zc->nr_pages)
+ << (PAGE_SHIFT - 10)),
+ (atomic_read(&zc->nr_alloced)
+ << (PAGE_SHIFT - 10)));
+ }
+ }
+
+ return 0;
+}
+
+enum {
+ OPT_CTRL_ENABLE,
+ OPT_CTRL_ERR,
+ OPT_CTRL_NR = OPT_CTRL_ERR,
+};
+
+static const match_table_t ctrl_tokens = {
+ { OPT_CTRL_ENABLE, "enable=%s" },
+ { OPT_CTRL_ERR, NULL },
+};
+
+/*
+ * Control the target memcg's cache, including enable and key settings.
+ * Enable or disable the cache with `echo enable=[y|n] > memory.cache`
+ * in the target memcg.
+ */
+static ssize_t mem_cgroup_cache_control(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ bool enable;
+ bool opt_enable_set = false;
+ int err = 0;
+ char *sub;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ buf = strstrip(buf);
+ if (!strlen(buf))
+ return -EINVAL;
+
+ while ((sub = strsep(&buf, " ")) != NULL) {
+ int token;
+ substring_t args[MAX_OPT_ARGS];
+ char tbuf[256];
+
+ sub = strstrip(sub);
+
+ token = match_token(sub, ctrl_tokens, args);
+ switch (token) {
+ case OPT_CTRL_ENABLE:
+ if (match_strlcpy(tbuf, &args[0], sizeof(tbuf)) >=
+ sizeof(tbuf))
+ return -EINVAL;
+
+ err = kstrtobool(tbuf, &enable);
+ if (err)
+ return -EINVAL;
+ opt_enable_set = true;
+ break;
+ case OPT_CTRL_ERR:
+ default:
+ return -EINVAL;
+ }
+ }
+
+ if (opt_enable_set) {
+ if (enable)
+ err = __enable_mem_cgroup_cache(memcg);
+ else
+ err = __disable_mem_cgroup_cache(memcg);
+ }
+
+ return err ? err : nbytes;
+}
+
static struct cftype memory_files[] = {
{
.name = "current",
@@ -7156,6 +7386,13 @@ static struct cftype memory_files[] = {
.flags = CFTYPE_NS_DELEGATABLE,
.write = memory_reclaim,
},
+ /* per-memcg page cache control */
+ {
+ .name = "cache",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .write = mem_cgroup_cache_control,
+ .seq_show = mem_cgroup_cache_show,
+ },
{ } /* terminate */
};
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 1beb56f75319..54c4d00c2506 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -530,6 +530,14 @@ static inline int pindex_to_order(unsigned int pindex)
return order;
}
+/*
+ * The per-memcg cache currently only allows order-0 pages.
+ */
+static inline bool pmc_allow_order(unsigned int order)
+{
+ return !order;
+}
+
static inline bool pcp_allowed_order(unsigned int order)
{
if (order <= PAGE_ALLOC_COSTLY_ORDER)
@@ -1271,6 +1279,43 @@ void __free_pages_core(struct page *page, unsigned int order)
__free_pages_ok(page, order, FPI_TO_TAIL);
}
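+/*
+ * Drain every zone's cached pages on this node back to the buddy
+ * allocator. Returns the number of pages released.
+ */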
+int mem_cgroup_release_cache(struct mem_cgroup_per_node_cache *nodep)
+{
+ LIST_HEAD(temp_list);
+ int zid, num = 0;
+
+ for (zid = 0; zid < MAX_NR_ZONES; ++zid) {
+ struct mem_cgroup_zone_cache *zc = &nodep->zone_cachep[zid];
+ int i = 0;
+
+ if (!atomic_read(&zc->nr_pages))
+ continue;
+
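+ /* Detach the whole list under the lock; free the pages afterwards. */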
+ spin_lock(&zc->pages_lock);
+ list_splice_init(&zc->pages, &temp_list);
+ spin_unlock(&zc->pages_lock);
+
+ while (!list_empty(&temp_list)) {
+ struct page *page =
+ list_first_entry(&temp_list, struct page, lru);
+ struct zone *zone = page_zone(page);
+ unsigned long pfn = page_to_pfn(page);
+
+ list_del(&page->lru);
+
+ /* Would it be better to return these to the pcp? */
+ free_one_page(zone, page, pfn, 0, FPI_NONE);
+ ++i;
+ }
+
+ num += i;
+ atomic_sub(i, &zc->nr_pages);
+ }
+
+ return num;
+}
+
/*
* Check that the whole (or subset of) a pageblock given by the interval of
* [start_pfn, end_pfn) is valid and within the same zone, before scanning it
@@ -2603,6 +2648,41 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
}
}
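+/*
+ * Try to stash an order-0 page in the current memcg's cache instead of
+ * the pcp. Returns true if the cache took the page.
+ */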
+static bool free_unref_page_to_pmc(struct page *page, struct zone *zone,
+ int order)
+{
+ struct mem_cgroup *memcg;
+ struct mem_cgroup_per_node_cache *cachp;
+ struct mem_cgroup_zone_cache *zc;
+ unsigned long flags;
+ bool ret = false;
+
+ if (pmc_disabled())
+ return false;
+
+ memcg = get_mem_cgroup_from_current();
+ if (!memcg || mem_cgroup_is_root(memcg) ||
+ mem_cgroup_cache_disabled(memcg))
+ goto out;
+
+ cachp = mem_cgroup_get_node_cachep(memcg, page_to_nid(page));
+ zc = &cachp->zone_cachep[page_zonenum(page)];
+
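+ /*
+ * Only hold the page if the zone's free pages stay above the
+ * high watermark plus the configured gap.
+ */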
+ if (high_wmark_pages(zone) + READ_ONCE(cachp->allow_watermark) >=
+ zone_page_state(zone, NR_FREE_PAGES))
+ goto out;
+
+ spin_lock_irqsave(&zc->pages_lock, flags);
+ list_add(&page->lru, &zc->pages);
+ spin_unlock_irqrestore(&zc->pages_lock, flags);
+ atomic_inc(&zc->nr_pages);
+
+ ret = true;
+out:
+ mem_cgroup_put(memcg);
+ return ret;
+}
+
/*
* Free a pcp page
*/
@@ -2634,6 +2714,17 @@ void free_unref_page(struct page *page, unsigned int order)
}
zone = page_zone(page);
+
+ /*
+ * Cache the released page in PMC before freeing it to the pcp if
+ * the current memcg has the cache feature enabled.
+ * Unlike the pcp, the PMC is private: only tasks in the memcg can
+ * access it. So when the conditions are met, releasing to the PMC
+ * takes priority over releasing to the public per-CPU cache.
+ */
+ if (pmc_allow_order(order) && free_unref_page_to_pmc(page, zone, order))
+ return;
+
pcp_trylock_prepare(UP_flags);
pcp = pcp_spin_trylock(zone->per_cpu_pageset);
if (pcp) {
@@ -3012,6 +3103,49 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
return page;
}
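+/*
+ * Try to take a page from the current memcg's per-zone cache before
+ * touching the pcp or buddy lists. Returns NULL on a miss.
+ */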
+static struct page *rmqueue_mem_cgroup_cache(struct zone *preferred_zone,
+ struct zone *zone,
+ unsigned int order,
+ int migratetype)
+{
+ struct mem_cgroup *memcg;
+ struct mem_cgroup_per_node_cache *cachp;
+ struct mem_cgroup_zone_cache *zc;
+ unsigned long flags;
+ int nid = zone->zone_pgdat->node_id;
+ struct page *page = NULL;
+
+ if (pmc_disabled())
+ return NULL;
+
+ memcg = get_mem_cgroup_from_current();
+ if (!memcg || mem_cgroup_is_root(memcg) ||
+ mem_cgroup_cache_disabled(memcg))
+ goto out;
+
+ cachp = mem_cgroup_get_node_cachep(memcg, nid);
+
+ zc = &cachp->zone_cachep[zone_idx(zone)];
+ if (!atomic_read(&zc->nr_pages))
+ goto out;
+
+ spin_lock_irqsave(&zc->pages_lock, flags);
+ if (list_empty(&zc->pages)) {
+ spin_unlock_irqrestore(&zc->pages_lock, flags);
+ goto out;
+ }
+ page = list_first_entry(&zc->pages, struct page, lru);
+ list_del(&page->lru);
+ spin_unlock_irqrestore(&zc->pages_lock, flags);
+
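+ /* Update the depth and hit counters reported via memory.cache. */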
+ atomic_dec(&zc->nr_pages);
+ atomic_inc(&zc->nr_alloced);
+
+out:
+ mem_cgroup_put(memcg);
+ return page;
+}
+
/*
* Allocate a page from the given zone.
* Use pcplists for THP or "cheap" high-order allocations.
@@ -3038,6 +3172,18 @@ struct page *rmqueue(struct zone *preferred_zone,
*/
WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+ /*
+ * Before touching the public pcp or buddy lists, check whether the
+ * current task is in a memcg with the cache feature enabled.
+ * If so, serving the page from the private pool speeds up the
+ * allocation.
+ */
+ if (pmc_allow_order(order)) {
+ page = rmqueue_mem_cgroup_cache(preferred_zone, zone, order,
+ migratetype);
+ if (page)
+ goto out;
+ }
+
if (likely(pcp_allowed_order(order))) {
page = rmqueue_pcplist(preferred_zone, zone, order,
migratetype, alloc_flags);
--
2.45.2