[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <20220407031525.2368067-11-yuzhao@google.com>
Date: Wed, 6 Apr 2022 21:15:22 -0600
From: Yu Zhao <yuzhao@...gle.com>
To: Stephen Rothwell <sfr@...hwell.id.au>, linux-mm@...ck.org
Cc: Andi Kleen <ak@...ux.intel.com>,
Andrew Morton <akpm@...ux-foundation.org>,
Aneesh Kumar <aneesh.kumar@...ux.ibm.com>,
Barry Song <21cnbao@...il.com>,
Catalin Marinas <catalin.marinas@....com>,
Dave Hansen <dave.hansen@...ux.intel.com>,
Hillf Danton <hdanton@...a.com>, Jens Axboe <axboe@...nel.dk>,
Jesse Barnes <jsbarnes@...gle.com>,
Johannes Weiner <hannes@...xchg.org>,
Jonathan Corbet <corbet@....net>,
Linus Torvalds <torvalds@...ux-foundation.org>,
Matthew Wilcox <willy@...radead.org>,
Mel Gorman <mgorman@...e.de>,
Michael Larabel <Michael@...haellarabel.com>,
Michal Hocko <mhocko@...nel.org>,
Mike Rapoport <rppt@...nel.org>,
Rik van Riel <riel@...riel.com>,
Vlastimil Babka <vbabka@...e.cz>,
Will Deacon <will@...nel.org>,
Ying Huang <ying.huang@...el.com>,
linux-arm-kernel@...ts.infradead.org, linux-doc@...r.kernel.org,
linux-kernel@...r.kernel.org, page-reclaim@...gle.com,
x86@...nel.org, Yu Zhao <yuzhao@...gle.com>,
Brian Geffon <bgeffon@...gle.com>,
Jan Alexander Steffens <heftig@...hlinux.org>,
Oleksandr Natalenko <oleksandr@...alenko.name>,
Steven Barrett <steven@...uorix.net>,
Suleiman Souhlal <suleiman@...gle.com>,
Daniel Byrne <djbyrne@....edu>,
Donald Carr <d@...os-reins.com>,
"Holger Hoffstätte"
<holger@...lied-asynchrony.com>,
Konstantin Kharlamov <Hi-Angel@...dex.ru>,
Shuang Zhai <szhai2@...rochester.edu>,
Sofia Trinh <sofia.trinh@....works>,
Vaibhav Jain <vaibhav@...ux.ibm.com>
Subject: [PATCH v10 10/14] mm: multi-gen LRU: kill switch
Add /sys/kernel/mm/lru_gen/enabled as a kill switch. Components that
can be disabled include:
0x0001: the multi-gen LRU core
0x0002: walking page table, when arch_has_hw_pte_young() returns
true
0x0004: clearing the accessed bit in non-leaf PMD entries, when
CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y
[yYnN]: apply to all the components above
E.g.,
echo y >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0007
echo 5 >/sys/kernel/mm/lru_gen/enabled
cat /sys/kernel/mm/lru_gen/enabled
0x0005
NB: the page table walks happen on the scale of seconds under heavy
memory pressure, in which case the mmap_lock contention is a lesser
concern, compared with the LRU lock contention and the I/O congestion.
So far the only well-known case of the mmap_lock contention happens on
Android, due to Scudo [1] which allocates several thousand VMAs for
merely a few hundred MBs. The SPF and the Maple Tree also have
provided their own assessments [2][3]. However, if walking page tables
does worsen the mmap_lock contention, the kill switch can be used to
disable it. In this case the multi-gen LRU will suffer a minor
performance degradation, as shown previously.
Clearing the accessed bit in non-leaf PMD entries can also be
disabled, since this behavior was not tested on x86 varieties other
than Intel and AMD.
[1] https://source.android.com/devices/tech/debug/scudo
[2] https://lore.kernel.org/r/20220128131006.67712-1-michel@lespinasse.org/
[3] https://lore.kernel.org/r/20220202024137.2516438-1-Liam.Howlett@oracle.com/
Signed-off-by: Yu Zhao <yuzhao@...gle.com>
Acked-by: Brian Geffon <bgeffon@...gle.com>
Acked-by: Jan Alexander Steffens (heftig) <heftig@...hlinux.org>
Acked-by: Oleksandr Natalenko <oleksandr@...alenko.name>
Acked-by: Steven Barrett <steven@...uorix.net>
Acked-by: Suleiman Souhlal <suleiman@...gle.com>
Tested-by: Daniel Byrne <djbyrne@....edu>
Tested-by: Donald Carr <d@...os-reins.com>
Tested-by: Holger Hoffstätte <holger@...lied-asynchrony.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@...dex.ru>
Tested-by: Shuang Zhai <szhai2@...rochester.edu>
Tested-by: Sofia Trinh <sofia.trinh@....works>
Tested-by: Vaibhav Jain <vaibhav@...ux.ibm.com>
---
include/linux/cgroup.h | 15 +-
include/linux/mm_inline.h | 12 +-
include/linux/mmzone.h | 9 ++
kernel/cgroup/cgroup-internal.h | 1 -
mm/Kconfig | 6 +
mm/vmscan.c | 238 +++++++++++++++++++++++++++++++-
6 files changed, 272 insertions(+), 9 deletions(-)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 0d1ada8968d7..1bc0cabf993f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgroup *cgrp)
css_put(&cgrp->self);
}
+extern struct mutex cgroup_mutex;
+
+static inline void cgroup_lock(void)
+{
+ mutex_lock(&cgroup_mutex);
+}
+
+static inline void cgroup_unlock(void)
+{
+ mutex_unlock(&cgroup_mutex);
+}
+
/**
* task_css_set_check - obtain a task's css_set with extra access conditions
* @task: the task to obtain css_set for
@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgroup *cgrp)
* as locks used during the cgroup_subsys::attach() methods.
*/
#ifdef CONFIG_PROVE_RCU
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
@@ -708,6 +719,8 @@ struct cgroup;
static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
static inline void css_get(struct cgroup_subsys_state *css) {}
static inline void css_put(struct cgroup_subsys_state *css) {}
+static inline void cgroup_lock(void) {}
+static inline void cgroup_unlock(void) {}
static inline int cgroup_attach_task_all(struct task_struct *from,
struct task_struct *t) { return 0; }
static inline int cgroupstats_build(struct cgroupstats *stats,
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8a8f87b72540..352862af2669 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -106,7 +106,15 @@ static __always_inline enum lru_list folio_lru_list(struct folio *folio)
static inline bool lru_gen_enabled(void)
{
- return true;
+#ifdef CONFIG_LRU_GEN_ENABLED
+ DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]);
+#else
+ DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]);
+
+ return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]);
+#endif
}
static inline bool lru_gen_in_fault(void)
@@ -196,7 +204,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio,
int zone = folio_zonenum(folio);
struct lru_gen_struct *lrugen = &lruvec->lrugen;
- if (folio_test_unevictable(folio))
+ if (folio_test_unevictable(folio) || !lrugen->enabled)
return false;
/*
* There are three common cases for this page:
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a1a99971ff9c..525dbf5e14fa 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -384,6 +384,13 @@ enum {
LRU_GEN_FILE,
};
+enum {
+ LRU_GEN_CORE,
+ LRU_GEN_MM_WALK,
+ LRU_GEN_NONLEAF_YOUNG,
+ NR_LRU_GEN_CAPS
+};
+
#define MIN_LRU_BATCH BITS_PER_LONG
#define MAX_LRU_BATCH (MIN_LRU_BATCH * 128)
@@ -422,6 +429,8 @@ struct lru_gen_struct {
/* can be modified without holding the LRU lock */
atomic_long_t evicted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
atomic_long_t refaulted[NR_HIST_GENS][ANON_AND_FILE][MAX_NR_TIERS];
+ /* whether the multi-gen LRU is enabled */
+ bool enabled;
};
enum {
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 6e36e854b512..929ed3bf1a7c 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -165,7 +165,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
diff --git a/mm/Kconfig b/mm/Kconfig
index c40777d098a8..b55546191369 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -918,6 +918,12 @@ config LRU_GEN
help
A high performance LRU implementation to overcommit memory.
+config LRU_GEN_ENABLED
+ bool "Enable by default"
+ depends on LRU_GEN
+ help
+ This option enables the multi-gen LRU by default.
+
config LRU_GEN_STATS
bool "Full stats for debugging"
depends on LRU_GEN
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0663f1a3f72a..33845796d100 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,7 @@
#include <linux/psi.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
+#include <linux/ctype.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -2989,6 +2990,12 @@ static bool can_age_anon_pages(struct pglist_data *pgdat,
#ifdef CONFIG_LRU_GEN
+#ifdef CONFIG_LRU_GEN_ENABLED
+DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#else
+DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
+#endif
+
/******************************************************************************
* shorthand helpers
******************************************************************************/
@@ -3025,6 +3032,15 @@ static int folio_lru_tier(struct folio *folio)
return lru_tier_from_refs(refs);
}
+static bool get_cap(int cap)
+{
+#ifdef CONFIG_LRU_GEN_ENABLED
+ return static_branch_likely(&lru_gen_caps[cap]);
+#else
+ return static_branch_unlikely(&lru_gen_caps[cap]);
+#endif
+}
+
static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
{
struct pglist_data *pgdat = NODE_DATA(nid);
@@ -3833,7 +3849,8 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG))
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
@@ -3940,10 +3957,12 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
priv->mm_stats[MM_PMD_TOTAL]++;
#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (!pmd_young(val))
- continue;
+ if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (!pmd_young(val))
+ continue;
- walk_pmd_range_locked(pud, addr, vma, walk, &pos);
+ walk_pmd_range_locked(pud, addr, vma, walk, &pos);
+ }
#endif
if (!priv->full_scan && !test_bloom_filter(priv->lruvec, priv->max_seq, pmd + i))
continue;
@@ -4180,7 +4199,7 @@ static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
* handful of PTEs. Spreading the work out over a period of time usually
* is less efficient, but it avoids bursty page faults.
*/
- if (!full_scan && !arch_has_hw_pte_young()) {
+ if (!full_scan && (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK))) {
success = iterate_mm_list_nowalk(lruvec, max_seq);
goto done;
}
@@ -4910,6 +4929,211 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc
blk_finish_plug(&plug);
}
+/******************************************************************************
+ * state change
+ ******************************************************************************/
+
+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
+{
+ struct lru_gen_struct *lrugen = &lruvec->lrugen;
+
+ if (lrugen->enabled) {
+ enum lru_list lru;
+
+ for_each_evictable_lru(lru) {
+ if (!list_empty(&lruvec->lists[lru]))
+ return false;
+ }
+ } else {
+ int gen, type, zone;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ if (!list_empty(&lrugen->lists[gen][type][zone]))
+ return false;
+
+ /* unlikely but not a bug when reset_batch_size() is pending */
+ VM_WARN_ON(lrugen->nr_pages[gen][type][zone]);
+ }
+ }
+
+ return true;
+}
+
+static bool fill_evictable(struct lruvec *lruvec)
+{
+ enum lru_list lru;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_evictable_lru(lru) {
+ int type = is_file_lru(lru);
+ bool active = is_active_lru(lru);
+ struct list_head *head = &lruvec->lists[lru];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_active(folio) != active, folio);
+ VM_BUG_ON_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_BUG_ON_FOLIO(folio_lru_gen(folio) < MAX_NR_GENS, folio);
+
+ lruvec_del_folio(lruvec, folio);
+ success = lru_gen_add_folio(lruvec, folio, false);
+ VM_BUG_ON(!success);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static bool drain_evictable(struct lruvec *lruvec)
+{
+ int gen, type, zone;
+ int remaining = MAX_LRU_BATCH;
+
+ for_each_gen_type_zone(gen, type, zone) {
+ struct list_head *head = &lruvec->lrugen.lists[gen][type][zone];
+
+ while (!list_empty(head)) {
+ bool success;
+ struct folio *folio = lru_to_folio(head);
+
+ VM_BUG_ON_FOLIO(folio_test_unevictable(folio), folio);
+ VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
+ VM_BUG_ON_FOLIO(folio_is_file_lru(folio) != type, folio);
+ VM_BUG_ON_FOLIO(folio_zonenum(folio) != zone, folio);
+
+ success = lru_gen_del_folio(lruvec, folio, false);
+ VM_BUG_ON(!success);
+ lruvec_add_folio(lruvec, folio);
+
+ if (!--remaining)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void lru_gen_change_state(bool enable)
+{
+ static DEFINE_MUTEX(state_mutex);
+
+ struct mem_cgroup *memcg;
+
+ cgroup_lock();
+ cpus_read_lock();
+ get_online_mems();
+ mutex_lock(&state_mutex);
+
+ if (enable == lru_gen_enabled())
+ goto unlock;
+
+ if (enable)
+ static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+ else
+ static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
+
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
+ do {
+ int nid;
+
+ for_each_node(nid) {
+ struct lruvec *lruvec = get_lruvec(memcg, nid);
+
+ if (!lruvec)
+ continue;
+
+ spin_lock_irq(&lruvec->lru_lock);
+
+ VM_BUG_ON(!seq_is_valid(lruvec));
+ VM_BUG_ON(!state_is_valid(lruvec));
+
+ lruvec->lrugen.enabled = enable;
+
+ while (!(enable ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
+ spin_unlock_irq(&lruvec->lru_lock);
+ cond_resched();
+ spin_lock_irq(&lruvec->lru_lock);
+ }
+
+ spin_unlock_irq(&lruvec->lru_lock);
+ }
+
+ cond_resched();
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+unlock:
+ mutex_unlock(&state_mutex);
+ put_online_mems();
+ cpus_read_unlock();
+ cgroup_unlock();
+}
+
+/******************************************************************************
+ * sysfs interface
+ ******************************************************************************/
+
+static ssize_t show_enable(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+ unsigned int caps = 0;
+
+ if (get_cap(LRU_GEN_CORE))
+ caps |= BIT(LRU_GEN_CORE);
+
+ if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
+ caps |= BIT(LRU_GEN_MM_WALK);
+
+ if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
+
+ return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+}
+
+static ssize_t store_enable(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t len)
+{
+ int i;
+ unsigned int caps;
+
+ if (tolower(*buf) == 'n')
+ caps = 0;
+ else if (tolower(*buf) == 'y')
+ caps = -1;
+ else if (kstrtouint(buf, 0, &caps))
+ return -EINVAL;
+
+ for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
+ bool enable = caps & BIT(i);
+
+ if (i == LRU_GEN_CORE)
+ lru_gen_change_state(enable);
+ else if (enable)
+ static_branch_enable(&lru_gen_caps[i]);
+ else
+ static_branch_disable(&lru_gen_caps[i]);
+ }
+
+ return len;
+}
+
+static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
+ enabled, 0644, show_enable, store_enable
+);
+
+static struct attribute *lru_gen_attrs[] = {
+ &lru_gen_enabled_attr.attr,
+ NULL
+};
+
+static struct attribute_group lru_gen_attr_group = {
+ .name = "lru_gen",
+ .attrs = lru_gen_attrs,
+};
+
/******************************************************************************
* initialization
******************************************************************************/
@@ -4920,6 +5144,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec)
struct lru_gen_struct *lrugen = &lruvec->lrugen;
lrugen->max_seq = MIN_NR_GENS + 1;
+ lrugen->enabled = lru_gen_enabled();
for_each_gen_type_zone(gen, type, zone)
INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
@@ -4960,6 +5185,9 @@ static int __init init_lru_gen(void)
BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
+ if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
+ pr_err("lru_gen: failed to create sysfs group\n");
+
return 0;
};
late_initcall(init_lru_gen);
--
2.35.1.1094.g7c7d902a7c-goog
Powered by blists - more mailing lists