Message-ID: <f0ad229d-0846-d60b-2508-88c5878e733a@oppo.com>
Date: Tue, 20 Feb 2024 10:11:37 +0800
From: 李培锋 <lipeifeng@...o.com>
To: akpm@...ux-foundation.org, david@...hat.com, osalvador@...e.de,
Matthew Wilcox <willy@...radead.org>
Cc: linux-mm@...ck.org, linux-kernel@...r.kernel.org, tkjos@...gle.com,
gregkh@...gle.com, v-songbaohua@...o.com, surenb@...gle.com
Subject: Re: [PATCH 2/2] mm: support kshrinkd
Adding experts from the Linux community and Google to Cc.
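To make the flow easier to review for the folks newly added to Cc, here is a
rough user-space sketch of the mechanism in the patch below (pthreads instead
of the kernel's spinlock/waitqueue, and a plain struct standing in for struct
folio -- it is only an analogy, not the patch code): on rmap-lock contention,
shrink_folio_list defers the folio to a per-node list and wakes kshrinkd, which
drains the list and retries reclaim with a blocking rmap walk (sc.rw_try_lock = 0).

/*
 * User-space analogy of the kshrinkd mechanism; names mirror the patch
 * (kshrinkd_folios, kf_lock, kshrinkd_wait) but this is not kernel code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct folio {                          /* stand-in for struct folio */
	int id;
	struct folio *next;
};

static struct folio *contended_list;    /* pgdat->kshrinkd_folios */
static pthread_mutex_t kf_lock = PTHREAD_MUTEX_INITIALIZER;     /* pgdat->kf_lock */
static pthread_cond_t kshrinkd_wait = PTHREAD_COND_INITIALIZER; /* pgdat->kshrinkd_wait */
static bool stop;

/* shrink_folio_list(): on rmap-lock contention, defer the folio to kshrinkd */
static void defer_contended_folio(struct folio *f)
{
	pthread_mutex_lock(&kf_lock);
	f->next = contended_list;
	contended_list = f;
	pthread_mutex_unlock(&kf_lock);
	pthread_cond_signal(&kshrinkd_wait);    /* wakeup_kshrinkd() */
}

/* kshrinkd(): drain the contended list and retry reclaim without trylock */
static void *kshrinkd(void *arg)
{
	for (;;) {
		struct folio *batch;

		pthread_mutex_lock(&kf_lock);
		while (!contended_list && !stop)
			pthread_cond_wait(&kshrinkd_wait, &kf_lock);
		batch = contended_list;
		contended_list = NULL;
		pthread_mutex_unlock(&kf_lock);

		while (batch) {
			struct folio *f = batch;

			batch = f->next;
			/* the real code calls shrink_folio_list() here with
			 * sc.rw_try_lock = 0, i.e. a blocking rmap walk */
			printf("kshrinkd reclaimed folio %d\n", f->id);
			free(f);
		}
		if (stop)
			return NULL;
	}
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, kshrinkd, NULL);

	for (int i = 0; i < 3; i++) {
		struct folio *f = malloc(sizeof(*f));

		f->id = i;
		defer_contended_folio(f);
	}

	pthread_mutex_lock(&kf_lock);
	stop = true;
	pthread_mutex_unlock(&kf_lock);
	pthread_cond_signal(&kshrinkd_wait);
	pthread_join(t, NULL);
	return 0;
}

Folios that kshrinkd still fails to reclaim are put back to the LRU (see the
loop around folio_putback_lru() in the patch), which the sketch omits.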
On 2024/2/19 22:17, lipeifeng@...o.com wrote:
> From: lipeifeng <lipeifeng@...o.com>
>
> Commit 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path")
> avoids the reclaim path getting stuck on the rmap lock. However, it can
> leave folios on the LRU out of aging order: a folio whose rmap lock is
> contended in rmap_walk is put back at the head of the LRU by
> shrink_folio_list even if the folio is very cold.
>
> A 300-hour monkey test on a phone shows that almost one third of the
> contended pages can be freed successfully on the next attempt, so putting
> those folios back at the head of the LRU breaks LRU ordering:
> - pgsteal_kshrinkd 262577
> - pgscan_kshrinkd 795503
>
> For the above reason, this patch sets up a new per-node kthread, kshrinkd,
> to reclaim the folios whose rmap lock was contended in rmap_walk during
> shrink_folio_list, so that LRU ordering is preserved.
>
> Signed-off-by: lipeifeng <lipeifeng@...o.com>
> ---
> include/linux/mmzone.h | 6 ++
> include/linux/swap.h | 3 +
> include/linux/vm_event_item.h | 2 +
> mm/memory_hotplug.c | 2 +
> mm/vmscan.c | 189 +++++++++++++++++++++++++++++++++++++++++-
> mm/vmstat.c | 2 +
> 6 files changed, 201 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index a497f18..83d7202 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1329,6 +1329,12 @@ typedef struct pglist_data {
>
> int kswapd_failures; /* Number of 'reclaimed == 0' runs */
>
> + struct list_head kshrinkd_folios; /* rmap_walk contended folios list */
> + spinlock_t kf_lock; /* Protect kshrinkd_folios list */
> +
> + struct task_struct *kshrinkd; /* reclaim kshrinkd_folios */
> + wait_queue_head_t kshrinkd_wait;
> +
> #ifdef CONFIG_COMPACTION
> int kcompactd_max_order;
> enum zone_type kcompactd_highest_zoneidx;
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 4db00dd..155fcb6 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -435,6 +435,9 @@ void check_move_unevictable_folios(struct folio_batch *fbatch);
> extern void __meminit kswapd_run(int nid);
> extern void __meminit kswapd_stop(int nid);
>
> +extern void kshrinkd_run(int nid);
> +extern void kshrinkd_stop(int nid);
> +
> #ifdef CONFIG_SWAP
>
> int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 747943b..ee95ab1 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -38,9 +38,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
> PGLAZYFREED,
> PGREFILL,
> PGREUSE,
> + PGSTEAL_KSHRINKD,
> PGSTEAL_KSWAPD,
> PGSTEAL_DIRECT,
> PGSTEAL_KHUGEPAGED,
> + PGSCAN_KSHRINKD,
> PGSCAN_KSWAPD,
> PGSCAN_DIRECT,
> PGSCAN_KHUGEPAGED,
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 2189099..1b6c4c6 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1209,6 +1209,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
>
> kswapd_run(nid);
> kcompactd_run(nid);
> + kshrinkd_run(nid);
>
> writeback_set_ratelimit();
>
> @@ -2092,6 +2093,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
> }
>
> if (arg.status_change_nid >= 0) {
> + kshrinkd_stop(node);
> kcompactd_stop(node);
> kswapd_stop(node);
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 0296d48..63e4fd4 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -139,6 +139,9 @@ struct scan_control {
> /* if try_lock in rmap_walk */
> unsigned int rw_try_lock:1;
>
> + /* need kshrinkd to reclaim if rwc trylock is contended */
> + unsigned int need_kshrinkd:1;
> +
> /* Allocation order */
> s8 order;
>
> @@ -190,6 +193,17 @@ struct scan_control {
> */
> int vm_swappiness = 60;
>
> +/*
> + * Wake up kshrinkd to reclaim the folios whose rmap lock was contended
> + * in rmap_walk during shrink_folio_list, instead of putting them back
> + * at the head of the LRU, so that LRU ordering is preserved.
> + */
> +static void wakeup_kshrinkd(struct pglist_data *pgdat)
> +{
> + if (likely(pgdat->kshrinkd))
> + wake_up_interruptible(&pgdat->kshrinkd_wait);
> +}
> +
> #ifdef CONFIG_MEMCG
>
> /* Returns true for reclaim through cgroup limits or cgroup interfaces. */
> @@ -821,6 +835,7 @@ enum folio_references {
> FOLIOREF_RECLAIM_CLEAN,
> FOLIOREF_KEEP,
> FOLIOREF_ACTIVATE,
> + FOLIOREF_LOCK_CONTENDED,
> };
>
> static enum folio_references folio_check_references(struct folio *folio,
> @@ -841,8 +856,12 @@ static enum folio_references folio_check_references(struct folio *folio,
> return FOLIOREF_ACTIVATE;
>
> /* rmap lock contention: rotate */
> - if (referenced_ptes == -1)
> - return FOLIOREF_KEEP;
> + if (referenced_ptes == -1) {
> + if (sc->need_kshrinkd && folio_pgdat(folio)->kshrinkd)
> + return FOLIOREF_LOCK_CONTENDED;
> + else
> + return FOLIOREF_KEEP;
> + }
>
> if (referenced_ptes) {
> /*
> @@ -1012,6 +1031,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> LIST_HEAD(ret_folios);
> LIST_HEAD(free_folios);
> LIST_HEAD(demote_folios);
> + LIST_HEAD(contended_folios);
> unsigned int nr_reclaimed = 0;
> unsigned int pgactivate = 0;
> bool do_demote_pass;
> @@ -1028,6 +1048,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> enum folio_references references = FOLIOREF_RECLAIM;
> bool dirty, writeback;
> unsigned int nr_pages;
> + bool lock_contended = false;
>
> cond_resched();
>
> @@ -1169,6 +1190,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> case FOLIOREF_KEEP:
> stat->nr_ref_keep += nr_pages;
> goto keep_locked;
> + case FOLIOREF_LOCK_CONTENDED:
> + lock_contended = true;
> + goto keep_locked;
> case FOLIOREF_RECLAIM:
> case FOLIOREF_RECLAIM_CLEAN:
> ; /* try to reclaim the folio below */
> @@ -1449,7 +1473,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> keep_locked:
> folio_unlock(folio);
> keep:
> - list_add(&folio->lru, &ret_folios);
> + if (unlikely(lock_contended))
> + list_add(&folio->lru, &contended_folios);
> + else
> + list_add(&folio->lru, &ret_folios);
> VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
> folio_test_unevictable(folio), folio);
> }
> @@ -1491,6 +1518,14 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> free_unref_page_list(&free_folios);
>
> list_splice(&ret_folios, folio_list);
> +
> + if (!list_empty(&contended_folios)) {
> + spin_lock_irq(&pgdat->kf_lock);
> + list_splice(&contended_folios, &pgdat->kshrinkd_folios);
> + spin_unlock_irq(&pgdat->kf_lock);
> + wakeup_kshrinkd(pgdat);
> + }
> +
> count_vm_events(PGACTIVATE, pgactivate);
>
> if (plug)
> @@ -1505,6 +1540,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
> .gfp_mask = GFP_KERNEL,
> .may_unmap = 1,
> .rw_try_lock = 1,
> + .need_kshrinkd = 0,
> };
> struct reclaim_stat stat;
> unsigned int nr_reclaimed;
> @@ -2101,6 +2137,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
> .may_swap = 1,
> .no_demotion = 1,
> .rw_try_lock = 1,
> + .need_kshrinkd = 0,
> };
>
> nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
> @@ -5448,6 +5485,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
> .reclaim_idx = MAX_NR_ZONES - 1,
> .gfp_mask = GFP_KERNEL,
> .rw_try_lock = 1,
> + .need_kshrinkd = 0,
> };
>
> buf = kvmalloc(len + 1, GFP_KERNEL);
> @@ -6421,6 +6459,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
> .may_unmap = 1,
> .may_swap = 1,
> .rw_try_lock = 1,
> + .need_kshrinkd = 1,
> };
>
> /*
> @@ -6467,6 +6506,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
> .reclaim_idx = MAX_NR_ZONES - 1,
> .may_swap = !noswap,
> .rw_try_lock = 1,
> + .need_kshrinkd = 0,
> };
>
> WARN_ON_ONCE(!current->reclaim_state);
> @@ -6512,6 +6552,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
> .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
> .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
> .rw_try_lock = 1,
> + .need_kshrinkd = 0,
> };
> /*
> * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> @@ -6774,6 +6815,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
> .order = order,
> .may_unmap = 1,
> .rw_try_lock = 1,
> + .need_kshrinkd = 1,
> };
>
> set_task_reclaim_state(current, &sc.reclaim_state);
> @@ -7234,6 +7276,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
> .may_swap = 1,
> .hibernation_mode = 1,
> .rw_try_lock = 1,
> + .need_kshrinkd = 0,
> };
> struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
> unsigned long nr_reclaimed;
> @@ -7304,6 +7347,145 @@ static int __init kswapd_init(void)
>
> module_init(kswapd_init)
>
> +static int kshrinkd_should_run(pg_data_t *pgdat)
> +{
> + int should_run;
> +
> + spin_lock_irq(&pgdat->kf_lock);
> + should_run = !list_empty(&pgdat->kshrinkd_folios);
> + spin_unlock_irq(&pgdat->kf_lock);
> +
> + return should_run;
> +}
> +
> +static unsigned long kshrinkd_reclaim_folios(struct list_head *folio_list,
> + struct pglist_data *pgdat)
> +{
> + struct reclaim_stat dummy_stat;
> + unsigned int nr_reclaimed = 0;
> + struct scan_control sc = {
> + .gfp_mask = GFP_KERNEL,
> + .may_writepage = 1,
> + .may_unmap = 1,
> + .may_swap = 1,
> + .no_demotion = 1,
> + .rw_try_lock = 0,
> + .need_kshrinkd = 0,
> + };
> +
> + if (list_empty(folio_list))
> + return nr_reclaimed;
> +
> + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
> +
> + return nr_reclaimed;
> +}
> +
> +/*
> + * The background kshrinkd daemon, started as a kernel thread
> + * from the init process.
> + *
> + * Kshrinkd reclaims the folios whose rmap lock was contended in
> + * rmap_walk during shrink_folio_list, instead of putting them back
> + * at the head of the LRU directly, so that LRU ordering is preserved.
> + */
> +
> +static int kshrinkd(void *p)
> +{
> + pg_data_t *pgdat;
> + LIST_HEAD(tmp_contended_folios);
> +
> + pgdat = (pg_data_t *)p;
> +
> + current->flags |= PF_MEMALLOC | PF_KSWAPD;
> + set_freezable();
> +
> + while (!kthread_should_stop()) {
> + unsigned long nr_reclaimed = 0;
> + unsigned long nr_putback = 0;
> +
> + wait_event_freezable(pgdat->kshrinkd_wait,
> + kshrinkd_should_run(pgdat));
> +
> + /* splice rmap_walk contended folios to tmp-list */
> + spin_lock_irq(&pgdat->kf_lock);
> + list_splice(&pgdat->kshrinkd_folios, &tmp_contended_folios);
> + INIT_LIST_HEAD(&pgdat->kshrinkd_folios);
> + spin_unlock_irq(&pgdat->kf_lock);
> +
> + /* reclaim rmap_walk contended folios */
> + nr_reclaimed = kshrinkd_reclaim_folios(&tmp_contended_folios, pgdat);
> + __count_vm_events(PGSTEAL_KSHRINKD, nr_reclaimed);
> +
> + /* put back the folios that failed to be reclaimed to the LRU */
> + while (!list_empty(&tmp_contended_folios)) {
> + struct folio *folio = lru_to_folio(&tmp_contended_folios);
> +
> + nr_putback += folio_nr_pages(folio);
> + list_del(&folio->lru);
> + folio_putback_lru(folio);
> + }
> +
> + __count_vm_events(PGSCAN_KSHRINKD, nr_reclaimed + nr_putback);
> + }
> +
> + current->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
> +
> + return 0;
> +}
> +
> +/*
> + * This kshrinkd start function will be called by init and node-hot-add.
> + */
> +void kshrinkd_run(int nid)
> +{
> + pg_data_t *pgdat = NODE_DATA(nid);
> +
> + if (pgdat->kshrinkd)
> + return;
> +
> + pgdat->kshrinkd = kthread_run(kshrinkd, pgdat, "kshrinkd%d", nid);
> + if (IS_ERR(pgdat->kshrinkd)) {
> + /* failure to start kshrinkd */
> + WARN_ON_ONCE(system_state < SYSTEM_RUNNING);
> + pr_err("Failed to start kshrinkd on node %d\n", nid);
> + pgdat->kshrinkd = NULL;
> + }
> +}
> +
> +/*
> + * Called by memory hotplug when all memory in a node is offlined. Caller must
> + * be holding mem_hotplug_begin/done().
> + */
> +void kshrinkd_stop(int nid)
> +{
> + struct task_struct *kshrinkd = NODE_DATA(nid)->kshrinkd;
> +
> + if (kshrinkd) {
> + kthread_stop(kshrinkd);
> + NODE_DATA(nid)->kshrinkd = NULL;
> + }
> +}
> +
> +static int __init kshrinkd_init(void)
> +{
> + int nid;
> +
> + for_each_node_state(nid, N_MEMORY) {
> + pg_data_t *pgdat = NODE_DATA(nid);
> +
> + spin_lock_init(&pgdat->kf_lock);
> + init_waitqueue_head(&pgdat->kshrinkd_wait);
> + INIT_LIST_HEAD(&pgdat->kshrinkd_folios);
> +
> + kshrinkd_run(nid);
> + }
> +
> + return 0;
> +}
> +
> +module_init(kshrinkd_init)
> +
> #ifdef CONFIG_NUMA
> /*
> * Node reclaim mode
> @@ -7393,6 +7575,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
> .may_swap = 1,
> .reclaim_idx = gfp_zone(gfp_mask),
> .rw_try_lock = 1,
> + .need_kshrinkd = 1,
> };
> unsigned long pflags;
>
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index db79935..76d8a3b 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1279,9 +1279,11 @@ const char * const vmstat_text[] = {
>
> "pgrefill",
> "pgreuse",
> + "pgsteal_kshrinkd",
> "pgsteal_kswapd",
> "pgsteal_direct",
> "pgsteal_khugepaged",
> + "pgscan_kshrinkd",
> "pgscan_kswapd",
> "pgscan_direct",
> "pgscan_khugepaged",