Message-ID: <f0ad229d-0846-d60b-2508-88c5878e733a@oppo.com>
Date: Tue, 20 Feb 2024 10:11:37 +0800
From: 李培锋 <lipeifeng@...o.com>
To: akpm@...ux-foundation.org, david@...hat.com, osalvador@...e.de,
 Matthew Wilcox <willy@...radead.org>
Cc: linux-mm@...ck.org, linux-kernel@...r.kernel.org, tkjos@...gle.com,
 gregkh@...gle.com, v-songbaohua@...o.com, surenb@...gle.com
Subject: Re: [PATCH 2/2] mm: support kshrinkd

Adding experts from Linux and Google to Cc.


On 2024/2/19 22:17, lipeifeng@...o.com wrote:
> From: lipeifeng <lipeifeng@...o.com>
>
> Commit 6d4675e60135 ("mm: don't be stuck to rmap lock on reclaim path")
> avoids blocking the reclaim path on the rmap lock.  However, it can
> leave folios on the LRU out of aging order: a folio whose rmap lock is
> contended during rmap_walk is put back at the head of the LRU by
> shrink_folio_list, even if the folio is very cold.
>
> A 300-hour monkey test on a phone shows that almost one third of the
> contended pages can be freed successfully on the next attempt, so
> putting those folios back at the head of the LRU breaks LRU ordering:
> - pgsteal_kshrinkd 262577
> - pgscan_kshrinkd 795503
>
> For the above reason, this patch adds a new kthread, kshrinkd, to
> reclaim folios whose rmap lock was contended in rmap_walk during
> shrink_folio_list, instead of rotating them, preserving LRU order.
>
> Signed-off-by: lipeifeng <lipeifeng@...o.com>
> ---
>   include/linux/mmzone.h        |   6 ++
>   include/linux/swap.h          |   3 +
>   include/linux/vm_event_item.h |   2 +
>   mm/memory_hotplug.c           |   2 +
>   mm/vmscan.c                   | 189 +++++++++++++++++++++++++++++++++++++++++-
>   mm/vmstat.c                   |   2 +
>   6 files changed, 201 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index a497f18..83d7202 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -1329,6 +1329,12 @@ typedef struct pglist_data {
>   
>   	int kswapd_failures;		/* Number of 'reclaimed == 0' runs */
>   
> +	struct list_head kshrinkd_folios; /* rmap_walk contended folios list */
> +	spinlock_t kf_lock; /* Protect kshrinkd_folios list */
> +
> +	struct task_struct *kshrinkd; /* reclaims kshrinkd_folios */
> +	wait_queue_head_t kshrinkd_wait;
> +
>   #ifdef CONFIG_COMPACTION
>   	int kcompactd_max_order;
>   	enum zone_type kcompactd_highest_zoneidx;
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 4db00dd..155fcb6 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -435,6 +435,9 @@ void check_move_unevictable_folios(struct folio_batch *fbatch);
>   extern void __meminit kswapd_run(int nid);
>   extern void __meminit kswapd_stop(int nid);
>   
> +extern void kshrinkd_run(int nid);
> +extern void kshrinkd_stop(int nid);
> +
>   #ifdef CONFIG_SWAP
>   
>   int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
> diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
> index 747943b..ee95ab1 100644
> --- a/include/linux/vm_event_item.h
> +++ b/include/linux/vm_event_item.h
> @@ -38,9 +38,11 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
>   		PGLAZYFREED,
>   		PGREFILL,
>   		PGREUSE,
> +		PGSTEAL_KSHRINKD,
>   		PGSTEAL_KSWAPD,
>   		PGSTEAL_DIRECT,
>   		PGSTEAL_KHUGEPAGED,
> +		PGSCAN_KSHRINKD,
>   		PGSCAN_KSWAPD,
>   		PGSCAN_DIRECT,
>   		PGSCAN_KHUGEPAGED,
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 2189099..1b6c4c6 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1209,6 +1209,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
>   
>   	kswapd_run(nid);
>   	kcompactd_run(nid);
> +	kshrinkd_run(nid);
>   
>   	writeback_set_ratelimit();
>   
> @@ -2092,6 +2093,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
>   	}
>   
>   	if (arg.status_change_nid >= 0) {
> +		kshrinkd_stop(node);
>   		kcompactd_stop(node);
>   		kswapd_stop(node);
>   	}
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 0296d48..63e4fd4 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -139,6 +139,9 @@ struct scan_control {
>   	/* if try_lock in rmap_walk */
>   	unsigned int rw_try_lock:1;
>   
> +	/* let kshrinkd reclaim the folio if the rwc trylock is contended */
> +	unsigned int need_kshrinkd:1;
> +
>   	/* Allocation order */
>   	s8 order;
>   
> @@ -190,6 +193,17 @@ struct scan_control {
>    */
>   int vm_swappiness = 60;
>   
> +/*
> + * Wake up kshrinkd to reclaim folios whose rmap lock was contended
> + * in rmap_walk during shrink_folio_list, instead of putting them
> + * back at the head of the LRU, so LRU ordering is preserved.
> + */
> +static void wakeup_kshrinkd(struct pglist_data *pgdat)
> +{
> +	if (likely(pgdat->kshrinkd))
> +		wake_up_interruptible(&pgdat->kshrinkd_wait);
> +}
> +
>   #ifdef CONFIG_MEMCG
>   
>   /* Returns true for reclaim through cgroup limits or cgroup interfaces. */
> @@ -821,6 +835,7 @@ enum folio_references {
>   	FOLIOREF_RECLAIM_CLEAN,
>   	FOLIOREF_KEEP,
>   	FOLIOREF_ACTIVATE,
> +	FOLIOREF_LOCK_CONTENDED,
>   };
>   
>   static enum folio_references folio_check_references(struct folio *folio,
> @@ -841,8 +856,12 @@ static enum folio_references folio_check_references(struct folio *folio,
>   		return FOLIOREF_ACTIVATE;
>   
>   	/* rmap lock contention: rotate */
> -	if (referenced_ptes == -1)
> -		return FOLIOREF_KEEP;
> +	if (referenced_ptes == -1) {
> +		if (sc->need_kshrinkd && folio_pgdat(folio)->kshrinkd)
> +			return FOLIOREF_LOCK_CONTENDED;
> +		else
> +			return FOLIOREF_KEEP;
> +	}
>   
>   	if (referenced_ptes) {
>   		/*
> @@ -1012,6 +1031,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>   	LIST_HEAD(ret_folios);
>   	LIST_HEAD(free_folios);
>   	LIST_HEAD(demote_folios);
> +	LIST_HEAD(contended_folios);
>   	unsigned int nr_reclaimed = 0;
>   	unsigned int pgactivate = 0;
>   	bool do_demote_pass;
> @@ -1028,6 +1048,7 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>   		enum folio_references references = FOLIOREF_RECLAIM;
>   		bool dirty, writeback;
>   		unsigned int nr_pages;
> +		bool lock_contended = false;
>   
>   		cond_resched();
>   
> @@ -1169,6 +1190,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>   		case FOLIOREF_KEEP:
>   			stat->nr_ref_keep += nr_pages;
>   			goto keep_locked;
> +		case FOLIOREF_LOCK_CONTENDED:
> +			lock_contended = true;
> +			goto keep_locked;
>   		case FOLIOREF_RECLAIM:
>   		case FOLIOREF_RECLAIM_CLEAN:
>   			; /* try to reclaim the folio below */
> @@ -1449,7 +1473,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>   keep_locked:
>   		folio_unlock(folio);
>   keep:
> -		list_add(&folio->lru, &ret_folios);
> +		if (unlikely(lock_contended))
> +			list_add(&folio->lru, &contended_folios);
> +		else
> +			list_add(&folio->lru, &ret_folios);
>   		VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
>   				folio_test_unevictable(folio), folio);
>   	}
> @@ -1491,6 +1518,14 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
>   	free_unref_page_list(&free_folios);
>   
>   	list_splice(&ret_folios, folio_list);
> +
> +	if (!list_empty(&contended_folios)) {
> +		spin_lock_irq(&pgdat->kf_lock);
> +		list_splice(&contended_folios, &pgdat->kshrinkd_folios);
> +		spin_unlock_irq(&pgdat->kf_lock);
> +		wakeup_kshrinkd(pgdat);
> +	}
> +
>   	count_vm_events(PGACTIVATE, pgactivate);
>   
>   	if (plug)
> @@ -1505,6 +1540,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
>   		.gfp_mask = GFP_KERNEL,
>   		.may_unmap = 1,
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 0,
>   	};
>   	struct reclaim_stat stat;
>   	unsigned int nr_reclaimed;
> @@ -2101,6 +2137,7 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list,
>   		.may_swap = 1,
>   		.no_demotion = 1,
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 0,
>   	};
>   
>   	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
> @@ -5448,6 +5485,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
>   		.reclaim_idx = MAX_NR_ZONES - 1,
>   		.gfp_mask = GFP_KERNEL,
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 0,
>   	};
>   
>   	buf = kvmalloc(len + 1, GFP_KERNEL);
> @@ -6421,6 +6459,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
>   		.may_unmap = 1,
>   		.may_swap = 1,
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 1,
>   	};
>   
>   	/*
> @@ -6467,6 +6506,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
>   		.reclaim_idx = MAX_NR_ZONES - 1,
>   		.may_swap = !noswap,
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 0,
>   	};
>   
>   	WARN_ON_ONCE(!current->reclaim_state);
> @@ -6512,6 +6552,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
>   		.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
>   		.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 0,
>   	};
>   	/*
>   	 * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
> @@ -6774,6 +6815,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
>   		.order = order,
>   		.may_unmap = 1,
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 1,
>   	};
>   
>   	set_task_reclaim_state(current, &sc.reclaim_state);
> @@ -7234,6 +7276,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
>   		.may_swap = 1,
>   		.hibernation_mode = 1,
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 0,
>   	};
>   	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
>   	unsigned long nr_reclaimed;
> @@ -7304,6 +7347,145 @@ static int __init kswapd_init(void)
>   
>   module_init(kswapd_init)
>   
> +static int kshrinkd_should_run(pg_data_t *pgdat)
> +{
> +	int should_run;
> +
> +	spin_lock_irq(&pgdat->kf_lock);
> +	should_run = !list_empty(&pgdat->kshrinkd_folios);
> +	spin_unlock_irq(&pgdat->kf_lock);
> +
> +	return should_run;
> +}
> +
> +static unsigned long kshrinkd_reclaim_folios(struct list_head *folio_list,
> +				struct pglist_data *pgdat)
> +{
> +	struct reclaim_stat dummy_stat;
> +	unsigned int nr_reclaimed = 0;
> +	struct scan_control sc = {
> +		.gfp_mask = GFP_KERNEL,
> +		.may_writepage = 1,
> +		.may_unmap = 1,
> +		.may_swap = 1,
> +		.no_demotion = 1,
> +		.rw_try_lock = 0,
> +		.need_kshrinkd = 0,
> +	};
> +
> +	if (list_empty(folio_list))
> +		return nr_reclaimed;
> +
> +	nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
> +
> +	return nr_reclaimed;
> +}
> +
> +/*
> + * The background kshrink daemon, started as a kernel thread
> + * from the init process.
> + *
> + * Kshrinkd reclaims folios whose rmap lock was contended in rmap_walk
> + * during shrink_folio_list, instead of putting them back at the head
> + * of the LRU directly, so LRU ordering is preserved.
> + */
> +
> +static int kshrinkd(void *p)
> +{
> +	pg_data_t *pgdat;
> +	LIST_HEAD(tmp_contended_folios);
> +
> +	pgdat = (pg_data_t *)p;
> +
> +	current->flags |= PF_MEMALLOC | PF_KSWAPD;
> +	set_freezable();
> +
> +	while (!kthread_should_stop()) {
> +		unsigned long nr_reclaimed = 0;
> +		unsigned long nr_putback = 0;
> +
> +		wait_event_freezable(pgdat->kshrinkd_wait,
> +				kshrinkd_should_run(pgdat));
> +
> +		/* splice rmap_walk contended folios to tmp-list */
> +		spin_lock_irq(&pgdat->kf_lock);
> +		list_splice(&pgdat->kshrinkd_folios, &tmp_contended_folios);
> +		INIT_LIST_HEAD(&pgdat->kshrinkd_folios);
> +		spin_unlock_irq(&pgdat->kf_lock);
> +
> +		/* reclaim rmap_walk contended folios */
> +		nr_reclaimed = kshrinkd_reclaim_folios(&tmp_contended_folios, pgdat);
> +		__count_vm_events(PGSTEAL_KSHRINKD, nr_reclaimed);
> +
> +		/* put folios that failed to be reclaimed back on the LRU */
> +		while (!list_empty(&tmp_contended_folios)) {
> +			struct folio *folio = lru_to_folio(&tmp_contended_folios);
> +
> +			nr_putback += folio_nr_pages(folio);
> +			list_del(&folio->lru);
> +			folio_putback_lru(folio);
> +		}
> +
> +		__count_vm_events(PGSCAN_KSHRINKD, nr_reclaimed + nr_putback);
> +	}
> +
> +	current->flags &= ~(PF_MEMALLOC | PF_KSWAPD);
> +
> +	return 0;
> +}
> +
> +/*
> + * This kshrinkd start function will be called by init and node-hot-add.
> + */
> +void kshrinkd_run(int nid)
> +{
> +	pg_data_t *pgdat = NODE_DATA(nid);
> +
> +	if (pgdat->kshrinkd)
> +		return;
> +
> +	pgdat->kshrinkd = kthread_run(kshrinkd, pgdat, "kshrinkd%d", nid);
> +	if (IS_ERR(pgdat->kshrinkd)) {
> +		/* failure to start kshrinkd */
> +		WARN_ON_ONCE(system_state < SYSTEM_RUNNING);
> +		pr_err("Failed to start kshrinkd on node %d\n", nid);
> +		pgdat->kshrinkd = NULL;
> +	}
> +}
> +
> +/*
> + * Called by memory hotplug when all memory in a node is offlined.  Caller must
> + * be holding mem_hotplug_begin/done().
> + */
> +void kshrinkd_stop(int nid)
> +{
> +	struct task_struct *kshrinkd = NODE_DATA(nid)->kshrinkd;
> +
> +	if (kshrinkd) {
> +		kthread_stop(kshrinkd);
> +		NODE_DATA(nid)->kshrinkd = NULL;
> +	}
> +}
> +
> +static int __init kshrinkd_init(void)
> +{
> +	int nid;
> +
> +	for_each_node_state(nid, N_MEMORY) {
> +		pg_data_t *pgdat = NODE_DATA(nid);
> +
> +		spin_lock_init(&pgdat->kf_lock);
> +		init_waitqueue_head(&pgdat->kshrinkd_wait);
> +		INIT_LIST_HEAD(&pgdat->kshrinkd_folios);
> +
> +		kshrinkd_run(nid);
> +	}
> +
> +	return 0;
> +}
> +
> +module_init(kshrinkd_init)
> +
>   #ifdef CONFIG_NUMA
>   /*
>    * Node reclaim mode
> @@ -7393,6 +7575,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
>   		.may_swap = 1,
>   		.reclaim_idx = gfp_zone(gfp_mask),
>   		.rw_try_lock = 1,
> +		.need_kshrinkd = 1,
>   	};
>   	unsigned long pflags;
>   
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index db79935..76d8a3b 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1279,9 +1279,11 @@ const char * const vmstat_text[] = {
>   
>   	"pgrefill",
>   	"pgreuse",
> +	"pgsteal_kshrinkd",
>   	"pgsteal_kswapd",
>   	"pgsteal_direct",
>   	"pgsteal_khugepaged",
> +	"pgscan_kshrinkd",
>   	"pgscan_kswapd",
>   	"pgscan_direct",
>   	"pgscan_khugepaged",
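
The mechanism above is a classic splice-under-lock handoff: producers
(shrink_folio_list) push contended folios onto a spinlock-protected
per-node list and wake a dedicated kthread, which moves the whole list
to a private head under the lock and then reclaims off-lock.  As a
rough userspace analogue only -- the names work_item, queue_work and
worker_main are hypothetical and nothing here is kernel API -- a
minimal pthreads sketch of the same ownership transfer:

/*
 * Minimal userspace sketch of the kshrinkd pattern: producers push
 * items onto a locked list and signal a worker; the worker splices
 * the whole list to a private head and processes it off-lock.
 * All names are hypothetical, not kernel API.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work_item {
	struct work_item *next;
	int id;
};

static struct work_item *pending;	/* like pgdat->kshrinkd_folios */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;  /* like kf_lock */
static pthread_cond_t wait_q = PTHREAD_COND_INITIALIZER;  /* like kshrinkd_wait */
static int stopping;

static void queue_work(int id)		/* producer: shrink_folio_list side */
{
	struct work_item *w = malloc(sizeof(*w));

	if (!w)
		return;
	w->id = id;
	pthread_mutex_lock(&lock);
	w->next = pending;		/* LIFO for simplicity */
	pending = w;
	pthread_mutex_unlock(&lock);
	pthread_cond_signal(&wait_q);	/* like wakeup_kshrinkd() */
}

static void *worker_main(void *arg)	/* consumer: like kshrinkd() */
{
	for (;;) {
		struct work_item *batch;

		pthread_mutex_lock(&lock);
		while (!pending && !stopping)
			pthread_cond_wait(&wait_q, &lock);
		batch = pending;	/* splice to a private list */
		pending = NULL;
		pthread_mutex_unlock(&lock);

		if (!batch && stopping)
			return NULL;

		while (batch) {		/* process without holding the lock */
			struct work_item *w = batch;

			batch = w->next;
			printf("reclaimed item %d\n", w->id);
			free(w);
		}
	}
}

int main(void)
{
	pthread_t worker;
	int i;

	pthread_create(&worker, NULL, worker_main, NULL);
	for (i = 0; i < 8; i++)
		queue_work(i);

	pthread_mutex_lock(&lock);
	stopping = 1;
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&wait_q);
	pthread_join(worker, NULL);
	return 0;
}

The kernel version differs in detail (list_splice() under
spin_lock_irq(), wait_event_freezable(), kthread_should_stop()), but
the invariant is the same: the lock protects only the handoff, never
the reclaim work itself.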
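Separately, since the patch wires the two new events into
vmstat_text[], they should appear in /proc/vmstat next to their kswapd
counterparts on a patched kernel.  A small check, assuming only the
counter names this patch adds:

/* Print the kshrinkd counters this patch adds to /proc/vmstat.
 * On an unpatched kernel no lines match and nothing is printed. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/vmstat", "r");

	if (!f) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "pgsteal_kshrinkd", 16) ||
		    !strncmp(line, "pgscan_kshrinkd", 15))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}

Printing nothing on an unpatched kernel makes this a quick way to
confirm during testing that the new stats are live.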
