Message-ID: <4BA28B97-9E5C-4CF5-B6E1-A6AD27A1B6AC@nvidia.com>
Date:   Fri, 26 Aug 2022 20:25:56 -0400
From:   Zi Yan <ziy@...dia.com>
To:     alexlzhu@...com
Cc:     linux-mm@...ck.org, willy@...radead.org, hannes@...xchg.org,
        akpm@...ux-foundation.org, riel@...riel.com, kernel-team@...com,
        linux-kernel@...r.kernel.org
Subject: Re: [RFC 3/3] mm: THP low utilization shrinker

On 25 Aug 2022, at 17:30, alexlzhu@...com wrote:

> From: Alexander Zhu <alexlzhu@...com>
>
> This patch introduces a shrinker that will remove THPs in the lowest
> utilization bucket. As previously mentioned, we have observed that
> almost all of the memory waste when THPs are always enabled
> is contained in the lowest utilization bucket. The shrinker will
> add these THPs to a list_lru and split anonymous THPs based on
> information from kswapd. It requires the changes from
> thp_utilization to identify the least utilized THPs, and the
> changes to split_huge_page to identify and free zero pages
> within THPs.

How stale could the information in the utilization bucket be? Is it
possible that the THP shrinker splits a THP that used to have a lot of
zero-filled subpages but now has all of its subpages filled with useful
data? In Patch 2, split_huge_page() only unmaps zero-filled subpages,
but for the THP shrinker, should it verify the utilization before it
splits the page?
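
To make the question concrete: one option would be to re-check the
utilization in the walk callback right before the split, along the
lines of the (untested) sketch below. I am assuming a helper named
thp_number_utilized_pages() that returns the per-THP utilized-subpage
count from the patch 1 scan; whatever the real interface is, the idea
is the same, and the threshold just mirrors bucket 0 in thp_util_scan():

static enum lru_status low_util_free_page(struct list_head *item,
					  struct list_lru_one *lru,
					  spinlock_t *lock,
					  void *cb_arg)
{
	struct page *head = compound_head(list_entry(item, struct page,
						     underutilized_thp_list));

	if (get_page_unless_zero(head)) {
		lock_page(head);
		list_lru_isolate(lru, item);
		/*
		 * Re-check utilization right before splitting, so a THP
		 * that has been filled with useful data since the last
		 * scan is taken off the list but left intact.
		 * thp_number_utilized_pages() is assumed to be (or wrap)
		 * the counting helper from patch 1.
		 */
		if (thp_number_utilized_pages(head) <
		    HPAGE_PMD_NR / THP_UTIL_BUCKET_NR)
			split_huge_page(head);
		unlock_page(head);
		put_page(head);
	}

	return LRU_REMOVED_RETRY;
}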

>
> Signed-off-by: Alexander Zhu <alexlzhu@...com>
> ---
>  include/linux/huge_mm.h  |  7 +++
>  include/linux/list_lru.h | 24 +++++++++++
>  include/linux/mm_types.h |  5 +++
>  mm/huge_memory.c         | 92 ++++++++++++++++++++++++++++++++++++++--
>  mm/list_lru.c            | 49 +++++++++++++++++++++
>  mm/page_alloc.c          |  6 +++
>  6 files changed, 180 insertions(+), 3 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index c9086239deb7..13bd470173d2 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -192,6 +192,8 @@ static inline int split_huge_page(struct page *page)
>  }
>  void deferred_split_huge_page(struct page *page);
>
> +void add_underutilized_thp(struct page *page);
> +
>  void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
>  		unsigned long address, bool freeze, struct folio *folio);
>
> @@ -302,6 +304,11 @@ static inline struct list_head *page_deferred_list(struct page *page)
>  	return &page[2].deferred_list;
>  }
>
> +static inline struct list_head *page_underutilized_thp_list(struct page *page)
> +{
> +	return &page[3].underutilized_thp_list;
> +}
> +
>  #else /* CONFIG_TRANSPARENT_HUGEPAGE */
>  #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; })
>  #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; })
> diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
> index b35968ee9fb5..c2cf146ea880 100644
> --- a/include/linux/list_lru.h
> +++ b/include/linux/list_lru.h
> @@ -89,6 +89,18 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren
>   */
>  bool list_lru_add(struct list_lru *lru, struct list_head *item);
>
> +/**
> + * list_lru_add_page: add an element to the lru list's tail
> + * @list_lru: the lru pointer
> + * @page: the page containing the item
> + * @item: the item to be added.
> + *
> + * This function works the same as list_lru_add in terms of list
> + * manipulation. Used for non slab objects contained in the page.
> + *
> + * Return value: true if the list was updated, false otherwise
> + */
> +bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item);
>  /**
>   * list_lru_del: delete an element to the lru list
>   * @list_lru: the lru pointer
> @@ -102,6 +114,18 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
>   */
>  bool list_lru_del(struct list_lru *lru, struct list_head *item);
>
> +/**
> + * list_lru_del_page: delete an element from the lru list
> + * @list_lru: the lru pointer
> + * @page: the page containing the item
> + * @item: the item to be deleted.
> + *
> + * This function works the same as list_lru_del in terms of list
> + * manipulation. Used for non slab objects contained in the page.
> + *
> + * Return value: true if the list was updated, false otherwise
> + */
> +bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item);
>  /**
>   * list_lru_count_one: return the number of objects currently held by @lru
>   * @lru: the lru pointer.
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index cf97f3884fda..05667a2030c0 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -151,6 +151,11 @@ struct page {
>  			/* For both global and memcg */
>  			struct list_head deferred_list;
>  		};
> +		struct { /* Third tail page of compound page */
> +			unsigned long _compound_pad_3; /* compound_head */
> +			unsigned long _compound_pad_4;
> +			struct list_head underutilized_thp_list;
> +		};
>  		struct {	/* Page table pages */
>  			unsigned long _pt_pad_1;	/* compound_head */
>  			pgtable_t pmd_huge_pte; /* protected by page->ptl */
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0f774a7c0727..03dc42eba0ba 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -8,6 +8,7 @@
>  #include <linux/mm.h>
>  #include <linux/sched.h>
>  #include <linux/sched/mm.h>
> +#include <linux/sched/clock.h>
>  #include <linux/sched/coredump.h>
>  #include <linux/sched/numa_balancing.h>
>  #include <linux/highmem.h>
> @@ -85,6 +86,8 @@ static atomic_t huge_zero_refcount;
>  struct page *huge_zero_page __read_mostly;
>  unsigned long huge_zero_pfn __read_mostly = ~0UL;
>
> +struct list_lru huge_low_util_page_lru;
> +
>  static void thp_utilization_workfn(struct work_struct *work);
>  static DECLARE_DELAYED_WORK(thp_utilization_work, thp_utilization_workfn);
>
> @@ -269,6 +272,46 @@ static struct shrinker huge_zero_page_shrinker = {
>  	.seeks = DEFAULT_SEEKS,
>  };
>
> +static enum lru_status low_util_free_page(struct list_head *item,
> +					  struct list_lru_one *lru,
> +					  spinlock_t *lock,
> +					  void *cb_arg)
> +{
> +	struct page *head = compound_head(list_entry(item,
> +									struct page,
> +									underutilized_thp_list));
> +
> +	if (get_page_unless_zero(head)) {
> +		lock_page(head);
> +		list_lru_isolate(lru, item);
> +		split_huge_page(head);
> +		unlock_page(head);
> +		put_page(head);
> +	}
> +
> +	return LRU_REMOVED_RETRY;
> +}
> +
> +static unsigned long shrink_huge_low_util_page_count(struct shrinker *shrink,
> +						     struct shrink_control *sc)
> +{
> +	return list_lru_shrink_count(&huge_low_util_page_lru, sc);
> +}
> +
> +static unsigned long shrink_huge_low_util_page_scan(struct shrinker *shrink,
> +						    struct shrink_control *sc)
> +{
> +	return list_lru_shrink_walk(&huge_low_util_page_lru, sc, low_util_free_page, NULL);
> +}
> +
> +static struct shrinker huge_low_util_page_shrinker = {
> +	.count_objects = shrink_huge_low_util_page_count,
> +	.scan_objects = shrink_huge_low_util_page_scan,
> +	.seeks = DEFAULT_SEEKS,
> +	.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
> +		SHRINKER_NONSLAB,
> +};
> +
>  #ifdef CONFIG_SYSFS
>  static ssize_t enabled_show(struct kobject *kobj,
>  			    struct kobj_attribute *attr, char *buf)
> @@ -521,13 +564,18 @@ static int __init hugepage_init(void)
>  		goto err_slab;
>
>  	schedule_delayed_work(&thp_utilization_work, HZ);
> +	err = register_shrinker(&huge_low_util_page_shrinker, "thp-low-util");
> +	if (err)
> +		goto err_low_util_shrinker;
>  	err = register_shrinker(&huge_zero_page_shrinker, "thp-zero");
>  	if (err)
>  		goto err_hzp_shrinker;
>  	err = register_shrinker(&deferred_split_shrinker, "thp-deferred_split");
>  	if (err)
>  		goto err_split_shrinker;
> -
> +	err = list_lru_init_memcg(&huge_low_util_page_lru, &huge_low_util_page_shrinker);
> +	if (err)
> +		goto err_low_util_list_lru;
>  	/*
>  	 * By default disable transparent hugepages on smaller systems,
>  	 * where the extra memory used could hurt more than TLB overhead
> @@ -543,11 +591,16 @@ static int __init hugepage_init(void)
>  		goto err_khugepaged;
>
>  	return 0;
> +
>  err_khugepaged:
> +	list_lru_destroy(&huge_low_util_page_lru);
> +err_low_util_list_lru:
>  	unregister_shrinker(&deferred_split_shrinker);
>  err_split_shrinker:
>  	unregister_shrinker(&huge_zero_page_shrinker);
>  err_hzp_shrinker:
> +	unregister_shrinker(&huge_low_util_page_shrinker);
> +err_low_util_shrinker:
>  	khugepaged_destroy();
>  err_slab:
>  	hugepage_exit_sysfs(hugepage_kobj);
> @@ -622,6 +675,7 @@ void prep_transhuge_page(struct page *page)
>  	 */
>
>  	INIT_LIST_HEAD(page_deferred_list(page));
> +	INIT_LIST_HEAD(page_underutilized_thp_list(page));
>  	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
>  }
>
> @@ -2491,8 +2545,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
>  			 (1L << PG_dirty)));
>
>  	/* ->mapping in first tail page is compound_mapcount */
> -	VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
> -			page_tail);
> +	VM_BUG_ON_PAGE(tail > 3 && page_tail->mapping != TAIL_MAPPING, page_tail);
>  	page_tail->mapping = head->mapping;
>  	page_tail->index = head->index + tail;
>  	page_tail->private = 0;
> @@ -2698,6 +2751,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  	struct folio *folio = page_folio(page);
>  	struct page *head = &folio->page;
>  	struct deferred_split *ds_queue = get_deferred_split_queue(head);
> +	struct list_head *underutilized_thp_list = page_underutilized_thp_list(head);
>  	XA_STATE(xas, &head->mapping->i_pages, head->index);
>  	struct anon_vma *anon_vma = NULL;
>  	struct address_space *mapping = NULL;
> @@ -2796,6 +2850,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  			list_del(page_deferred_list(head));
>  		}
>  		spin_unlock(&ds_queue->split_queue_lock);
> +		if (!list_empty(underutilized_thp_list))
> +			list_lru_del_page(&huge_low_util_page_lru, head, underutilized_thp_list);
>  		if (mapping) {
>  			int nr = thp_nr_pages(head);
>
> @@ -2838,6 +2894,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
>  void free_transhuge_page(struct page *page)
>  {
>  	struct deferred_split *ds_queue = get_deferred_split_queue(page);
> +	struct list_head *underutilized_thp_list = page_underutilized_thp_list(page);
>  	unsigned long flags;
>
>  	spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
> @@ -2846,6 +2903,12 @@ void free_transhuge_page(struct page *page)
>  		list_del(page_deferred_list(page));
>  	}
>  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
> +	if (!list_empty(underutilized_thp_list))
> +		list_lru_del_page(&huge_low_util_page_lru, page, underutilized_thp_list);
> +
> +	if (PageLRU(page))
> +		__clear_page_lru_flags(page);
> +
>  	free_compound_page(page);
>  }
>
> @@ -2886,6 +2949,26 @@ void deferred_split_huge_page(struct page *page)
>  	spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
>  }
>
> +void add_underutilized_thp(struct page *page)
> +{
> +	VM_BUG_ON_PAGE(!PageTransHuge(page), page);
> +
> +	if (PageSwapCache(page))
> +		return;
> +
> +	/*
> +	 * Need to take a reference on the page to prevent the page from getting free'd from
> +	 * under us while we are adding the THP to the shrinker.
> +	 */
> +	if (!get_page_unless_zero(page))
> +		return;
> +
> +	if (!is_huge_zero_page(page) && is_anon_transparent_hugepage(page))
> +		list_lru_add_page(&huge_low_util_page_lru, page, page_underutilized_thp_list(page));
> +
> +	put_page(page);
> +}
> +
>  static unsigned long deferred_split_count(struct shrinker *shrink,
>  		struct shrink_control *sc)
>  {
> @@ -3424,6 +3507,9 @@ static void thp_util_scan(unsigned long pfn_end)
>  		/* Group THPs into utilization buckets */
>  		bucket = num_utilized_pages * THP_UTIL_BUCKET_NR / HPAGE_PMD_NR;
>  		bucket = min(bucket, THP_UTIL_BUCKET_NR - 1);
> +		if (bucket == 0)
> +			add_underutilized_thp(page);
> +
>  		thp_scan.buckets[bucket].nr_thps++;
>  		thp_scan.buckets[bucket].nr_zero_pages += (HPAGE_PMD_NR - num_utilized_pages);
>  	}
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index a05e5bef3b40..7e8b324cc840 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -140,6 +140,32 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item)
>  }
>  EXPORT_SYMBOL_GPL(list_lru_add);
>
> +bool list_lru_add_page(struct list_lru *lru, struct page *page, struct list_head *item)
> +{
> +	int nid = page_to_nid(page);
> +	struct list_lru_node *nlru = &lru->node[nid];
> +	struct list_lru_one *l;
> +	struct mem_cgroup *memcg;
> +
> +	spin_lock(&nlru->lock);
> +	if (list_empty(item)) {
> +		memcg = page_memcg(page);
> +		memcg_list_lru_alloc(memcg, lru, GFP_KERNEL);
> +		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
> +		list_add_tail(item, &l->list);
> +		/* Set shrinker bit if the first element was added */
> +		if (!l->nr_items++)
> +			set_shrinker_bit(memcg, nid,
> +					 lru_shrinker_id(lru));
> +		nlru->nr_items++;
> +		spin_unlock(&nlru->lock);
> +		return true;
> +	}
> +	spin_unlock(&nlru->lock);
> +	return false;
> +}
> +EXPORT_SYMBOL_GPL(list_lru_add_page);
> +
>  bool list_lru_del(struct list_lru *lru, struct list_head *item)
>  {
>  	int nid = page_to_nid(virt_to_page(item));
> @@ -160,6 +186,29 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
>  }
>  EXPORT_SYMBOL_GPL(list_lru_del);
>
> +bool list_lru_del_page(struct list_lru *lru, struct page *page, struct list_head *item)
> +{
> +	int nid = page_to_nid(page);
> +	struct list_lru_node *nlru = &lru->node[nid];
> +	struct list_lru_one *l;
> +	struct mem_cgroup *memcg;
> +
> +	spin_lock(&nlru->lock);
> +	if (!list_empty(item)) {
> +		memcg = page_memcg(page);
> +		memcg_list_lru_alloc(memcg, lru, GFP_KERNEL);
> +		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
> +		list_del_init(item);
> +		l->nr_items--;
> +		nlru->nr_items--;
> +		spin_unlock(&nlru->lock);
> +		return true;
> +	}
> +	spin_unlock(&nlru->lock);
> +	return false;
> +}
> +EXPORT_SYMBOL_GPL(list_lru_del_page);
> +
>  void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
>  {
>  	list_del_init(item);
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index e5486d47406e..a2a33b4d71db 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1327,6 +1327,12 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
>  		 * deferred_list.next -- ignore value.
>  		 */
>  		break;
> +	case 3:
> +		/*
> +		 * the third tail page: ->mapping is
> +		 * underutilized_thp_list.next -- ignore value.
> +		 */
> +		break;
>  	default:
>  		if (page->mapping != TAIL_MAPPING) {
>  			bad_page(page, "corrupted mapping in tail page");
> -- 
> 2.30.2


--
Best Regards,
Yan, Zi
