lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-ID: <denilwdvfb772l432ezexwmy46rzv7disxhryf2ktqmtfk5khe@ghq3sohl5z3w>
Date: Mon, 10 Jun 2024 12:23:03 +0000
From: Daniel Gomez <da.gomez@...sung.com>
To: Baolin Wang <baolin.wang@...ux.alibaba.com>
CC: "akpm@...ux-foundation.org" <akpm@...ux-foundation.org>,
	"hughd@...gle.com" <hughd@...gle.com>, "willy@...radead.org"
	<willy@...radead.org>, "david@...hat.com" <david@...hat.com>,
	"wangkefeng.wang@...wei.com" <wangkefeng.wang@...wei.com>,
	"ying.huang@...el.com" <ying.huang@...el.com>, "21cnbao@...il.com"
	<21cnbao@...il.com>, "ryan.roberts@....com" <ryan.roberts@....com>,
	"shy828301@...il.com" <shy828301@...il.com>, "ziy@...dia.com"
	<ziy@...dia.com>, "ioworker0@...il.com" <ioworker0@...il.com>, Pankaj Raghav
	<p.raghav@...sung.com>, "linux-mm@...ck.org" <linux-mm@...ck.org>,
	"linux-kernel@...r.kernel.org" <linux-kernel@...r.kernel.org>
Subject: Re: [PATCH v4 3/6] mm: shmem: add multi-size THP sysfs interface
 for anonymous shmem

Hi Baolin,
On Tue, Jun 04, 2024 at 06:17:47PM +0800, Baolin Wang wrote:
> To support the use of mTHP with anonymous shmem, add a new sysfs interface
> 'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/'
> directory for each mTHP to control whether shmem is enabled for that mTHP,
> with a value similar to the top level 'shmem_enabled', which can be set to:
> "always", "inherit (to inherit the top level setting)", "within_size", "advise",
> "never". An 'inherit' option is added to ensure compatibility with these
> global settings, and the options 'force' and 'deny' are dropped, which are
> rather testing artifacts from the old ages.
> 
> By default, PMD-sized hugepages have enabled="inherit" and all other hugepage
> sizes have enabled="never" for '/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'.
> 
> In addition, if top level value is 'force', then only PMD-sized hugepages
> have enabled="inherit"; otherwise the configuration will fail, and vice versa.
> That means now we will avoid using non-PMD sized THP to override the global
> huge allocation.
> 
> Signed-off-by: Baolin Wang <baolin.wang@...ux.alibaba.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst | 23 ++++++
>  include/linux/huge_mm.h                    | 10 +++
>  mm/huge_memory.c                           | 11 +--
>  mm/shmem.c                                 | 96 ++++++++++++++++++++++
>  4 files changed, 132 insertions(+), 8 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index d414d3f5592a..b76d15e408b3 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -332,6 +332,29 @@ deny
>  force
>      Force the huge option on for all - very useful for testing;
>  
> +Shmem can also use "multi-size THP" (mTHP) by adding a new sysfs knob to control
> +mTHP allocation: '/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/shmem_enabled',
> +and its value for each mTHP is essentially consistent with the global setting.
> +An 'inherit' option is added to ensure compatibility with these global settings.
> +Conversely, the options 'force' and 'deny' are dropped, which are rather testing
> +artifacts from the old ages.
> +always
> +    Attempt to allocate <size> huge pages every time we need a new page;
> +
> +inherit
> +    Inherit the top-level "shmem_enabled" value. By default, PMD-sized hugepages
> +    have enabled="inherit" and all other hugepage sizes have enabled="never";
> +
> +never
> +    Do not allocate <size> huge pages;
> +
> +within_size
> +    Only allocate <size> huge page if it will be fully within i_size.
> +    Also respect fadvise()/madvise() hints;
> +
> +advise
> +    Only allocate <size> huge pages if requested with fadvise()/madvise();
> +
>  Need of application restart
>  ===========================
>  
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 020e2344eb86..fac21548c5de 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -6,6 +6,7 @@
>  #include <linux/mm_types.h>
>  
>  #include <linux/fs.h> /* only for vma_is_dax() */
> +#include <linux/kobject.h>
>  
>  vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf);
>  int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
> @@ -63,6 +64,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj,
>  				  struct kobj_attribute *attr, char *buf,
>  				  enum transparent_hugepage_flag flag);
>  extern struct kobj_attribute shmem_enabled_attr;
> +extern struct kobj_attribute thpsize_shmem_enabled_attr;
>  
>  /*
>   * Mask of all large folio orders supported for anonymous THP; all orders up to
> @@ -265,6 +267,14 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma,
>  	return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders);
>  }
>  
> +struct thpsize {
> +	struct kobject kobj;
> +	struct list_head node;
> +	int order;
> +};
> +
> +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
> +
>  enum mthp_stat_item {
>  	MTHP_STAT_ANON_FAULT_ALLOC,
>  	MTHP_STAT_ANON_FAULT_FALLBACK,
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 8e49f402d7c7..1360a1903b66 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -449,14 +449,6 @@ static void thpsize_release(struct kobject *kobj);
>  static DEFINE_SPINLOCK(huge_anon_orders_lock);
>  static LIST_HEAD(thpsize_list);
>  
> -struct thpsize {
> -	struct kobject kobj;
> -	struct list_head node;
> -	int order;
> -};
> -
> -#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj)
> -
>  static ssize_t thpsize_enabled_show(struct kobject *kobj,
>  				    struct kobj_attribute *attr, char *buf)
>  {
> @@ -517,6 +509,9 @@ static struct kobj_attribute thpsize_enabled_attr =
>  
>  static struct attribute *thpsize_attrs[] = {
>  	&thpsize_enabled_attr.attr,
> +#ifdef CONFIG_SHMEM
> +	&thpsize_shmem_enabled_attr.attr,
> +#endif
>  	NULL,
>  };
>  
> diff --git a/mm/shmem.c b/mm/shmem.c
> index ae358efc397a..643ff7516b4d 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -131,6 +131,14 @@ struct shmem_options {
>  #define SHMEM_SEEN_QUOTA 32
>  };
>  
> +#ifdef CONFIG_TRANSPARENT_HUGEPAGE
> +static unsigned long huge_anon_shmem_orders_always __read_mostly;
> +static unsigned long huge_anon_shmem_orders_madvise __read_mostly;
> +static unsigned long huge_anon_shmem_orders_inherit __read_mostly;
> +static unsigned long huge_anon_shmem_orders_within_size __read_mostly;
> +static DEFINE_SPINLOCK(huge_anon_shmem_orders_lock);
> +#endif

Since we are also applying the new sysfs knob controls to tmpfs and anon mm,
should we rename this to get rid of the anon prefix?

> +
>  #ifdef CONFIG_TMPFS
>  static unsigned long shmem_default_max_blocks(void)
>  {
> @@ -4672,6 +4680,12 @@ void __init shmem_init(void)
>  		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
>  	else
>  		shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
> +
> +	/*
> +	 * Default to setting PMD-sized THP to inherit the global setting and
> +	 * disable all other multi-size THPs, when anonymous shmem uses mTHP.
> +	 */
> +	huge_anon_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER);
>  #endif
>  	return;
>  
> @@ -4731,6 +4745,11 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
>  			huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY)
>  		return -EINVAL;
>  
> +	/* Do not override huge allocation policy with non-PMD sized mTHP */
> +	if (huge == SHMEM_HUGE_FORCE &&
> +	    huge_anon_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER))
> +		return -EINVAL;
> +
>  	shmem_huge = huge;
>  	if (shmem_huge > SHMEM_HUGE_DENY)
>  		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
> @@ -4738,6 +4757,83 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
>  }
>  
>  struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
> +
> +static ssize_t thpsize_shmem_enabled_show(struct kobject *kobj,
> +					  struct kobj_attribute *attr, char *buf)
> +{
> +	int order = to_thpsize(kobj)->order;
> +	const char *output;
> +
> +	if (test_bit(order, &huge_anon_shmem_orders_always))
> +		output = "[always] inherit within_size advise never";
> +	else if (test_bit(order, &huge_anon_shmem_orders_inherit))
> +		output = "always [inherit] within_size advise never";
> +	else if (test_bit(order, &huge_anon_shmem_orders_within_size))
> +		output = "always inherit [within_size] advise never";
> +	else if (test_bit(order, &huge_anon_shmem_orders_madvise))
> +		output = "always inherit within_size [advise] never";
> +	else
> +		output = "always inherit within_size advise [never]";
> +
> +	return sysfs_emit(buf, "%s\n", output);
> +}
> +
> +static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj,
> +					   struct kobj_attribute *attr,
> +					   const char *buf, size_t count)
> +{
> +	int order = to_thpsize(kobj)->order;
> +	ssize_t ret = count;
> +
> +	if (sysfs_streq(buf, "always")) {
> +		spin_lock(&huge_anon_shmem_orders_lock);
> +		clear_bit(order, &huge_anon_shmem_orders_inherit);
> +		clear_bit(order, &huge_anon_shmem_orders_madvise);
> +		clear_bit(order, &huge_anon_shmem_orders_within_size);
> +		set_bit(order, &huge_anon_shmem_orders_always);
> +		spin_unlock(&huge_anon_shmem_orders_lock);
> +	} else if (sysfs_streq(buf, "inherit")) {
> +		/* Do not override huge allocation policy with non-PMD sized mTHP */
> +		if (shmem_huge == SHMEM_HUGE_FORCE &&
> +		    order != HPAGE_PMD_ORDER)
> +			return -EINVAL;
> +
> +		spin_lock(&huge_anon_shmem_orders_lock);
> +		clear_bit(order, &huge_anon_shmem_orders_always);
> +		clear_bit(order, &huge_anon_shmem_orders_madvise);
> +		clear_bit(order, &huge_anon_shmem_orders_within_size);
> +		set_bit(order, &huge_anon_shmem_orders_inherit);
> +		spin_unlock(&huge_anon_shmem_orders_lock);
> +	} else if (sysfs_streq(buf, "within_size")) {
> +		spin_lock(&huge_anon_shmem_orders_lock);
> +		clear_bit(order, &huge_anon_shmem_orders_always);
> +		clear_bit(order, &huge_anon_shmem_orders_inherit);
> +		clear_bit(order, &huge_anon_shmem_orders_madvise);
> +		set_bit(order, &huge_anon_shmem_orders_within_size);
> +		spin_unlock(&huge_anon_shmem_orders_lock);
> +	} else if (sysfs_streq(buf, "madvise")) {
> +		spin_lock(&huge_anon_shmem_orders_lock);
> +		clear_bit(order, &huge_anon_shmem_orders_always);
> +		clear_bit(order, &huge_anon_shmem_orders_inherit);
> +		clear_bit(order, &huge_anon_shmem_orders_within_size);
> +		set_bit(order, &huge_anon_shmem_orders_madvise);
> +		spin_unlock(&huge_anon_shmem_orders_lock);
> +	} else if (sysfs_streq(buf, "never")) {
> +		spin_lock(&huge_anon_shmem_orders_lock);
> +		clear_bit(order, &huge_anon_shmem_orders_always);
> +		clear_bit(order, &huge_anon_shmem_orders_inherit);
> +		clear_bit(order, &huge_anon_shmem_orders_within_size);
> +		clear_bit(order, &huge_anon_shmem_orders_madvise);
> +		spin_unlock(&huge_anon_shmem_orders_lock);
> +	} else {
> +		ret = -EINVAL;
> +	}
> +
> +	return ret;
> +}
> +
> +struct kobj_attribute thpsize_shmem_enabled_attr =
> +	__ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store);
>  #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
>  
>  #else /* !CONFIG_SHMEM */
> -- 
> 2.39.3
> 

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ