Message-ID: <d52dc882-efdf-42d7-a477-46580650cf80@linux.alibaba.com>
Date: Wed, 4 Sep 2024 15:47:14 +0800
From: Baolin Wang <baolin.wang@...ux.alibaba.com>
To: Rik van Riel <riel@...riel.com>, Hugh Dickins <hughd@...gle.com>
Cc: Andrew Morton <akpm@...ux-foundation.org>,
"Darrick J. Wong" <djwong@...nel.org>, Vlastimil Babka <vbabka@...e.cz>,
Matthew Wilcox <willy@...radead.org>, kernel-team@...a.com,
linux-kernel@...r.kernel.org, linux-mm@...ck.org
Subject: Re: [PATCH v2] mm,tmpfs: consider end of file write in shmem_is_huge
On 2024/9/3 23:19, Rik van Riel wrote:
> Take the end of a file write into consideration when deciding whether
> or not to use huge pages for tmpfs files when the tmpfs filesystem is
> mounted with huge=within_size.
>
> This allows large writes that append to the end of a file to automatically
> use huge pages.
>
> Doing 4MB sequential writes without fallocate to a 16GB tmpfs file
> with fio, the numbers without THP or with huge=always stay the same,
> but the performance with huge=within_size now matches that of huge=always.
>
> huge before after
> 4kB pages 1560 MB/s 1560 MB/s
> within_size 1560 MB/s 4720 MB/s
> always 4720 MB/s 4720 MB/s
>
> Signed-off-by: Rik van Riel <riel@...riel.com>
LGTM. Feel free to add:
Reviewed-by: Baolin Wang <baolin.wang@...ux.alibaba.com>
Tested-by: Baolin Wang <baolin.wang@...ux.alibaba.com>
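
As a side note, for anyone who wants to reproduce the numbers without a
full fio setup, a minimal userspace approximation of the workload from the
changelog (sequential 4MB buffered writes, no fallocate, to a file on a
tmpfs mount) could look like the sketch below. The mount point, file path
and fill pattern are just placeholders, not taken from the patch or from
my own test setup:

/*
 * Rough approximation of the fio workload described in the changelog:
 * sequential 4MB buffered writes, no fallocate, to a file on tmpfs.
 * The path and total size are placeholders; adjust for your setup.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const size_t chunk = 4UL << 20;		/* 4MB per write */
	const size_t total = 16UL << 30;	/* 16GB file */
	char *buf;
	int fd;

	buf = malloc(chunk);
	if (!buf)
		return 1;
	memset(buf, 0xab, chunk);

	/* assumes tmpfs is mounted at /mnt/tmpfs with huge=within_size */
	fd = open("/mnt/tmpfs/testfile", O_CREAT | O_TRUNC | O_WRONLY, 0644);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	for (size_t done = 0; done < total; done += chunk) {
		if (write(fd, buf, chunk) != (ssize_t)chunk) {
			perror("write");
			return 1;
		}
	}

	close(fd);
	free(buf);
	return 0;
}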
> ---
> v2: rebased on mm-unstable, fixed up changelog as suggested by Andrew Morton
>
> fs/xfs/scrub/xfile.c | 6 ++--
> fs/xfs/xfs_buf_mem.c | 2 +-
> include/linux/shmem_fs.h | 8 +++---
> mm/huge_memory.c | 2 +-
> mm/khugepaged.c | 2 +-
> mm/shmem.c | 59 +++++++++++++++++++++-------------------
> mm/userfaultfd.c | 2 +-
> 7 files changed, 42 insertions(+), 39 deletions(-)
>
> diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
> index 9b5d98fe1f8a..c753c79df203 100644
> --- a/fs/xfs/scrub/xfile.c
> +++ b/fs/xfs/scrub/xfile.c
> @@ -126,7 +126,7 @@ xfile_load(
> unsigned int len;
> unsigned int offset;
>
> - if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
> SGP_READ) < 0)
> break;
> if (!folio) {
> @@ -196,7 +196,7 @@ xfile_store(
> unsigned int len;
> unsigned int offset;
>
> - if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> + if (shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
> SGP_CACHE) < 0)
> break;
> if (filemap_check_wb_err(inode->i_mapping, 0)) {
> @@ -267,7 +267,7 @@ xfile_get_folio(
> i_size_write(inode, pos + len);
>
> pflags = memalloc_nofs_save();
> - error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
> + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio,
> (flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
> memalloc_nofs_restore(pflags);
> if (error)
> diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
> index 9bb2d24de709..07bebbfb16ee 100644
> --- a/fs/xfs/xfs_buf_mem.c
> +++ b/fs/xfs/xfs_buf_mem.c
> @@ -149,7 +149,7 @@ xmbuf_map_page(
> return -ENOMEM;
> }
>
> - error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
> + error = shmem_get_folio(inode, pos >> PAGE_SHIFT, 0, &folio, SGP_CACHE);
> if (error)
> return error;
>
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 1564d7d3ca61..515a9a6a3c6f 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -113,11 +113,11 @@ int shmem_unuse(unsigned int type);
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> unsigned long shmem_allowable_huge_orders(struct inode *inode,
> struct vm_area_struct *vma, pgoff_t index,
> - bool shmem_huge_force);
> + loff_t write_end, bool shmem_huge_force);
> #else
> static inline unsigned long shmem_allowable_huge_orders(struct inode *inode,
> struct vm_area_struct *vma, pgoff_t index,
> - bool shmem_huge_force)
> + loff_t write_end, bool shmem_huge_force)
> {
> return 0;
> }
> @@ -143,8 +143,8 @@ enum sgp_type {
> SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */
> };
>
> -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
> - enum sgp_type sgp);
> +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
> + struct folio **foliop, enum sgp_type sgp);
> struct folio *shmem_read_folio_gfp(struct address_space *mapping,
> pgoff_t index, gfp_t gfp);
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0993dfe9ae94..382938e46f96 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -164,7 +164,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
> */
> if (!in_pf && shmem_file(vma->vm_file))
> return shmem_allowable_huge_orders(file_inode(vma->vm_file),
> - vma, vma->vm_pgoff,
> + vma, vma->vm_pgoff, 0,
> !enforce_sysfs);
>
> if (!vma_is_anonymous(vma)) {
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 32100041aef3..f9c39898eaff 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1870,7 +1870,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
> if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
> xas_unlock_irq(&xas);
> /* swap in or instantiate fallocated page */
> - if (shmem_get_folio(mapping->host, index,
> + if (shmem_get_folio(mapping->host, index, 0,
> &folio, SGP_NOALLOC)) {
> result = SCAN_FAIL;
> goto xa_unlocked;
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 553b99cb265e..375ae3f170a3 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -549,7 +549,8 @@ static bool shmem_confirm_swap(struct address_space *mapping,
> static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
>
> static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
> - bool shmem_huge_force, struct vm_area_struct *vma,
> + loff_t write_end, bool shmem_huge_force,
> + struct vm_area_struct *vma,
> unsigned long vm_flags)
> {
> struct mm_struct *mm = vma ? vma->vm_mm : NULL;
> @@ -569,7 +570,8 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
> return true;
> case SHMEM_HUGE_WITHIN_SIZE:
> index = round_up(index + 1, HPAGE_PMD_NR);
> - i_size = round_up(i_size_read(inode), PAGE_SIZE);
> + i_size = max(write_end, i_size_read(inode));
> + i_size = round_up(i_size, PAGE_SIZE);
> if (i_size >> PAGE_SHIFT >= index)
> return true;
> fallthrough;
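
Not an issue, just to spell out the effect of this hunk for readers
following along: taking max(write_end, i_size_read(inode)) is what lets an
appending write pass the within_size check before i_size has been updated.
Below is a small stand-alone sketch of the arithmetic; the PAGE_SIZE and
HPAGE_PMD_NR values assume x86-64 with 2MB PMD huge pages, and the helpers
are simplified stand-ins for the kernel macros, not the real code:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1ULL << PAGE_SHIFT)
#define HPAGE_PMD_NR	512ULL		/* 2MB / 4kB */

static unsigned long long round_up_ull(unsigned long long x,
				       unsigned long long m)
{
	return (x + m - 1) / m * m;
}

/* mirrors the SHMEM_HUGE_WITHIN_SIZE case in the hunk above */
static bool within_size_huge(unsigned long long index,
			     unsigned long long i_size,
			     unsigned long long write_end)
{
	unsigned long long size = write_end > i_size ? write_end : i_size;

	index = round_up_ull(index + 1, HPAGE_PMD_NR);
	size = round_up_ull(size, PAGE_SIZE);
	return size >> PAGE_SHIFT >= index;
}

int main(void)
{
	/* appending 4MB at the end of a currently empty file */
	unsigned long long i_size = 0, write_end = 4ULL << 20;
	unsigned long long index = i_size >> PAGE_SHIFT;

	printf("old behaviour (write_end ignored):   %d\n",
	       within_size_huge(index, i_size, 0));
	printf("new behaviour (write_end = pos+len): %d\n",
	       within_size_huge(index, i_size, write_end));
	return 0;
}

With write_end ignored the rounded size is 0 pages, so the check fails;
with write_end = pos + len it covers 1024 pages and the huge allocation is
allowed, which matches the huge=within_size numbers in the changelog.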
> @@ -583,14 +585,14 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
> }
>
> static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
> - bool shmem_huge_force, struct vm_area_struct *vma,
> - unsigned long vm_flags)
> + loff_t write_end, bool shmem_huge_force,
> + struct vm_area_struct *vma, unsigned long vm_flags)
> {
> if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
> return false;
>
> - return __shmem_huge_global_enabled(inode, index, shmem_huge_force,
> - vma, vm_flags);
> + return __shmem_huge_global_enabled(inode, index, write_end,
> + shmem_huge_force, vma, vm_flags);
> }
>
> #if defined(CONFIG_SYSFS)
> @@ -770,8 +772,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
> }
>
> static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
> - bool shmem_huge_force, struct vm_area_struct *vma,
> - unsigned long vm_flags)
> + loff_t write_end, bool shmem_huge_force,
> + struct vm_area_struct *vma, unsigned long vm_flags)
> {
> return false;
> }
> @@ -978,7 +980,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
> * (although in some cases this is just a waste of time).
> */
> folio = NULL;
> - shmem_get_folio(inode, index, &folio, SGP_READ);
> + shmem_get_folio(inode, index, 0, &folio, SGP_READ);
> return folio;
> }
>
> @@ -1166,7 +1168,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
> STATX_ATTR_NODUMP);
> generic_fillattr(idmap, request_mask, inode, stat);
>
> - if (shmem_huge_global_enabled(inode, 0, false, NULL, 0))
> + if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
> stat->blksize = HPAGE_PMD_SIZE;
>
> if (request_mask & STATX_BTIME) {
> @@ -1653,7 +1655,7 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
> #ifdef CONFIG_TRANSPARENT_HUGEPAGE
> unsigned long shmem_allowable_huge_orders(struct inode *inode,
> struct vm_area_struct *vma, pgoff_t index,
> - bool shmem_huge_force)
> + loff_t write_end, bool shmem_huge_force)
> {
> unsigned long mask = READ_ONCE(huge_shmem_orders_always);
> unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
> @@ -1670,8 +1672,8 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode,
> if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
> return 0;
>
> - global_huge = shmem_huge_global_enabled(inode, index, shmem_huge_force,
> - vma, vm_flags);
> + global_huge = shmem_huge_global_enabled(inode, index, write_end,
> + shmem_huge_force, vma, vm_flags);
> if (!vma || !vma_is_anon_shmem(vma)) {
> /*
> * For tmpfs, we now only support PMD sized THP if huge page
> @@ -2231,8 +2233,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
> * vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
> */
> static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
> - struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
> - struct vm_fault *vmf, vm_fault_t *fault_type)
> + loff_t write_end, struct folio **foliop, enum sgp_type sgp,
> + gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
> {
> struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
> struct mm_struct *fault_mm;
> @@ -2312,7 +2314,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
> }
>
> /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
> - orders = shmem_allowable_huge_orders(inode, vma, index, false);
> + orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
> if (orders > 0) {
> gfp_t huge_gfp;
>
> @@ -2413,6 +2415,7 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
> * shmem_get_folio - find, and lock a shmem folio.
> * @inode: inode to search
> * @index: the page index.
> + * @write_end: end of a write, could extend inode size
> * @foliop: pointer to the folio if found
> * @sgp: SGP_* flags to control behavior
> *
> @@ -2432,10 +2435,10 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
> * Context: May sleep.
> * Return: 0 if successful, else a negative error code.
> */
> -int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
> - enum sgp_type sgp)
> +int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
> + struct folio **foliop, enum sgp_type sgp)
> {
> - return shmem_get_folio_gfp(inode, index, foliop, sgp,
> + return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
> mapping_gfp_mask(inode->i_mapping), NULL, NULL);
> }
> EXPORT_SYMBOL_GPL(shmem_get_folio);
> @@ -2530,7 +2533,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
> }
>
> WARN_ON_ONCE(vmf->page != NULL);
> - err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
> + err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
> gfp, vmf, &ret);
> if (err)
> return vmf_error(err);
> @@ -3040,7 +3043,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
> return -EPERM;
> }
>
> - ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
> + ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
> if (ret)
> return ret;
>
> @@ -3111,7 +3114,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
> break;
> }
>
> - error = shmem_get_folio(inode, index, &folio, SGP_READ);
> + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
> if (error) {
> if (error == -EINVAL)
> error = 0;
> @@ -3287,7 +3290,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
> if (*ppos >= i_size_read(inode))
> break;
>
> - error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
> + error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
> SGP_READ);
> if (error) {
> if (error == -EINVAL)
> @@ -3477,8 +3480,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
> else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
> error = -ENOMEM;
> else
> - error = shmem_get_folio(inode, index, &folio,
> - SGP_FALLOC);
> + error = shmem_get_folio(inode, index, offset + len,
> + &folio, SGP_FALLOC);
> if (error) {
> info->fallocend = undo_fallocend;
> /* Remove the !uptodate folios we added */
> @@ -3829,7 +3832,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
> } else {
> inode_nohighmem(inode);
> inode->i_mapping->a_ops = &shmem_aops;
> - error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
> + error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
> if (error)
> goto out_remove_offset;
> inode->i_op = &shmem_symlink_inode_operations;
> @@ -3875,7 +3878,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
> return ERR_PTR(-ECHILD);
> }
> } else {
> - error = shmem_get_folio(inode, 0, &folio, SGP_READ);
> + error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
> if (error)
> return ERR_PTR(error);
> if (!folio)
> @@ -5343,7 +5346,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
> struct folio *folio;
> int error;
>
> - error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
> + error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
> gfp, NULL, NULL);
> if (error)
> return ERR_PTR(error);
> diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
> index 6ef42d9cd482..ce13c4062647 100644
> --- a/mm/userfaultfd.c
> +++ b/mm/userfaultfd.c
> @@ -391,7 +391,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
> struct page *page;
> int ret;
>
> - ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
> + ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
> /* Our caller expects us to return -EFAULT if we failed to find folio */
> if (ret == -ENOENT)
> ret = -EFAULT;