Message-ID: <ZHBowMEDfyrAAOWH@bombadil.infradead.org>
Date: Fri, 26 May 2023 01:07:28 -0700
From: Luis Chamberlain <mcgrof@...nel.org>
To: hughd@...gle.com, akpm@...ux-foundation.org, willy@...radead.org,
brauner@...nel.org, djwong@...nel.org
Cc: p.raghav@...sung.com, da.gomez@...sung.com, rohan.puri@...sung.com,
rpuri.linux@...il.com, a.manzanares@...sung.com, dave@...olabs.net,
yosryahmed@...gle.com, keescook@...omium.org, hare@...e.de,
kbusch@...nel.org, patches@...ts.linux.dev,
linux-block@...r.kernel.org, linux-fsdevel@...r.kernel.org,
linux-mm@...ck.org, linux-kernel@...r.kernel.org
Subject: Re: [RFC v2 0/8] add support for blocksize > PAGE_SIZE
On Fri, May 26, 2023 at 12:55:44AM -0700, Luis Chamberlain wrote:
> Future work:
>
> o shmem_file_read_iter()
And as for this, below is what I'm up to, but for the life of me I can't
figure out why I end up with an extra empty line at the end of the
copied file in my test with this, using the same simple test described
in the patch "shmem: add support to customize block size order".
I end up with:
root@...ap ~ # ./run.sh
2dcc06b7ca3b7dd8b5626af83c1be3cb08ddc76c /root/ordered.txt
a0466a798f2d967c143f0f716c344660dc360f78 /data-tmpfs/ordered.txt
File: /data-tmpfs/ordered.txt
  Size: 6888896    Blocks: 16384      IO Block: 4194304 regular file
Device: 0,44 Inode: 2 Links: 1
Access: (0644/-rw-r--r--)  Uid: (    0/    root)   Gid: (    0/    root)
Access: 2023-05-26 01:06:15.566330524 -0700
Modify: 2023-05-26 01:06:15.554330477 -0700
Change: 2023-05-26 01:06:15.554330477 -0700
Birth: 2023-05-26 01:06:15.534330399 -0700
root@...ap ~ # diff -u /root/ordered.txt /data-tmpfs/ordered.txt
--- /root/ordered.txt 2023-05-25 16:50:53.755019418 -0700
+++ /data-tmpfs/ordered.txt 2023-05-26 01:06:15.554330477 -0700
@@ -999998,3 +999998,4 @@
999998
999999
1000000
+
\ No newline at end of file
root@...ap ~ # cat run.sh
#!/bin/bash
# time for i in $(seq 1 1000000); do echo $i >> /root/ordered.txt; done
sha1sum /root/ordered.txt
mount -t tmpfs -o size=8M,border=22 -o noswap tmpfs /data-tmpfs/
cp /root/ordered.txt /data-tmpfs/
sha1sum /data-tmpfs/ordered.txt
stat /data-tmpfs/ordered.txt
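FWIW, to tell whether the extra bytes come from the read path rather
than from cp, the throwaway checker below is what I'd run next: it just
compares how much read() hands back against st_size. My own userspace
sketch, not part of the series, with the path hardcoded to the test
file above:

/* quick-check.c */
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	char buf[131072]; /* 128k chunks, as cp or sha1sum might use */
	struct stat st;
	long long total = 0;
	ssize_t n;
	int fd;

	fd = open("/data-tmpfs/ordered.txt", O_RDONLY);
	if (fd < 0 || fstat(fd, &st) < 0) {
		perror("open/fstat");
		return 1;
	}
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		total += n;
	printf("stat size: %lld bytes read back: %lld\n",
	       (long long)st.st_size, total);
	close(fd);
	return 0;
}

If those two numbers disagree, the read path is handing out bytes past
EOF and cp is off the hook.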
From 61008f03217b1524da317928885ef68a67abc773 Mon Sep 17 00:00:00 2001
From: Luis Chamberlain <mcgrof@...nel.org>
Date: Wed, 19 Apr 2023 20:42:54 -0700
Subject: [PATCH] shmem: convert shmem_file_read_iter() to folios
Signed-off-by: Luis Chamberlain <mcgrof@...nel.org>
---
mm/shmem.c | 74 +++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 56 insertions(+), 18 deletions(-)
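Note for reviewers: to keep the units straight, here is the mask
arithmetic the read loop below relies on, evaluated with the numbers
from the failing test above (border=22, i_size taken from the stat
output). Userspace sketch only; the constants are mine:

#include <stdio.h>

int main(void)
{
	unsigned long long bsize = 1ULL << 22;	/* border=22 -> 4M blocks */
	unsigned long long page_size = 4096;
	unsigned long long i_size = 6888896;	/* Size: from stat above */

	/* end_index stays in page units in the loop below */
	printf("end_index: %llu\n", i_size / page_size);		/* 1681 */
	/* the EOF clamp, now per block rather than per page */
	printf("i_size & (bsize - 1): %llu\n", i_size & (bsize - 1));	/* 2694592 */
	printf("i_size & ~PAGE_MASK:  %llu\n", i_size % page_size);	/* 3520 */
	return 0;
}

So end_index remains a page index while nr and offset are now block
relative, which is worth keeping in mind while reading the hunks below.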
diff --git a/mm/shmem.c b/mm/shmem.c
index 777e953df62e..2d3512f6dd30 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2431,6 +2431,10 @@ static struct inode *shmem_get_inode(struct mnt_idmap *idmap, struct super_block
inode->i_ino = ino;
inode_init_owner(idmap, inode, dir, mode);
inode->i_blocks = 0;
+ if (sb->s_flags & SB_KERNMOUNT)
+ inode->i_blkbits = PAGE_SHIFT;
+ else
+ inode->i_blkbits = sb->s_blocksize_bits;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
inode->i_generation = get_random_u32();
info = SHMEM_I(inode);
@@ -2676,19 +2680,42 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct address_space *mapping = inode->i_mapping;
+ struct super_block *sb = inode->i_sb;
+ u64 bsize = i_blocksize(inode);
pgoff_t index;
unsigned long offset;
int error = 0;
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
+	/*
+	 * Although our index is in page units, we can read a blocksize at
+	 * a time, since we use one folio per block.
+	 */
index = *ppos >> PAGE_SHIFT;
- offset = *ppos & ~PAGE_MASK;
+
+	/*
+	 * We're going to read one folio of size blocksize at a time.
+	 *
+	 * The offset represents the position in the folio at which we are
+	 * currently reading. It starts off as the offset into the first
+	 * folio at which we were asked to begin the read, and is then
+	 * advanced by the number of bytes read from each folio. Once the
+	 * first folio has been read, offset becomes 0, as each subsequent
+	 * folio is read from its beginning, a full blocksize at a time,
+	 * until we're done.
+	 */
+ offset = *ppos & (bsize - 1);
for (;;) {
struct folio *folio = NULL;
- struct page *page = NULL;
pgoff_t end_index;
+		/*
+		 * nr is the number of bytes we can read from each folio,
+		 * which depends on the blocksize set. On the last folio,
+		 * nr is the amount of data in that folio which is valid
+		 * to read for this inode.
+		 */
unsigned long nr, ret;
loff_t i_size = i_size_read(inode);
@@ -2696,7 +2723,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (index > end_index)
break;
if (index == end_index) {
- nr = i_size & ~PAGE_MASK;
+ nr = i_size & (bsize - 1);
if (nr <= offset)
break;
}
@@ -2709,9 +2736,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
if (folio) {
folio_unlock(folio);
-
- page = folio_file_page(folio, index);
- if (PageHWPoison(page)) {
+ if (is_folio_hwpoison(folio)) {
folio_put(folio);
error = -EIO;
break;
@@ -2722,49 +2747,56 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* We must evaluate after, since reads (unlike writes)
* are called without i_rwsem protection against truncate
*/
- nr = PAGE_SIZE;
+ nr = bsize;
+ WARN_ON(!(sb->s_flags & SB_KERNMOUNT) && folio && bsize != folio_size(folio));
i_size = i_size_read(inode);
end_index = i_size >> PAGE_SHIFT;
if (index == end_index) {
- nr = i_size & ~PAGE_MASK;
+ nr = i_size & (bsize - 1);
if (nr <= offset) {
if (folio)
folio_put(folio);
break;
}
}
+
+		/*
+		 * On the first folio the number of bytes we can read is
+		 * blocksize - offset. On subsequent folios we can read a
+		 * full blocksize at a time until iov_iter_count(to) == 0.
+		 */
nr -= offset;
if (folio) {
/*
- * If users can be writing to this page using arbitrary
+ * If users can be writing to this folio using arbitrary
* virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
+ * before reading the folio on the kernel side.
*/
if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ flush_dcache_folio(folio);
/*
- * Mark the page accessed if we read the beginning.
+ * Mark the folio accessed if we read the beginning.
*/
if (!offset)
folio_mark_accessed(folio);
/*
- * Ok, we have the page, and it's up-to-date, so
+ * Ok, we have the folio, and it's up-to-date, so
* now we can copy it to user space...
*/
- ret = copy_page_to_iter(page, offset, nr, to);
+ ret = copy_folio_to_iter(folio, offset, nr, to);
folio_put(folio);
} else if (user_backed_iter(to)) {
/*
* Copy to user tends to be so well optimized, but
* clear_user() not so much, that it is noticeably
- * faster to copy the zero page instead of clearing.
+ * faster to copy the zero folio instead of clearing.
*/
- ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
+ ret = copy_folio_to_iter(page_folio(ZERO_PAGE(0)), offset, nr, to);
} else {
/*
- * But submitting the same page twice in a row to
+ * But submitting the same folio twice in a row to
* splice() - or others? - can result in confusion:
* so don't attempt that optimization on pipes etc.
*/
@@ -2773,8 +2805,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
retval += ret;
offset += ret;
+
+	/*
+	 * Because we use one folio per block, we know this will advance
+	 * a full blocksize at a time after the first block has been read
+	 * at offset.
+	 */
index += offset >> PAGE_SHIFT;
- offset &= ~PAGE_MASK;
+ offset &= (bsize - 1);
if (!iov_iter_count(to))
break;
--
2.39.2
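FWIW, modeling a single iteration of the loop above in userspace with
the numbers from my test suggests where the stray bytes might come
from: nr is only clamped to i_size when index == end_index, i.e. on the
file's last *page*, but nr now spans a whole block. Toy model only,
with ppos picked as a 128k-aligned read inside the last block; purely
my speculation, not a conclusion:

#include <stdio.h>

int main(void)
{
	unsigned long long bsize = 1ULL << 22;
	unsigned long long i_size = 6888896;
	unsigned long long end_index = i_size >> 12;	/* 1681, page units */
	unsigned long long ppos = 6815744;	/* 52 * 128k, inside the last block */
	unsigned long long index = ppos >> 12;	/* 1664 */
	unsigned long long offset = ppos & (bsize - 1);
	unsigned long long nr = bsize;

	if (index == end_index)		/* never true before the last page */
		nr = i_size & (bsize - 1);
	nr -= offset;

	printf("nr: %llu valid bytes to EOF: %llu\n",
	       nr, i_size - ppos);	/* 1572864 vs 73152 */
	return 0;
}

If that holds, a read landing in the last block before its final page
can copy past EOF into the folio's zeroed tail, and trailing NUL bytes
would render exactly like the invisible extra line with no trailing
newline in the diff above. Just a guess for now.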