linux-kernel - patch: improve generic_file_buffered

lists.openwall.net		lists / announce owl-users owl-dev john-users john-dev passwdqc-users yescrypt popa3d-users / oss-security kernel-hardening musl sabotage tlsify passwords / crypt-dev xvendor / Bugtraq Full-Disclosure linux-kernel linux-netdev linux-ext4 linux-hardening linux-cve-announce PHC
Open Source and information security mailing list archives
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <200709051546.06224.bs@q-leap.de>
Date:	Wed, 5 Sep 2007 15:45:36 +0200
From:	Bernd Schubert <bs@...eap.de>
To:	linux-kernel@...r.kernel.org
Cc:	"J. Bruce Fields" <bfields@...ldses.org>, brian@...sterfs.com
Subject: patch: improve generic_file_buffered_write()

Hi,

recently we discovered writing to a nfs-exported lustre filesystem is rather 
slow (20-40 MB/s writing, but over 200 MB/s reading).

As I already explained on the nfs mailing list, this happens since there is an 
offset on the very first page due to the nfs header.

http://sourceforge.net/mailarchive/forum.php?thread_name=200708312003.30446.bernd-schubert%40gmx.de&forum_name=nfs

While this especially effects lustre, Olaf Kirch also noticed it on another 
filesystem before and wrote a nfs patch for it. This patch has two 
disadvantages  - it requires to move all data within the pages, IMHO rather 
cpu time consuming, furthermore, it presently causes data corruption when 
more than one nfs thread is running.

After thinking it over and over again we (Goswin and I) believe it would be 
best to improve generic_file_buffered_write().
If there is sufficient data now, as it is usual for aio writes, 
generic_file_buffered_write() will now fill each page as much as possible and 
only then prepare/commit it. Before generic_file_buffered_write() commited 
chunks of pages even though there were still more data.

The attached patch still has two FIXMEs, both for likely()/unlikely() 
conditions which IMHO don't reflect the likelyhood for the new aio data 
functions.

filemap.c |  144 
+++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 96 insertions(+), 48 deletions(-)

Signed-off-by: Bernd Schubert <bs@...eap.de>
Signed-off-by: Goswin von Brederlow <goswin@...brederlow.de>


Cheers,
Goswin and Bernd


Index: linux-2.6.20.3/mm/filemap.c
===================================================================
--- linux-2.6.20.3.orig/mm/filemap.c	2007-09-04 13:43:04.000000000 +0200
+++ linux-2.6.20.3/mm/filemap.c	2007-09-05 12:39:23.000000000 +0200
@@ -2057,6 +2057,19 @@
 }
 EXPORT_SYMBOL(generic_file_direct_write);
 
+/**
+ * This function will do 3 main tasks for each iov:
+ * - prepare a write
+ * - copy the data from iov into a new page
+ * - commit this page
+ * @iob:	file operations
+ * @iov:	vector of data to write
+ * @nr_segs:	number of iov segments
+ * @pos:	position in the file
+ * @ppos:	position in the file after this function
+ * @count:	number of bytes to write
+ * written:	offset in iov->base (data to skip on write)
+ */
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos, loff_t *ppos,
@@ -2074,6 +2087,11 @@
 	const struct iovec *cur_iov = iov; /* current iovec */
 	size_t		iov_base = 0;	   /* offset in the current iovec */
 	char __user	*buf;
+	unsigned long	data_start = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+	loff_t		wpos = pos; /* the position in the file we will return */
+
+	/* position in file as index of pages */
+	unsigned long	index = pos >> PAGE_CACHE_SHIFT;
 
 	pagevec_init(&lru_pvec, 0);
 
@@ -2087,9 +2105,15 @@
 		buf = cur_iov->iov_base + iov_base;
 	}
 
+	page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
+	if (!page) {
+		status = -ENOMEM;
+		goto out;
+	}
+
 	do {
-		unsigned long index;
 		unsigned long offset;
+		unsigned long data_end; /* end of data within the page */
 		size_t copied;
 
 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
@@ -2106,6 +2130,8 @@
 		 */
 		bytes = min(bytes, cur_iov->iov_len - iov_base);
 
+		data_end = offset + bytes;
+
 		/*
 		 * Bring in the user page that we will copy from _first_.
 		 * Otherwise there's a nasty deadlock on copying from the
@@ -2114,95 +2140,117 @@
 		 */
 		fault_in_pages_readable(buf, bytes);
 
-		page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
-		if (!page) {
-			status = -ENOMEM;
-			break;
-		}
-
 		if (unlikely(bytes == 0)) {
 			status = 0;
 			copied = 0;
 			goto zero_length_segment;
 		}
 
-		status = a_ops->prepare_write(file, page, offset, offset+bytes);
-		if (unlikely(status)) {
-			loff_t isize = i_size_read(inode);
-
-			if (status != AOP_TRUNCATED_PAGE)
-				unlock_page(page);
-			page_cache_release(page);
-			if (status == AOP_TRUNCATED_PAGE)
-				continue;
+		if (data_end == PAGE_CACHE_SIZE || count == bytes) {
 			/*
-			 * prepare_write() may have instantiated a few blocks
-			 * outside i_size.  Trim these off again.
+			 * Only prepare a write if its an entire page or if
+			 * we don't have more data
 			 */
-			if (pos + bytes > isize)
-				vmtruncate(inode, isize);
-			break;
+			status = a_ops->prepare_write(file, page, data_start, data_end);
+			if (unlikely(status)) {
+				loff_t isize = i_size_read(inode);
+
+				if (status == AOP_TRUNCATED_PAGE)
+					continue;
+				/*
+				* prepare_write() may have instantiated a few blocks
+				* outside i_size.  Trim these off again.
+				*/
+				if (pos + bytes > isize)
+					vmtruncate(inode, isize);
+			}
 		}
-		if (likely(nr_segs == 1))
+		if (likely(nr_segs == 1)) /* FIXME: really likely with aio? */
 			copied = filemap_copy_from_user(page, offset,
 							buf, bytes);
 		else
 			copied = filemap_copy_from_user_iovec(page, offset,
 						cur_iov, iov_base, bytes);
-		flush_dcache_page(page);
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (status == AOP_TRUNCATED_PAGE) {
+
+		if (data_end == PAGE_CACHE_SIZE || count == bytes) {
+			/*
+			 * Same as above, always try fill pages up to
+			 * PAGE_CACHE_SIZE if possible (sufficient data)
+			 */
+			flush_dcache_page(page);
+			status = a_ops->commit_write(file, page,
+						     data_start, data_end);
+			if (status == AOP_TRUNCATED_PAGE) {
+				continue;
+			}
+			unlock_page(page);
+			mark_page_accessed(page);
 			page_cache_release(page);
-			continue;
+			balance_dirty_pages_ratelimited(mapping);
+			cond_resched();
+			if (bytes < count) { /* still more iov data to write */
+				page = __grab_cache_page(mapping, index + 1,
+							 &cached_page, &lru_pvec);
+				if (!page) {
+					status = -ENOMEM;
+					goto out;
+				}
+			} else {
+				page = NULL;
+			}
+			written += data_end - data_start;
+			wpos    += data_end - data_start;
+			data_start = 0; /* correct would be data_end % PAGE_SIZE
+			                 * but if this would be != 0, we don't
+			                 * have more data
+			                 */
 		}
 zero_length_segment:
 		if (likely(copied >= 0)) {
-			if (!status)
-				status = copied;
-
-			if (status >= 0) {
-				written += status;
-				count -= status;
-				pos += status;
-				buf += status;
-				if (unlikely(nr_segs > 1)) {
+			if (!status) {
+				count -= copied;
+				pos += copied;
+				buf += copied;
+				if (unlikely(nr_segs > 1)) { /* FIXME: really unlikely with aio? */
 					filemap_set_next_iovec(&cur_iov,
-							&iov_base, status);
+							&iov_base, copied);
 					if (count)
 						buf = cur_iov->iov_base +
 							iov_base;
 				} else {
-					iov_base += status;
+					iov_base += copied;
 				}
 			}
 		}
 		if (unlikely(copied != bytes))
-			if (status >= 0)
+			if (!status)
 				status = -EFAULT;
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (status < 0)
+		if (status)
 			break;
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
 	} while (count);
-	*ppos = pos;
+
+out:
+	*ppos = wpos;
 
 	if (cached_page)
 		page_cache_release(cached_page);
 
+	if (page) {
+		unlock_page(page);
+		page_cache_release(page);
+	}
+
 	/*
 	 * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
 	 */
-	if (likely(status >= 0)) {
+	if (likely(!status)) {
 		if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 			if (!a_ops->writepage || !is_sync_kiocb(iocb))
 				status = generic_osync_inode(inode, mapping,
 						OSYNC_METADATA|OSYNC_DATA);
 		}
-  	}
-	
+	}
+
 	/*
 	 * If we get here for O_DIRECT writes then we must have fallen through
 	 * to buffered writes (block instantiation inside i_size).  So we sync


-- 
Bernd Schubert
Q-Leap Networks GmbH

Content of type "application/pgp-signature" skipped