[<prev] [next>] [<thread-prev] [thread-next>] [day] [month] [year] [list]
Message-Id: <200709081415.54211.nickpiggin@yahoo.com.au>
Date: Sat, 8 Sep 2007 14:15:53 +1000
From: Nick Piggin <nickpiggin@...oo.com.au>
To: Bernd Schubert <bs@...eap.de>
Cc: Randy Dunlap <randy.dunlap@...cle.com>,
linux-kernel@...r.kernel.org,
"J. Bruce Fields" <bfields@...ldses.org>, brian@...sterfs.com,
Goswin von Brederlow <brederlo@...ormatik.uni-tuebingen.de>
Subject: Re: patch: improve generic_file_buffered_write() (2nd try 1/2)
On Thursday 06 September 2007 03:41, Bernd Schubert wrote:
> > This comment block should be:
> >
> > /**
> > * generic_file_buffered_write - handle an iov
> > * @iocb: file operations
> > * @iov: vector of data to write
> > * @nr_segs: number of iov segments
> > * @pos: position in the file
> > * @ppos: position in the file after this function
> > * @count: number of bytes to write
> > * @written: offset in iov->base (data to skip on write)
> > *
> > * This function will do 3 main tasks for each iov:
> > * - prepare a write
> > * - copy the data from iov into a new page
> > * - commit this page
>
> Thanks, done.
>
> I also removed the FIXMEs and created a second patch.
>
> Signed-off-by: Bernd Schubert <bs@...eap.de>
> Signed-off-by: Goswin von Brederlow <goswin@...brederlow.de>
Minor nit: when resubmitting a patch, you should include everything
(ie. the full changelog of problem statement and fix description) in a
single mail. It's just a bit easier...
So I believe the problem is that for a multi-segment iovec, we currently
prepare_write/commit_write once for each segment, right? We do this
because there is a nasty deadlock in the VM (copy_from_user being
called with a page locked), and copying multiple segs dramatically
increases the chances that one of these copies will cause a page fault
and thus potentially deadlock.
The fix you have I don't think can work because a filesystem must be
notified of the modification _before_ it has happened. (If I understand
correctly, you are skipping the prepare_write potentially until after
some data is copied?).
Anyway, there are fixes for this deadlock in Andrew's -mm tree, but
also a workaround for the NFSD problem in git commit 29dbb3fc. Did
you try a later kernel to see if it is fixed there?
Thanks,
Nick
>
> mm/filemap.c | 142 +++++++++++++++++++++++++++++++++----------------
> 1 file changed, 96 insertions(+), 46 deletions(-)
>
> Index: linux-2.6.20.3/mm/filemap.c
> ===================================================================
> --- linux-2.6.20.3.orig/mm/filemap.c 2007-09-05 14:04:18.000000000 +0200
> +++ linux-2.6.20.3/mm/filemap.c 2007-09-05 18:50:26.000000000 +0200
> @@ -2057,6 +2057,21 @@
> }
> EXPORT_SYMBOL(generic_file_direct_write);
>
> +/**
> + * generic_file_buffered_write - handle iov'ectors
> + * @iob: file operations
> + * @iov: vector of data to write
> + * @nr_segs: number of iov segments
> + * @pos: position in the file
> + * @ppos: position in the file after this function
> + * @count: number of bytes to write
> + * written: offset in iov->base (data to skip on write)
> + *
> + * This function will do 3 main tasks for each iov:
> + * - prepare a write
> + * - copy the data from iov into a new page
> + * - commit this page
> + */
> ssize_t
> generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> unsigned long nr_segs, loff_t pos, loff_t *ppos,
> @@ -2074,6 +2089,11 @@
> const struct iovec *cur_iov = iov; /* current iovec */
> size_t iov_base = 0; /* offset in the current iovec */
> char __user *buf;
> + unsigned long data_start = (pos & (PAGE_CACHE_SIZE -1)); /* Within page
> */ + loff_t wpos = pos; /* the position in the file we will return */ +
> + /* position in file as index of pages */
> + unsigned long index = pos >> PAGE_CACHE_SHIFT;
>
> pagevec_init(&lru_pvec, 0);
>
> @@ -2087,9 +2107,15 @@
> buf = cur_iov->iov_base + iov_base;
> }
>
> + page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
> + if (!page) {
> + status = -ENOMEM;
> + goto out;
> + }
> +
> do {
> - unsigned long index;
> unsigned long offset;
> + unsigned long data_end; /* end of data within the page */
> size_t copied;
>
> offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
> @@ -2106,6 +2132,8 @@
> */
> bytes = min(bytes, cur_iov->iov_len - iov_base);
>
> + data_end = offset + bytes;
> +
> /*
> * Bring in the user page that we will copy from _first_.
> * Otherwise there's a nasty deadlock on copying from the
> @@ -2114,34 +2142,30 @@
> */
> fault_in_pages_readable(buf, bytes);
>
> - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
> - if (!page) {
> - status = -ENOMEM;
> - break;
> - }
> -
> if (unlikely(bytes == 0)) {
> status = 0;
> copied = 0;
> goto zero_length_segment;
> }
>
> - status = a_ops->prepare_write(file, page, offset, offset+bytes);
> - if (unlikely(status)) {
> - loff_t isize = i_size_read(inode);
> -
> - if (status != AOP_TRUNCATED_PAGE)
> - unlock_page(page);
> - page_cache_release(page);
> - if (status == AOP_TRUNCATED_PAGE)
> - continue;
> + if (data_end == PAGE_CACHE_SIZE || count == bytes) {
> /*
> - * prepare_write() may have instantiated a few blocks
> - * outside i_size. Trim these off again.
> + * Only prepare a write if its an entire page or if
> + * we don't have more data
> */
> - if (pos + bytes > isize)
> - vmtruncate(inode, isize);
> - break;
> + status = a_ops->prepare_write(file, page, data_start, data_end);
> + if (unlikely(status)) {
> + loff_t isize = i_size_read(inode);
> +
> + if (status == AOP_TRUNCATED_PAGE)
> + continue;
> + /*
> + * prepare_write() may have instantiated a few blocks
> + * outside i_size. Trim these off again.
> + */
> + if (pos + bytes > isize)
> + vmtruncate(inode, isize);
> + }
> }
> if (likely(nr_segs == 1))
> copied = filemap_copy_from_user(page, offset,
> @@ -2149,60 +2173,86 @@
> else
> copied = filemap_copy_from_user_iovec(page, offset,
> cur_iov, iov_base, bytes);
> - flush_dcache_page(page);
> - status = a_ops->commit_write(file, page, offset, offset+bytes);
> - if (status == AOP_TRUNCATED_PAGE) {
> +
> + if (data_end == PAGE_CACHE_SIZE || count == bytes) {
> + /*
> + * Same as above, always try fill pages up to
> + * PAGE_CACHE_SIZE if possible (sufficient data)
> + */
> + flush_dcache_page(page);
> + status = a_ops->commit_write(file, page,
> + data_start, data_end);
> + if (status == AOP_TRUNCATED_PAGE) {
> + continue;
> + }
> + unlock_page(page);
> + mark_page_accessed(page);
> page_cache_release(page);
> - continue;
> + balance_dirty_pages_ratelimited(mapping);
> + cond_resched();
> + if (bytes < count) { /* still more iov data to write */
> + page = __grab_cache_page(mapping, index + 1,
> + &cached_page, &lru_pvec);
> + if (!page) {
> + status = -ENOMEM;
> + goto out;
> + }
> + } else {
> + page = NULL;
> + }
> + written += data_end - data_start;
> + wpos += data_end - data_start;
> + data_start = 0; /* correct would be data_end % PAGE_SIZE
> + * but if this would be != 0, we don't
> + * have more data
> + */
> }
> zero_length_segment:
> if (likely(copied >= 0)) {
> - if (!status)
> - status = copied;
> -
> - if (status >= 0) {
> - written += status;
> - count -= status;
> - pos += status;
> - buf += status;
> + if (!status) {
> + count -= copied;
> + pos += copied;
> + buf += copied;
> if (unlikely(nr_segs > 1)) {
> filemap_set_next_iovec(&cur_iov,
> - &iov_base, status);
> + &iov_base, copied);
> if (count)
> buf = cur_iov->iov_base +
> iov_base;
> } else {
> - iov_base += status;
> + iov_base += copied;
> }
> }
> }
> if (unlikely(copied != bytes))
> - if (status >= 0)
> + if (!status)
> status = -EFAULT;
> - unlock_page(page);
> - mark_page_accessed(page);
> - page_cache_release(page);
> - if (status < 0)
> + if (status)
> break;
> - balance_dirty_pages_ratelimited(mapping);
> - cond_resched();
> } while (count);
> - *ppos = pos;
> +
> +out:
> + *ppos = wpos;
>
> if (cached_page)
> page_cache_release(cached_page);
>
> + if (page) {
> + unlock_page(page);
> + page_cache_release(page);
> + }
> +
> /*
> * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
> */
> - if (likely(status >= 0)) {
> + if (likely(!status)) {
> if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
> if (!a_ops->writepage || !is_sync_kiocb(iocb))
> status = generic_osync_inode(inode, mapping,
> OSYNC_METADATA|OSYNC_DATA);
> }
> - }
> -
> + }
> +
> /*
> * If we get here for O_DIRECT writes then we must have fallen through
> * to buffered writes (block instantiation inside i_size). So we sync
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Powered by blists - more mailing lists