From 9ed5720e8a3139cc703ae19aef5fd3a72658d132 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Mar 2014 09:36:58 -0800 Subject: [PATCH 1/2] vfs: atomic f_pos accesses as per POSIX Our write() system call has always been atomic in the sense that you get the expected thread-safe contiguous write, but we haven't actually guaranteed that concurrent writes are serialized wrt f_pos accesses, so threads (or processes) that share a file descriptor and use "write()" concurrently would quite likely overwrite each others data. This violates POSIX.1-2008/SUSv4 Section XSI 2.9.7 that says: "2.9.7 Thread Interactions with Regular File Operations All of the following functions shall be atomic with respect to each other in the effects specified in POSIX.1-2008 when they operate on regular files or symbolic links: [...]" and one of the effects is the file position update. This unprotected file position behavior is not new behavior, and nobody has ever cared. Until now. Yongzhi Pan reported unexpected behavior to Michael Kerrisk that was due to this. This resolves the issue with a f_pos-specific lock that is taken by read/write/lseek on file descriptors that may be shared across threads or processes. Reported-by: Yongzhi Pan Reported-by: Michael Kerrisk Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/file_table.c | 1 + fs/namei.c | 2 +- fs/open.c | 4 ++++ fs/read_write.c | 54 ++++++++++++++++++++++++++++++++++++++-------------- include/linux/file.h | 6 ++++-- include/linux/fs.h | 6 +++++- 6 files changed, 55 insertions(+), 18 deletions(-) diff --git a/fs/file_table.c b/fs/file_table.c index 5fff9030be34..5b24008ea4f6 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -135,6 +135,7 @@ struct file *get_empty_filp(void) atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); + mutex_init(&f->f_pos_lock); eventpoll_init_file(f); /* f->f_version: 0 */ return f; diff --git a/fs/namei.c b/fs/namei.c index 385f7817bfcc..2f730ef9b4b3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1884,7 +1884,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (f.need_put) + if (f.flags & FDPUT_FPUT) *fp = f.file; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); rcu_read_lock(); diff --git a/fs/open.c b/fs/open.c index 4b3e1edf2fe4..b9ed8b25c108 100644 --- a/fs/open.c +++ b/fs/open.c @@ -705,6 +705,10 @@ static int do_dentry_open(struct file *f, return 0; } + /* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */ + if (S_ISREG(inode->i_mode)) + f->f_mode |= FMODE_ATOMIC_POS; + f->f_op = fops_get(inode->i_fop); if (unlikely(WARN_ON(!f->f_op))) { error = -ENODEV; diff --git a/fs/read_write.c b/fs/read_write.c index edc5746a902a..e5b43dd53b94 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -264,10 +264,36 @@ loff_t vfs_llseek(struct file *file, loff_t offset, int whence) } EXPORT_SYMBOL(vfs_llseek); +/* + * We only lock f_pos if we have threads or if the file might be + * shared with another process. In both cases we'll have an elevated + * file count (done either by fdget() or by fork()). + */ +static struct fd fdget_pos(int fd) +{ + struct fd f = fdget(fd); + struct file *file = f.file; + + if (file && (file->f_mode & FMODE_ATOMIC_POS)) { + if (file_count(file) > 1) { + f.flags |= FDPUT_POS_UNLOCK; + mutex_lock(&file->f_pos_lock); + } + } + return f; +} + +static void fdput_pos(struct fd f) +{ + if (f.flags & FDPUT_POS_UNLOCK) + mutex_unlock(&f.file->f_pos_lock); + fdput(f); +} + SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) { off_t retval; - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); if (!f.file) return -EBADF; @@ -278,7 +304,7 @@ SYSCALL_DEFINE3(lseek, unsigned int, fd, off_t, offset, unsigned int, whence) if (res != (loff_t)retval) retval = -EOVERFLOW; /* LFS: should only happen on 32 bit platforms */ } - fdput(f); + fdput_pos(f); return retval; } @@ -498,7 +524,7 @@ static inline void file_pos_write(struct file *file, loff_t pos) SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -506,7 +532,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) ret = vfs_read(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; } @@ -514,7 +540,7 @@ SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count) SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -522,7 +548,7 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, ret = vfs_write(f.file, buf, count, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } return ret; @@ -797,7 +823,7 @@ EXPORT_SYMBOL(vfs_writev); SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -805,7 +831,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_readv(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -817,7 +843,7 @@ SYSCALL_DEFINE3(readv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; if (f.file) { @@ -825,7 +851,7 @@ SYSCALL_DEFINE3(writev, unsigned long, fd, const struct iovec __user *, vec, ret = vfs_writev(f.file, vec, vlen, &pos); if (ret >= 0) file_pos_write(f.file, pos); - fdput(f); + fdput_pos(f); } if (ret > 0) @@ -968,7 +994,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret; loff_t pos; @@ -978,7 +1004,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, ret = compat_readv(f.file, vec, vlen, &pos); if (ret >= 0) f.file->f_pos = pos; - fdput(f); + fdput_pos(f); return ret; } @@ -1035,7 +1061,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, const struct compat_iovec __user *, vec, compat_ulong_t, vlen) { - struct fd f = fdget(fd); + struct fd f = fdget_pos(fd); ssize_t ret; loff_t pos; @@ -1045,7 +1071,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, ret = compat_writev(f.file, vec, vlen, &pos); if (ret >= 0) f.file->f_pos = pos; - fdput(f); + fdput_pos(f); return ret; } diff --git a/include/linux/file.h b/include/linux/file.h index cbacf4faf447..f2517fa2d610 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -28,12 +28,14 @@ static inline void fput_light(struct file *file, int fput_needed) struct fd { struct file *file; - int need_put; + unsigned int flags; }; +#define FDPUT_FPUT 1 +#define FDPUT_POS_UNLOCK 2 static inline void fdput(struct fd fd) { - if (fd.need_put) + if (fd.flags & FDPUT_FPUT) fput(fd.file); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 60829565e552..ebfde04bca06 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -123,6 +123,9 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)0x4000) +/* File needs atomic accesses to f_pos */ +#define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) + /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x1000000) @@ -780,13 +783,14 @@ struct file { const struct file_operations *f_op; /* - * Protects f_ep_links, f_flags, f_pos vs i_size in lseek SEEK_CUR. + * Protects f_ep_links, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; + struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred; -- 1.9.0