lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Date:	Thu, 25 Jun 2009 13:41:10 +0900
From:	Hisashi Hifumi <hifumi.hisashi@....ntt.co.jp>
To:	Andrew Morton <akpm@...ux-foundation.org>
Cc:	linux-kernel@...r.kernel.org, linux-fsdevel@...r.kernel.org,
	Al Viro <viro@...IV.linux.org.uk>,
	Christoph Hellwig <hch@...radead.org>
Subject: [RESEND][PATCH] lseek: remove i_mutex

Hi, 

Following patch removes i_mutex from generic_file_llseek.
I think the reason of protecting lseek with i_mutex is just
touching i_size (and f_pos) atomically.
So i_mutex is no longer needed here by introducing i_size_read 
to read i_size.
I also made f_pos access atomic here without i_mutex regarding
former i_mutex holders by introducing file_pos_read/file_pos_write
that use seqlock to preserve atomicity.

Following patch removes i_mutex from generic_file_llseek, and deletes 
generic_file_llseek_nolock totally.

Currently there is i_mutex contention not only around lseek, but also fsync or write.
So,  I think we can mitigate i_mutex contention between fsync, lseek and write by
removing i_mutex. For example, PostgreSQL (DBMS software) can obtain benefit 
from this patch. When checkpoint of DBMS started, lseek and write contend with i_mutex 
and throughput drops. But with this patch this performance drop can mitigate because of the
reduction of i_mutex contention. 

I did some test to measure i_mutex contention.
This test do:
	1. make an 128MB file.
	2. fork 100 processes. repeat 1000000 times lseeking and reading randomly on 
		each process to this file.
	3, gauge seconds between start and end of this test.

The result was:

	-2.6.31-rc1
	# time ./lseek_test
	182 sec

	real    3m2.751s
	user    0m13.273s
	sys     46m26.764s

	-2.6.31-rc1-patched
	# time ./lseek_test
	8 sec

	real    0m9.026s
	user    0m12.680s
	sys     2m0.643s 

Hardware environment:
CPU 2.4GHz(Quad Core) *4
Memory 64GB

Following is the test program "lseek_test.c".

#include <stdio.h>
#include <sched.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <signal.h>
#define NUM 100
#define LEN 4096
#define LOOP 32*1024

int num;

void catch_SIGCHLD(int signo)
{
	pid_t child_pid = 0;
	do {
		int child_ret;
		child_pid = waitpid(-1, &child_ret, WNOHANG);
		if (child_pid > 0)
			num++;
	} while (child_pid > 0);
}
main()
{
        int  i, pid;
	char buf[LEN];
	unsigned long offset, filesize;
	time_t t1, t2;
	struct sigaction act;
	memset(buf, 0, LEN);
	memset(&act, 0, sizeof(act));
	act.sa_handler = catch_SIGCHLD;
	act.sa_flags = SA_NOCLDSTOP|SA_RESTART;
	sigemptyset(&act.sa_mask);
	sigaction(SIGCHLD, &act, NULL);	

	filesize = LEN * LOOP;
	int fd = open("targetfile1",O_RDWR|O_CREAT);	

	/* create a 128MB file */
	for(i = 0; i < LOOP; i++)
		write(fd, buf, LEN);
	fsync(fd);
	close(fd);
	
	time(&t1);	
	for (i = 0; i < NUM; i++) {	
		pid = fork();
		if(pid == 0){
		/* child */
			int fd = open("targetfile1", O_RDWR);	
			int j;
			for (j = 0; j < 1000000; j++) {
				offset = (random() % filesize);
				lseek(fd, offset, SEEK_SET);
				read(fd, buf, 10);
			}
			close(fd);
			exit(0);
		}
	}
	while(num < NUM)
		sleep(1);
	time(&t2);	
	printf("%d sec\n",t2-t1);
}



Thanks.

Signed-off-by: Hisashi Hifumi <hifumi.hisashi@....ntt.co.jp>

diff -Nrup linux-2.6.31-rc1.org/fs/cifs/cifsfs.c linux-2.6.31-rc1/fs/cifs/cifsfs.c
--- linux-2.6.31-rc1.org/fs/cifs/cifsfs.c	2009-06-25 10:22:44.000000000 +0900
+++ linux-2.6.31-rc1/fs/cifs/cifsfs.c	2009-06-25 10:30:27.000000000 +0900
@@ -641,7 +641,7 @@ static loff_t cifs_llseek(struct file *f
 		if (retval < 0)
 			return (loff_t)retval;
 	}
-	return generic_file_llseek_unlocked(file, offset, origin);
+	return generic_file_llseek(file, offset, origin);
 }
 
 #ifdef CONFIG_CIFS_EXPERIMENTAL
diff -Nrup linux-2.6.31-rc1.org/fs/file_table.c linux-2.6.31-rc1/fs/file_table.c
--- linux-2.6.31-rc1.org/fs/file_table.c	2009-06-25 10:22:44.000000000 +0900
+++ linux-2.6.31-rc1/fs/file_table.c	2009-06-25 10:31:12.000000000 +0900
@@ -126,6 +126,7 @@ struct file *get_empty_filp(void)
 
 	INIT_LIST_HEAD(&f->f_u.fu_list);
 	atomic_long_set(&f->f_count, 1);
+	f_pos_ordered_init(f);
 	rwlock_init(&f->f_owner.lock);
 	f->f_cred = get_cred(cred);
 	spin_lock_init(&f->f_lock);
diff -Nrup linux-2.6.31-rc1.org/fs/gfs2/file.c linux-2.6.31-rc1/fs/gfs2/file.c
--- linux-2.6.31-rc1.org/fs/gfs2/file.c	2009-06-25 10:22:44.000000000 +0900
+++ linux-2.6.31-rc1/fs/gfs2/file.c	2009-06-25 10:32:25.000000000 +0900
@@ -62,11 +62,11 @@ static loff_t gfs2_llseek(struct file *f
 		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
 					   &i_gh);
 		if (!error) {
-			error = generic_file_llseek_unlocked(file, offset, origin);
+			error = generic_file_llseek(file, offset, origin);
 			gfs2_glock_dq_uninit(&i_gh);
 		}
 	} else
-		error = generic_file_llseek_unlocked(file, offset, origin);
+		error = generic_file_llseek(file, offset, origin);
 
 	return error;
 }
diff -Nrup linux-2.6.31-rc1.org/fs/ncpfs/file.c linux-2.6.31-rc1/fs/ncpfs/file.c
--- linux-2.6.31-rc1.org/fs/ncpfs/file.c	2009-06-10 12:05:27.000000000 +0900
+++ linux-2.6.31-rc1/fs/ncpfs/file.c	2009-06-25 10:32:55.000000000 +0900
@@ -286,7 +286,7 @@ static loff_t ncp_remote_llseek(struct f
 {
 	loff_t ret;
 	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
+	ret = generic_file_llseek(file, offset, origin);
 	unlock_kernel();
 	return ret;
 }
diff -Nrup linux-2.6.31-rc1.org/fs/nfs/file.c linux-2.6.31-rc1/fs/nfs/file.c
--- linux-2.6.31-rc1.org/fs/nfs/file.c	2009-06-25 10:22:44.000000000 +0900
+++ linux-2.6.31-rc1/fs/nfs/file.c	2009-06-25 10:33:39.000000000 +0900
@@ -192,10 +192,10 @@ static loff_t nfs_file_llseek(struct fil
 			return (loff_t)retval;
 
 		spin_lock(&inode->i_lock);
-		loff = generic_file_llseek_unlocked(filp, offset, origin);
+		loff = generic_file_llseek(filp, offset, origin);
 		spin_unlock(&inode->i_lock);
 	} else
-		loff = generic_file_llseek_unlocked(filp, offset, origin);
+		loff = generic_file_llseek(filp, offset, origin);
 	return loff;
 }
 
diff -Nrup linux-2.6.31-rc1.org/fs/read_write.c linux-2.6.31-rc1/fs/read_write.c
--- linux-2.6.31-rc1.org/fs/read_write.c	2009-06-25 10:22:44.000000000 +0900
+++ linux-2.6.31-rc1/fs/read_write.c	2009-06-25 10:48:08.000000000 +0900
@@ -31,23 +31,61 @@ const struct file_operations generic_ro_
 
 EXPORT_SYMBOL(generic_ro_fops);
 
+static inline loff_t file_pos_read(struct file *file)
+{
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+	loff_t pos;
+	unsigned int seq;
+
+	do {
+		seq = read_seqbegin(&file->f_pos_seqlock);
+		pos = file->f_pos;
+	} while (read_seqretry(&file->f_pos_seqlock, seq));
+	return pos;
+#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT)
+	loff_t pos;
+
+	preempt_disable();
+	pos = file->f_pos;
+	preempt_enable();
+	return pos;
+#else
+	return file->f_pos;
+#endif
+}
+
+static inline void file_pos_write(struct file *file, loff_t pos)
+{
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+	write_seqlock(&file->f_pos_seqlock);
+	file->f_pos = pos;
+	write_sequnlock(&file->f_pos_seqlock);
+#elif BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT)
+	preempt_disable();
+	file->f_pos = pos;
+	preempt_enable();
+#else
+	file->f_pos = pos;
+#endif
+}
+
 /**
- * generic_file_llseek_unlocked - lockless generic llseek implementation
+ * generic_file_llseek - generic llseek implementation for regular files
  * @file:	file structure to seek on
  * @offset:	file offset to seek to
  * @origin:	type of seek
  *
- * Updates the file offset to the value specified by @offset and @origin.
- * Locking must be provided by the caller.
+ * This is a generic implemenation of ->llseek useable for all normal local
+ * filesystems.  It just updates the file offset to the value specified by
+ * @offset and @origin.
  */
-loff_t
-generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
+loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 {
 	struct inode *inode = file->f_mapping->host;
 
 	switch (origin) {
 	case SEEK_END:
-		offset += inode->i_size;
+		offset += i_size_read(inode);
 		break;
 	case SEEK_CUR:
 		/*
@@ -57,8 +95,8 @@ generic_file_llseek_unlocked(struct file
 		 * write() or lseek() might have altered it
 		 */
 		if (offset == 0)
-			return file->f_pos;
-		offset += file->f_pos;
+			return file_pos_read(file);
+		offset += file_pos_read(file);
 		break;
 	}
 
@@ -66,35 +104,13 @@ generic_file_llseek_unlocked(struct file
 		return -EINVAL;
 
 	/* Special lock needed here? */
-	if (offset != file->f_pos) {
-		file->f_pos = offset;
+	if (offset != file_pos_read(file)) {
+		file_pos_write(file, offset);
 		file->f_version = 0;
 	}
 
 	return offset;
 }
-EXPORT_SYMBOL(generic_file_llseek_unlocked);
-
-/**
- * generic_file_llseek - generic llseek implementation for regular files
- * @file:	file structure to seek on
- * @offset:	file offset to seek to
- * @origin:	type of seek
- *
- * This is a generic implemenation of ->llseek useable for all normal local
- * filesystems.  It just updates the file offset to the value specified by
- * @offset and @origin under i_mutex.
- */
-loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t rval;
-
-	mutex_lock(&file->f_dentry->d_inode->i_mutex);
-	rval = generic_file_llseek_unlocked(file, offset, origin);
-	mutex_unlock(&file->f_dentry->d_inode->i_mutex);
-
-	return rval;
-}
 EXPORT_SYMBOL(generic_file_llseek);
 
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
@@ -359,16 +375,6 @@ ssize_t vfs_write(struct file *file, con
 
 EXPORT_SYMBOL(vfs_write);
 
-static inline loff_t file_pos_read(struct file *file)
-{
-	return file->f_pos;
-}
-
-static inline void file_pos_write(struct file *file, loff_t pos)
-{
-	file->f_pos = pos;
-}
-
 SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 {
 	struct file *file;
diff -Nrup linux-2.6.31-rc1.org/fs/smbfs/file.c linux-2.6.31-rc1/fs/smbfs/file.c
--- linux-2.6.31-rc1.org/fs/smbfs/file.c	2009-06-10 12:05:27.000000000 +0900
+++ linux-2.6.31-rc1/fs/smbfs/file.c	2009-06-25 10:48:29.000000000 +0900
@@ -426,7 +426,7 @@ static loff_t smb_remote_llseek(struct f
 {
 	loff_t ret;
 	lock_kernel();
-	ret = generic_file_llseek_unlocked(file, offset, origin);
+	ret = generic_file_llseek(file, offset, origin);
 	unlock_kernel();
 	return ret;
 }
diff -Nrup linux-2.6.31-rc1.org/include/linux/fs.h linux-2.6.31-rc1/include/linux/fs.h
--- linux-2.6.31-rc1.org/include/linux/fs.h	2009-06-25 10:22:44.000000000 +0900
+++ linux-2.6.31-rc1/include/linux/fs.h	2009-06-25 10:51:08.000000000 +0900
@@ -902,6 +902,16 @@ static inline int ra_has_index(struct fi
 #define FILE_MNT_WRITE_TAKEN	1
 #define FILE_MNT_WRITE_RELEASED	2
 
+/*
+ * Use sequence lock to get consistent f_pos on 32-bit processors.
+ */
+#if BITS_PER_LONG == 32 && defined(CONFIG_SMP)
+#define __NEED_F_POS_ORDERED
+#define f_pos_ordered_init(file) seqlock_init(&file->f_pos_seqlock)
+#else
+#define f_pos_ordered_init(file) do { } while (0)
+#endif
+
 struct file {
 	/*
 	 * fu_list becomes invalid after file_free is called and queued via
@@ -920,6 +930,9 @@ struct file {
 	unsigned int 		f_flags;
 	fmode_t			f_mode;
 	loff_t			f_pos;
+#ifdef __NEED_F_POS_ORDERED
+	seqlock_t               f_pos_seqlock;
+#endif
 	struct fown_struct	f_owner;
 	const struct cred	*f_cred;
 	struct file_ra_state	f_ra;
@@ -2219,8 +2232,6 @@ extern void
 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
 extern loff_t no_llseek(struct file *file, loff_t offset, int origin);
 extern loff_t generic_file_llseek(struct file *file, loff_t offset, int origin);
-extern loff_t generic_file_llseek_unlocked(struct file *file, loff_t offset,
-			int origin);
 extern int generic_file_open(struct inode * inode, struct file * filp);
 extern int nonseekable_open(struct inode * inode, struct file * filp);
 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ