lists.openwall.net   lists  /  announce  owl-users  owl-dev  john-users  john-dev  passwdqc-users  yescrypt  popa3d-users  /  oss-security  kernel-hardening  musl  sabotage  tlsify  passwords  /  crypt-dev  xvendor  /  Bugtraq  Full-Disclosure  linux-kernel  linux-netdev  linux-ext4  linux-hardening  linux-cve-announce  PHC 
Open Source and information security mailing list archives
 
Hash Suite: Windows password security audit tool. GUI, reports in PDF.
[<prev] [next>] [thread-next>] [day] [month] [year] [list]
Message-Id: <1275428028-10901-1-git-send-email-arnd@arndb.de>
Date:	Tue,  1 Jun 2010 23:33:48 +0200
From:	Arnd Bergmann <arnd@...db.de>
To:	Jens Axboe <jaxboe@...ionio.com>
Cc:	Arnd Bergmann <arnd@...db.de>, Jens Axboe <jaxboe@...ionio.com>,
	Vivek Goyal <vgoyal@...hat.com>, Tejun Heo <tj@...nel.org>,
	Frederic Weisbecker <fweisbec@...il.com>,
	FUJITA Tomonori <fujita.tomonori@....ntt.co.jp>,
	"Martin K. Petersen" <martin.petersen@...cle.com>,
	Steven Rostedt <rostedt@...dmis.org>,
	Douglas Gilbert <dgilbert@...erlog.com>,
	Ingo Molnar <mingo@...hat.com>, John Kacur <jkacur@...hat.com>,
	linux-scsi@...r.kernel.org, linux-kernel@...r.kernel.org
Subject: [PATCH] block: replace BKL with global mutex

The block layer is one of the remaining users
of the Big Kernel Lock. In there, it is used
mainly for serializing the blkdev_get and
blkdev_put functions and some ioctl implementations
as well as some less common open functions of
related character devices following a previous
pushdown and parts of the blktrace code.

The only one that seems to be a bit nasty is the
blkdev_get function which is actually recursive
and may try to get the BKL twice.

All users except the one in blkdev_get seem to
be outermost locks, meaning we don't rely on
the release-on-sleep semantics to avoid deadlocks.

The ctl_mutex (pktcdvd.ko), raw_mutex (raw.ko),
state_mutex (dasd.ko), reconfig_mutex (md.ko),
and jfs_log_mutex (jfs.ko) may be held when
blkdev_get is called, but as far as I can tell,
these mutexes are never acquired from any of the
functions that get converted in this patch.

In order to get rid of the BKL, this introduces
a new global mutex called blkdev_mutex, which
replaces the BKL in all drivers that directly
interact with the block layer. In case of blkdev_get,
the mutex is moved outside of the function itself
in order to avoid the recursive taking of blkdev_mutex.

Testing so far has shown no problems whatsoever
from this patch, but the usage in blkdev_get
may introduce extra latencies, and I may have
missed corner cases where an block device ioctl
function sleeps for a significant amount of time,
which may be harmful to the performance of other
threads.

Signed-off-by: Arnd Bergmann <arnd@...db.de>
Cc: Jens Axboe <jaxboe@...ionio.com>
Cc: Vivek Goyal <vgoyal@...hat.com>
Cc: Tejun Heo <tj@...nel.org>
Cc: Frederic Weisbecker <fweisbec@...il.com>
Cc: FUJITA Tomonori <fujita.tomonori@....ntt.co.jp>
Cc: "Martin K. Petersen" <martin.petersen@...cle.com>
Cc: Steven Rostedt <rostedt@...dmis.org>
Cc: Douglas Gilbert <dgilbert@...erlog.com>
Cc: Ingo Molnar <mingo@...hat.com>
Cc: John Kacur <jkacur@...hat.com>
Cc: linux-scsi@...r.kernel.org
Cc: linux-kernel@...r.kernel.org
---

Jens, would you consider this patch for 2.6.36 and take it into
your -next tree?

 block/bsg.c             |    2 --
 block/compat_ioctl.c    |    8 ++++----
 block/ioctl.c           |   24 ++++++++++++------------
 drivers/block/DAC960.c  |    4 ++--
 drivers/block/cciss.c   |    4 ++--
 drivers/scsi/sg.c       |    4 ++--
 fs/block_dev.c          |   20 +++++++++++++-------
 include/linux/blkdev.h  |    6 ++++++
 kernel/trace/blktrace.c |   14 ++++++++------
 9 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/block/bsg.c b/block/bsg.c
index 82d5882..7c50a26 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -843,9 +843,7 @@ static int bsg_open(struct inode *inode, struct file *file)
 {
 	struct bsg_device *bd;
 
-	lock_kernel();
 	bd = bsg_get_device(inode, file);
-	unlock_kernel();
 
 	if (IS_ERR(bd))
 		return PTR_ERR(bd);
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f26051f..11cfd3c 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -802,16 +802,16 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return compat_put_u64(arg, bdev->bd_inode->i_size);
 
 	case BLKTRACESETUP32:
-		lock_kernel();
+		mutex_lock(&blkdev_mutex);
 		ret = compat_blk_trace_setup(bdev, compat_ptr(arg));
-		unlock_kernel();
+		mutex_unlock(&blkdev_mutex);
 		return ret;
 	case BLKTRACESTART: /* compatible */
 	case BLKTRACESTOP:  /* compatible */
 	case BLKTRACETEARDOWN: /* compatible */
-		lock_kernel();
+		mutex_lock(&blkdev_mutex);
 		ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg));
-		unlock_kernel();
+		mutex_unlock(&blkdev_mutex);
 		return ret;
 	default:
 		if (disk->fops->compat_ioctl)
diff --git a/block/ioctl.c b/block/ioctl.c
index e8eb679..6b8fde1 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -169,9 +169,9 @@ int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
 		return disk->fops->ioctl(bdev, mode, cmd, arg);
 
 	if (disk->fops->locked_ioctl) {
-		lock_kernel();
+		mutex_lock(&blkdev_mutex);
 		ret = disk->fops->locked_ioctl(bdev, mode, cmd, arg);
-		unlock_kernel();
+		mutex_unlock(&blkdev_mutex);
 		return ret;
 	}
 
@@ -206,10 +206,10 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (ret != -EINVAL && ret != -ENOTTY)
 			return ret;
 
-		lock_kernel();
+		mutex_lock(&blkdev_mutex);
 		fsync_bdev(bdev);
 		invalidate_bdev(bdev);
-		unlock_kernel();
+		mutex_unlock(&blkdev_mutex);
 		return 0;
 
 	case BLKROSET:
@@ -221,9 +221,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 			return -EACCES;
 		if (get_user(n, (int __user *)(arg)))
 			return -EFAULT;
-		lock_kernel();
+		mutex_lock(&blkdev_mutex);
 		set_device_ro(bdev, n);
-		unlock_kernel();
+		mutex_unlock(&blkdev_mutex);
 		return 0;
 
 	case BLKDISCARD: {
@@ -309,14 +309,14 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 			bd_release(bdev);
 		return ret;
 	case BLKPG:
-		lock_kernel();
+		mutex_lock(&blkdev_mutex);
 		ret = blkpg_ioctl(bdev, (struct blkpg_ioctl_arg __user *) arg);
-		unlock_kernel();
+		mutex_unlock(&blkdev_mutex);
 		break;
 	case BLKRRPART:
-		lock_kernel();
+		mutex_lock(&blkdev_mutex);
 		ret = blkdev_reread_part(bdev);
-		unlock_kernel();
+		mutex_unlock(&blkdev_mutex);
 		break;
 	case BLKGETSIZE:
 		size = bdev->bd_inode->i_size;
@@ -329,9 +329,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKTRACESTOP:
 	case BLKTRACESETUP:
 	case BLKTRACETEARDOWN:
-		lock_kernel();
+		mutex_lock(&blkdev_mutex);
 		ret = blk_trace_ioctl(bdev, cmd, (char __user *) arg);
-		unlock_kernel();
+		mutex_unlock(&blkdev_mutex);
 		break;
 	default:
 		ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index c5f22bb..8faaa12 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -6620,7 +6620,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
   long ErrorCode = 0;
   if (!capable(CAP_SYS_ADMIN)) return -EACCES;
 
-  lock_kernel();
+  mutex_lock(&blkdev_mutex);
   switch (Request)
     {
     case DAC960_IOCTL_GET_CONTROLLER_COUNT:
@@ -7051,7 +7051,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
       default:
 	ErrorCode = -ENOTTY;
     }
-  unlock_kernel();
+  mutex_unlock(&blkdev_mutex);
   return ErrorCode;
 }
 
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 51ceaee..6bb6c4d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1027,9 +1027,9 @@ static int do_ioctl(struct block_device *bdev, fmode_t mode,
 		    unsigned cmd, unsigned long arg)
 {
 	int ret;
-	lock_kernel();
+	mutex_lock(&blkdev_mutex);
 	ret = cciss_ioctl(bdev, mode, cmd, arg);
-	unlock_kernel();
+	mutex_unlock(&blkdev_mutex);
 	return ret;
 }
 
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index ef752b2..927aea6 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -229,7 +229,7 @@ sg_open(struct inode *inode, struct file *filp)
 	int res;
 	int retval;
 
-	lock_kernel();
+	mutex_lock(&blkdev_mutex);
 	nonseekable_open(inode, filp);
 	SCSI_LOG_TIMEOUT(3, printk("sg_open: dev=%d, flags=0x%x\n", dev, flags));
 	sdp = sg_get_dev(dev);
@@ -307,7 +307,7 @@ error_out:
 sg_put:
 	if (sdp)
 		sg_put_dev(sdp);
-	unlock_kernel();
+	mutex_unlock(&blkdev_mutex);
 	return retval;
 }
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 7346c96..06882f9 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,9 @@
 #include <asm/uaccess.h>
 #include "internal.h"
 
+DEFINE_MUTEX(blkdev_mutex);
+EXPORT_SYMBOL_GPL(blkdev_mutex);
+
 struct bdev_inode {
 	struct block_device bdev;
 	struct inode vfs_inode;
@@ -1321,7 +1324,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		return ret;
 	}
 
-	lock_kernel();
  restart:
 
 	ret = -ENXIO;
@@ -1407,7 +1409,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 	if (for_part)
 		bdev->bd_part_count++;
 	mutex_unlock(&bdev->bd_mutex);
-	unlock_kernel();
 	return 0;
 
  out_clear:
@@ -1421,7 +1422,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
  out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
  out_unlock_kernel:
-	unlock_kernel();
 
 	if (disk)
 		module_put(disk->fops->owner);
@@ -1433,7 +1433,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 
 int blkdev_get(struct block_device *bdev, fmode_t mode)
 {
-	return __blkdev_get(bdev, mode, 0);
+	int ret;
+	mutex_lock(&blkdev_mutex);
+	ret = __blkdev_get(bdev, mode, 0);
+	mutex_unlock(&blkdev_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(blkdev_get);
 
@@ -1491,7 +1495,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 	struct block_device *victim = NULL;
 
 	mutex_lock_nested(&bdev->bd_mutex, for_part);
-	lock_kernel();
 	if (for_part)
 		bdev->bd_part_count--;
 
@@ -1516,7 +1519,6 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 			victim = bdev->bd_contains;
 		bdev->bd_contains = NULL;
 	}
-	unlock_kernel();
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
 	if (victim)
@@ -1526,7 +1528,11 @@ static int __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
 
 int blkdev_put(struct block_device *bdev, fmode_t mode)
 {
-	return __blkdev_put(bdev, mode, 0);
+	int ret;
+	mutex_lock(&blkdev_mutex);
+	ret = __blkdev_put(bdev, mode, 0);
+	mutex_unlock(&blkdev_mutex);
+	return ret;
 }
 EXPORT_SYMBOL(blkdev_put);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8b7f5e0..69aa7f1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1340,6 +1340,12 @@ struct block_device_operations {
 
 extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 				 unsigned long);
+
+/*
+ * replaces the big kernel lock in the block subsystem
+ */
+extern struct mutex blkdev_mutex;
+
 #else /* CONFIG_BLOCK */
 /*
  * stubs for when the block layer is configured out
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 36ea2b6..fcf17e7 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -305,7 +305,7 @@ static int blk_dropped_open(struct inode *inode, struct file *filp)
 {
 	filp->private_data = inode->i_private;
 
-	return 0;
+	return nonseekable_open(inode, filp);
 }
 
 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -323,13 +323,14 @@ static const struct file_operations blk_dropped_fops = {
 	.owner =	THIS_MODULE,
 	.open =		blk_dropped_open,
 	.read =		blk_dropped_read,
+	.llseek =	no_llseek,
 };
 
 static int blk_msg_open(struct inode *inode, struct file *filp)
 {
 	filp->private_data = inode->i_private;
 
-	return 0;
+	return nonseekable_open(inode, filp);
 }
 
 static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
@@ -362,6 +363,7 @@ static const struct file_operations blk_msg_fops = {
 	.owner =	THIS_MODULE,
 	.open =		blk_msg_open,
 	.write =	blk_msg_write,
+	.llseek =	no_llseek,
 };
 
 /*
@@ -1601,7 +1603,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	struct block_device *bdev;
 	ssize_t ret = -ENXIO;
 
-	lock_kernel();
+	mutex_lock(&blkdev_mutex);
 	bdev = bdget(part_devt(p));
 	if (bdev == NULL)
 		goto out_unlock_kernel;
@@ -1633,7 +1635,7 @@ out_unlock_bdev:
 out_bdput:
 	bdput(bdev);
 out_unlock_kernel:
-	unlock_kernel();
+	mutex_unlock(&blkdev_mutex);
 	return ret;
 }
 
@@ -1663,7 +1665,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	ret = -ENXIO;
 
-	lock_kernel();
+	mutex_lock(&blkdev_mutex);
 	p = dev_to_part(dev);
 	bdev = bdget(part_devt(p));
 	if (bdev == NULL)
@@ -1703,7 +1705,7 @@ out_unlock_bdev:
 out_bdput:
 	bdput(bdev);
 out_unlock_kernel:
-	unlock_kernel();
+	mutex_unlock(&blkdev_mutex);
 out:
 	return ret ? ret : count;
 }
-- 
1.7.0.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@...r.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Powered by blists - more mailing lists

Powered by Openwall GNU/*/Linux Powered by OpenVZ