2.6.28-stable review patch.  If anyone has any objections, please let us know.

------------------

From: Dan Williams

[backport of 5fd3a17ed456637a224cf4ca82b9ad9d005bc8d4]

Resolve a deadlock when stopping redundant arrays, i.e. ones that
require a call to sysfs_remove_group when shutdown.  The deadlock is
summarized below:

Thread1                Thread2
-------                -------
read sysfs attribute   stop array
                       take mddev lock
                       sysfs_remove_group
sysfs_get_active
wait for mddev lock
                       wait for active

Sysrq-w:
--------
mdmon         S 00000017  2212  4163      1
       f1982ea8 00000046 2dcf6b85 00000017 c0b23100 f2f83ed0 c0b23100 f2f8413c
       c0b23100 c0b23100 c0b1fb98 f2f8413c 00000000 f2f8413c c0b23100 f2291ecc
       00000002 c0b23100 00000000 00000017 f2f83ed0 f1982eac 00000046 c044d9dd
Call Trace:
 [] ? debug_mutex_add_waiter+0x1d/0x58
 [] __mutex_lock_common+0x1d9/0x338
 [] ? __mutex_lock_common+0x1d9/0x338
 [] mutex_lock_interruptible_nested+0x33/0x3a
 [] ? mddev_lock+0x14/0x16
 [] mddev_lock+0x14/0x16
 [] md_attr_show+0x2a/0x49
 [] sysfs_read_file+0x93/0xf9
mdadm         D 00000017  2812  4177      1
       f0401d78 00000046 430456f8 00000017 f0401d58 f0401d20 c0b23100 f2da2c4c
       c0b23100 c0b23100 c0b1fb98 f2da2c4c 0a10fc36 00000000 c0b23100 f0401d70
       00000003 c0b23100 00000000 00000017 f2da29e0 00000001 00000002 00000000
Call Trace:
 [] schedule_timeout+0x1b/0x95
 [] ? schedule_timeout+0x1b/0x95
 [] ? wait_for_common+0x34/0xdc
 [] ? trace_hardirqs_on_caller+0x18/0x145
 [] ? trace_hardirqs_on+0xb/0xd
 [] wait_for_common+0xa0/0xdc
 [] ? default_wake_function+0x0/0x12
 [] wait_for_completion+0x17/0x19
 [] sysfs_addrm_finish+0x19f/0x1d1
 [] sysfs_hash_and_remove+0x42/0x55
 [] sysfs_remove_group+0x57/0x86
 [] do_md_stop+0x13a/0x499

This has been there for a while, but is easier to trigger now that
mdmon is closely watching sysfs.

Cc: Neil Brown
Reported-by: Jacek Danecki
Signed-off-by: Dan Williams
Signed-off-by: Greg Kroah-Hartman

---
 drivers/md/md.c           |   27 ++++++++++++++++++++++++---
 include/linux/raid/md_k.h |    2 ++
 2 files changed, 26 insertions(+), 3 deletions(-)

--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3694,6 +3694,10 @@ static int do_md_run(mddev_t * mddev)
 		return err;
 	}
 	if (mddev->pers->sync_request) {
+		/* wait for any previously scheduled redundancy groups
+		 * to be removed
+		 */
+		flush_scheduled_work();
 		if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
 			printk(KERN_WARNING
 			       "md: cannot register extra attributes for %s\n",
@@ -3824,6 +3828,14 @@ static void restore_bitmap_write_access(
 	spin_unlock(&inode->i_lock);
 }
 
+
+static void sysfs_delayed_rm(struct work_struct *ws)
+{
+	mddev_t *mddev = container_of(ws, mddev_t, del_work);
+
+	sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
+}
+
 /* mode:
  *   0 - completely stop and dis-assemble array
  *   1 - switch to readonly
@@ -3833,6 +3845,7 @@ static int do_md_stop(mddev_t * mddev, i
 {
 	int err = 0;
 	struct gendisk *disk = mddev->gendisk;
+	int remove_group = 0;
 
 	if (atomic_read(&mddev->openers) > is_open) {
 		printk("md: %s still in use.\n",mdname(mddev));
@@ -3868,10 +3881,9 @@ static int do_md_stop(mddev_t * mddev, i
 			mddev->queue->merge_bvec_fn = NULL;
 			mddev->queue->unplug_fn = NULL;
 			mddev->queue->backing_dev_info.congested_fn = NULL;
-			if (mddev->pers->sync_request)
-				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
-
 			module_put(mddev->pers->owner);
+			if (mddev->pers->sync_request)
+				remove_group = 1;
 			mddev->pers = NULL;
 			/* tell userspace to handle 'inactive' */
 			sysfs_notify_dirent(mddev->sysfs_state);
@@ -3919,6 +3931,15 @@ static int do_md_stop(mddev_t * mddev, i
 		/* make sure all md_delayed_delete calls have finished */
 		flush_scheduled_work();
 
+		/* we can't wait for group removal under mddev_lock as
+		 * threads holding the group 'active' need to acquire
+		 * mddev_lock before going inactive
+		 */
+		if (remove_group) {
+			INIT_WORK(&mddev->del_work, sysfs_delayed_rm);
+			schedule_work(&mddev->del_work);
+		}
+
 		export_array(mddev);
 
 		mddev->array_sectors = 0;
--- a/include/linux/raid/md_k.h
+++ b/include/linux/raid/md_k.h
@@ -245,6 +245,8 @@ struct mddev_s
 					 * file in sysfs.
 					 */
 
+	struct work_struct del_work;	/* used for delayed sysfs removal */
+
 	spinlock_t			write_lock;
 	wait_queue_head_t		sb_wait;	/* for waiting on superblock updates */
 	atomic_t			pending_writes;	/* number of active superblock writes */

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/
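
Editor's note: the fix works because the deferred sysfs_delayed_rm() work item
runs without mddev_lock held, so a reader stuck in md_attr_show() can still
take the lock, finish, and drop its sysfs 'active' reference.  The user-space
sketch below (plain pthreads; dev_lock, active, delayed_remove and the other
names are illustrative, not taken from md.c) mirrors that ordering rule only:
never wait for reference holders to drain while holding a lock they need;
hand the drain to a worker that runs after the lock is dropped.

/* Illustrative user-space analogue (not kernel code) of the deferred
 * removal pattern used by the patch.  Build with: gcc -pthread sketch.c
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER; /* plays mddev_lock */
static pthread_mutex_t ref_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  ref_cv   = PTHREAD_COND_INITIALIZER;
static int active;                                           /* plays sysfs 'active' refs */

static void get_active(void)
{
	pthread_mutex_lock(&ref_lock);
	active++;
	pthread_mutex_unlock(&ref_lock);
}

static void put_active(void)
{
	pthread_mutex_lock(&ref_lock);
	if (--active == 0)
		pthread_cond_broadcast(&ref_cv);
	pthread_mutex_unlock(&ref_lock);
}

/* Plays md_attr_show(): holds an active reference, then needs dev_lock. */
static void *reader(void *arg)
{
	get_active();
	pthread_mutex_lock(&dev_lock);
	puts("reader: attribute shown under dev_lock");
	pthread_mutex_unlock(&dev_lock);
	put_active();
	return NULL;
}

/* Plays the scheduled work item: drains active references WITHOUT
 * dev_lock held, so blocked readers can finish and drop their refs. */
static void *delayed_remove(void *arg)
{
	pthread_mutex_lock(&ref_lock);
	while (active)
		pthread_cond_wait(&ref_cv, &ref_lock);
	pthread_mutex_unlock(&ref_lock);
	puts("worker: group removed after references drained");
	return NULL;
}

int main(void)
{
	pthread_t r, w;

	pthread_mutex_lock(&dev_lock);           /* plays do_md_stop() taking mddev_lock */
	pthread_create(&r, NULL, reader, NULL);  /* reader grabs a ref, blocks on dev_lock */
	sleep(1);                                /* let the reader reach the blocked state */

	/* Deadlocked variant: draining 'active' right here, with dev_lock still
	 * held, would never return.  Fixed variant: defer the drain, drop the lock. */
	pthread_create(&w, NULL, delayed_remove, NULL);
	pthread_mutex_unlock(&dev_lock);

	pthread_join(r, NULL);
	pthread_join(w, NULL);
	return 0;
}

In the patch itself the drain is implicit in sysfs_remove_group() waiting for
active references, the worker is the shared kernel workqueue via
schedule_work(), and do_md_run() calls flush_scheduled_work() first so a
still-pending removal cannot race with re-creating the redundancy group.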