Break the protection of sb->s_files out from under the global file_list_lock. sb->s_files is converted to a lock_list. furthermore to prevent the lock_list_head of getting too contended with concurrent add operations the add is buffered in per cpu filevecs. This would ordinarily require a flush before a delete operation - to ensure the to be deleted entry is indeed added to the list. This is avoided by storing a pointer to the filevec location in the not yet used list_head. This pointer can then be used to clear the filevec entry before its actually added. The file_flag mess is a bit unfortunate - this can be removed by also converting tty->tty_files to a lock_list (TODO). Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- fs/dquot.c | 10 +- fs/file_table.c | 170 +++++++++++++++++++++++++++++++++++++++---- fs/open.c | 2 fs/proc/generic.c | 8 -- fs/super.c | 7 - include/linux/fs.h | 23 +++++ mm/readahead.c | 2 security/selinux/selinuxfs.c | 9 +- 8 files changed, 194 insertions(+), 37 deletions(-) Index: linux-2.6/fs/dquot.c =================================================================== --- linux-2.6.orig/fs/dquot.c 2007-01-13 21:04:07.000000000 +0100 +++ linux-2.6/fs/dquot.c 2007-01-27 21:07:44.000000000 +0100 @@ -688,23 +688,21 @@ static int dqinit_needed(struct inode *i /* This routine is guarded by dqonoff_mutex mutex */ static void add_dquot_ref(struct super_block *sb, int type) { - struct list_head *p; + struct file *filp; restart: - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file *filp = list_entry(p, struct file, f_u.fu_list); + filevec_add_drain_all(); + lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) { struct inode *inode = filp->f_path.dentry->d_inode; if (filp->f_mode & FMODE_WRITE && dqinit_needed(inode, type)) { struct dentry *dentry = dget(filp->f_path.dentry); - file_list_unlock(); + lock_list_for_each_entry_stop(filp, f_u.fu_llist); sb->dq_op->initialize(inode, type); dput(dentry); /* As we may have blocked we had better restart... */ goto restart; } } - file_list_unlock(); } /* Return 0 if dqput() won't block (note that 1 doesn't necessarily mean blocking) */ Index: linux-2.6/fs/file_table.c =================================================================== --- linux-2.6.orig/fs/file_table.c 2007-01-13 21:04:07.000000000 +0100 +++ linux-2.6/fs/file_table.c 2007-01-27 21:07:44.000000000 +0100 @@ -113,7 +113,7 @@ struct file *get_empty_filp(void) goto fail_sec; tsk = current; - INIT_LIST_HEAD(&f->f_u.fu_list); + INIT_LOCK_LIST_HEAD(&f->f_u.fu_llist); atomic_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); f->f_uid = tsk->fsuid; @@ -245,32 +245,175 @@ void put_filp(struct file *file) } } -void file_move(struct file *file, struct list_head *list) +enum { + FILEVEC_SIZE = 15 +}; + +struct filevec { + unsigned long nr; + struct file *files[FILEVEC_SIZE]; +}; + +static DEFINE_PER_CPU(struct filevec, sb_fvec); + +static inline unsigned int filevec_size(struct filevec *fvec) { - if (!list) - return; - file_list_lock(); - list_move(&file->f_u.fu_list, list); - file_list_unlock(); + return FILEVEC_SIZE - fvec->nr; +} + +static inline unsigned int filevec_count(struct filevec *fvec) +{ + return fvec->nr; +} + +static inline void filevec_reinit(struct filevec *fvec) +{ + fvec->nr = 0; +} + +static inline unsigned int filevec_add(struct filevec *fvec, struct file *filp) +{ + rcu_assign_pointer(fvec->files[fvec->nr], filp); + + /* + * Here we do icky stuff in order to avoid flushing the per cpu filevec + * on list removal. + * + * We store the location on the per cpu filevec in the as of yet unused + * fu_llist.next field and toggle bit 0 to indicate we done so. This + * allows the removal code to set the filevec entry to NULL, thereby + * avoiding the list add. + * + * Abuse the fu_llist.lock for protection. + */ + spin_lock(&filp->f_u.fu_llist.lock); + filp->f_u.fu_llist.next = (void *)&fvec->files[fvec->nr]; + __set_bit(0, (void *)&filp->f_u.fu_llist.next); + spin_unlock(&filp->f_u.fu_llist.lock); + + fvec->nr++; + return filevec_size(fvec); +} + +static void __filevec_add(struct filevec *fvec) +{ + int i; + + for (i = 0; i < filevec_count(fvec); i++) { + struct file *filp; + + /* + * see the comment in filevec_add(); + * need RCU because a concurrent remove might have deleted + * the entry from under us. + */ + rcu_read_lock(); + filp = rcu_dereference(fvec->files[i]); + /* + * the simple case, its gone - NEXT! + */ + if (!filp) { + rcu_read_unlock(); + continue; + } + + spin_lock(&filp->f_u.fu_llist.lock); + /* + * If the entry really is still there, add it! + */ + if (rcu_dereference(fvec->files[i])) { + struct super_block *sb = + filp->f_mapping->host->i_sb; + + __lock_list_add(&filp->f_u.fu_llist, &sb->s_files); + } + spin_unlock(&filp->f_u.fu_llist.lock); + rcu_read_unlock(); + } + filevec_reinit(fvec); +} + +static void filevec_add_drain(void) +{ + struct filevec *fvec = &get_cpu_var(sb_fvec); + if (filevec_count(fvec)) + __filevec_add(fvec); + put_cpu_var(sb_fvec); } +static void filevec_add_drain_per_cpu(struct work_struct *dummy) +{ + filevec_add_drain(); +} + +int filevec_add_drain_all(void) +{ + return schedule_on_each_cpu(filevec_add_drain_per_cpu); +} +EXPORT_SYMBOL_GPL(filevec_add_drain_all); + void file_kill(struct file *file) { - if (!list_empty(&file->f_u.fu_list)) { + if (file_flag(file, F_SUPERBLOCK)) { + void **ptr; + + file_flag_clear(file, F_SUPERBLOCK); + + /* + * If bit 0 of the fu_llist.next pointer is set we're still + * enqueued on a per cpu filevec, in that case clear the entry + * and we're done. + */ + spin_lock(&file->f_u.fu_llist.lock); + ptr = (void **)file->f_u.fu_llist.next; + if (__test_and_clear_bit(0, (void *)&ptr)) { + rcu_assign_pointer(*ptr, NULL); + INIT_LIST_HEAD(&file->f_u.fu_llist.head); + spin_unlock(&file->f_u.fu_llist.lock); + return; + } + spin_unlock(&file->f_u.fu_llist.lock); + + if (!list_empty(&file->f_u.fu_list)) + lock_list_del_init(&file->f_u.fu_llist); + + } else if (!list_empty(&file->f_u.fu_list)) { file_list_lock(); list_del_init(&file->f_u.fu_list); file_list_unlock(); } } +void file_move(struct file *file, struct list_head *list) +{ + struct super_block *sb; + + if (!list) + return; + + file_kill(file); + + sb = file->f_mapping->host->i_sb; + if (list == &sb->s_files.head) { + struct filevec *fvec = &get_cpu_var(sb_fvec); + file_flag_set(file, F_SUPERBLOCK); + if (!filevec_add(fvec, file)) + __filevec_add(fvec); + put_cpu_var(sb_fvec); + } else { + file_list_lock(); + list_add(&file->f_u.fu_list, list); + file_list_unlock(); + } +} + int fs_may_remount_ro(struct super_block *sb) { - struct list_head *p; + struct file *file; /* Check that no files are currently opened for writing. */ - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file *file = list_entry(p, struct file, f_u.fu_list); + filevec_add_drain_all(); + lock_list_for_each_entry(file, &sb->s_files, f_u.fu_llist) { struct inode *inode = file->f_path.dentry->d_inode; /* File with pending delete? */ @@ -281,10 +424,9 @@ int fs_may_remount_ro(struct super_block if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) goto too_bad; } - file_list_unlock(); return 1; /* Tis' cool bro. */ too_bad: - file_list_unlock(); + lock_list_for_each_entry_stop(file, f_u.fu_llist); return 0; } Index: linux-2.6/fs/open.c =================================================================== --- linux-2.6.orig/fs/open.c 2007-01-13 21:04:07.000000000 +0100 +++ linux-2.6/fs/open.c 2007-01-27 21:07:44.000000000 +0100 @@ -692,7 +692,7 @@ static struct file *__dentry_open(struct f->f_path.mnt = mnt; f->f_pos = 0; f->f_op = fops_get(inode->i_fop); - file_move(f, &inode->i_sb->s_files); + file_move(f, &inode->i_sb->s_files.head); if (!open && f->f_op) open = f->f_op->open; Index: linux-2.6/fs/proc/generic.c =================================================================== --- linux-2.6.orig/fs/proc/generic.c 2007-01-13 21:04:07.000000000 +0100 +++ linux-2.6/fs/proc/generic.c 2007-01-27 21:07:44.000000000 +0100 @@ -549,15 +549,14 @@ static int proc_register(struct proc_dir */ static void proc_kill_inodes(struct proc_dir_entry *de) { - struct list_head *p; + struct file *filp; struct super_block *sb = proc_mnt->mnt_sb; /* * Actually it's a partial revoke(). */ - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file * filp = list_entry(p, struct file, f_u.fu_list); + filevec_add_drain_all(); + lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) { struct dentry * dentry = filp->f_path.dentry; struct inode * inode; const struct file_operations *fops; @@ -571,7 +570,6 @@ static void proc_kill_inodes(struct proc filp->f_op = NULL; fops_put(fops); } - file_list_unlock(); } static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent, Index: linux-2.6/fs/super.c =================================================================== --- linux-2.6.orig/fs/super.c 2007-01-13 21:04:07.000000000 +0100 +++ linux-2.6/fs/super.c 2007-01-27 21:07:44.000000000 +0100 @@ -67,7 +67,7 @@ static struct super_block *alloc_super(s } INIT_LIST_HEAD(&s->s_dirty); INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_files); + INIT_LOCK_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -568,12 +568,11 @@ static void mark_files_ro(struct super_b { struct file *f; - file_list_lock(); - list_for_each_entry(f, &sb->s_files, f_u.fu_list) { + filevec_add_drain_all(); + lock_list_for_each_entry(f, &sb->s_files, f_u.fu_llist) { if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f)) f->f_mode &= ~FMODE_WRITE; } - file_list_unlock(); } /** Index: linux-2.6/include/linux/fs.h =================================================================== --- linux-2.6.orig/include/linux/fs.h 2007-01-22 22:43:28.000000000 +0100 +++ linux-2.6/include/linux/fs.h 2007-01-27 21:07:44.000000000 +0100 @@ -275,6 +275,7 @@ extern int dir_notify_enable; #include #include #include +#include #include #include #include @@ -709,9 +710,13 @@ struct file { /* * fu_list becomes invalid after file_free is called and queued via * fu_rcuhead for RCU freeing + * fu_llist is used for the superblock s_files list; its crucial that + * the spinlock contained therein is not clobbered by other uses of + * the union. */ union { struct list_head fu_list; + struct lock_list_head fu_llist; struct rcu_head fu_rcuhead; } f_u; struct path f_path; @@ -744,9 +749,25 @@ extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); #define file_list_unlock() spin_unlock(&files_lock); +/* + * steal the upper 8 bits from the read-a-head flags + */ +#define F_SHIFT 24 + +#define F_SUPERBLOCK 0 + +#define file_flag_set(file, flag) \ + __set_bit((flag) + F_SHIFT, &(file)->f_ra.flags) +#define file_flag_clear(file, flag) \ + __clear_bit((flag) + F_SHIFT, &(file)->f_ra.flags) +#define file_flag(file, flag) \ + test_bit((flag) + F_SHIFT, &(file)->f_ra.flags) + #define get_file(x) atomic_inc(&(x)->f_count) #define file_count(x) atomic_read(&(x)->f_count) +extern int filevec_add_drain_all(void); + #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes @@ -928,7 +949,7 @@ struct super_block { struct list_head s_dirty; /* dirty inodes */ struct list_head s_io; /* parked for writeback */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ - struct list_head s_files; + struct lock_list_head s_files; struct block_device *s_bdev; struct list_head s_instances; Index: linux-2.6/mm/readahead.c =================================================================== --- linux-2.6.orig/mm/readahead.c 2007-01-22 22:43:29.000000000 +0100 +++ linux-2.6/mm/readahead.c 2007-01-27 21:07:44.000000000 +0100 @@ -69,7 +69,7 @@ static inline void reset_ahead_window(st static inline void ra_off(struct file_ra_state *ra) { ra->start = 0; - ra->flags = 0; + ra->flags &= (~0UL) << F_SHIFT; ra->size = 0; reset_ahead_window(ra); return; Index: linux-2.6/security/selinux/selinuxfs.c =================================================================== --- linux-2.6.orig/security/selinux/selinuxfs.c 2007-01-13 21:04:19.000000000 +0100 +++ linux-2.6/security/selinux/selinuxfs.c 2007-01-27 21:10:48.000000000 +0100 @@ -940,7 +940,8 @@ static struct file_operations sel_commit * fs/proc/generic.c proc_kill_inodes */ static void sel_remove_bools(struct dentry *de) { - struct list_head *p, *node; + struct list_head *node; + struct file *filp; struct super_block *sb = de->d_sb; spin_lock(&dcache_lock); @@ -962,9 +963,8 @@ static void sel_remove_bools(struct dent spin_unlock(&dcache_lock); - file_list_lock(); - list_for_each(p, &sb->s_files) { - struct file * filp = list_entry(p, struct file, f_u.fu_list); + filevec_add_drain_all(); + lock_list_for_each_entry(filp, &sb->s_files, f_u.fu_llist) { struct dentry * dentry = filp->f_path.dentry; if (dentry->d_parent != de) { @@ -972,7 +972,6 @@ static void sel_remove_bools(struct dent } filp->f_op = NULL; } - file_list_unlock(); } #define BOOL_DIR_NAME "booleans" -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/