Adds functionality to read/write lock CLONE_THREAD fork()ing per-threadgroup

From: Ben Blum

This patch adds an rwsem that lives in a threadgroup's sighand_struct
(next to the sighand's atomic count, to piggyback on its cacheline), and
two functions in kernel/cgroup.c (for now) for easily and safely taking
and releasing it. If another part of the kernel later wants to use such
a locking mechanism, the CONFIG_CGROUPS ifdefs should be changed to a
higher-up flag that CGROUPS and the other system would both depend on,
and the lock/unlock functions could be moved to sched.c or so.

This is a pre-patch for cgroups-procs-write.patch. A sketch of how the
write side is meant to be called follows the '---' marker below.

Signed-off-by: Ben Blum

---
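Not part of the patch -- a minimal sketch, assuming only the API added
below, of how a fork-sensitive threadgroup-wide operation might use the
write side of the lock. attach_all_threads() is a made-up name for
illustration; the real caller arrives in cgroups-procs-write.patch:

static int attach_all_threads(struct task_struct *leader)
{
	struct sighand_struct *sighand;

	/* Blocks until no thread in the group is mid-CLONE_THREAD fork. */
	sighand = threadgroup_fork_lock(leader);
	if (IS_ERR(sighand))
		/* -EAGAIN: lost leadership to exec(); -ESRCH: group exiting */
		return PTR_ERR(sighand);

	/*
	 * No new threads can appear in the group via CLONE_THREAD while
	 * the rwsem is write-held; iterate and operate on every thread.
	 * (Threads may still exit; only fork is excluded.)
	 */

	/* Drops the rwsem and the sighand reference the lock call took. */
	threadgroup_fork_unlock(sighand);
	return 0;
}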
 include/linux/cgroup.h    |   14 +++++--
 include/linux/init_task.h |    9 ++++
 include/linux/sched.h     |   15 +++++++
 kernel/cgroup.c           |   93 ++++++++++++++++++++++++++++++++++++++++++++-
 kernel/fork.c             |    9 +++-
 5 files changed, 131 insertions(+), 9 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9be4c22..2eb54bb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -30,10 +30,12 @@ extern int cgroup_init(void);
 extern void cgroup_lock(void);
 extern bool cgroup_lock_live_group(struct cgroup *cgrp);
 extern void cgroup_unlock(void);
-extern void cgroup_fork(struct task_struct *p);
+extern void cgroup_fork(struct task_struct *p, unsigned long clone_flags);
 extern void cgroup_fork_callbacks(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern void cgroup_post_fork(struct task_struct *p, unsigned long clone_flags);
 extern void cgroup_exit(struct task_struct *p, int run_callbacks);
+extern void cgroup_fork_failed(struct task_struct *p, int run_callbacks,
+			       unsigned long clone_flags);
 extern int cgroupstats_build(struct cgroupstats *stats,
 				struct dentry *dentry);
 extern int cgroup_load_subsys(struct cgroup_subsys *ss);

@@ -580,10 +582,14 @@ unsigned short css_depth(struct cgroup_subsys_state *css);
 
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
-static inline void cgroup_fork(struct task_struct *p) {}
+static inline void cgroup_fork(struct task_struct *p,
+			       unsigned long clone_flags) {}
 static inline void cgroup_fork_callbacks(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+				    unsigned long clone_flags) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
+static inline void cgroup_fork_failed(struct task_struct *p, int callbacks,
+				      unsigned long clone_flags) {}
 
 static inline void cgroup_lock(void) {}
 static inline void cgroup_unlock(void) {}

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 8ed0abf..aaa4b9c 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -41,7 +41,16 @@ extern struct nsproxy init_nsproxy;
 	INIT_IPC_NS(ipc_ns)						\
 }
 
+#ifdef CONFIG_CGROUPS
+# define INIT_THREADGROUP_FORK_LOCK(sighand)				\
+	.threadgroup_fork_lock =					\
+		__RWSEM_INITIALIZER(sighand.threadgroup_fork_lock),
+#else
+# define INIT_THREADGROUP_FORK_LOCK(sighand)
+#endif
+
 #define INIT_SIGHAND(sighand) {						\
+	INIT_THREADGROUP_FORK_LOCK(sighand)				\
 	.count		= ATOMIC_INIT(1),				\
 	.action		= { { { .sa_handler = NULL, } }, },		\
 	.siglock	= __SPIN_LOCK_UNLOCKED(sighand.siglock),	\

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 23b26c7..10a22a5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -475,6 +475,21 @@ extern int get_dumpable(struct mm_struct *mm);
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
 
 struct sighand_struct {
+#ifdef CONFIG_CGROUPS
+	/*
+	 * The threadgroup_fork_lock prevents any threads in the threadgroup
+	 * from forking with CLONE_THREAD while it is held for writing; it is
+	 * taken by threadgroup-wide operations that are fork-sensitive. It
+	 * lives here next to sighand.count as a cacheline optimization.
+	 *
+	 * TODO: if anybody besides cgroups uses this lock, change the
+	 * CONFIG_CGROUPS to a higher-up CONFIG_* that the other user and
+	 * cgroups would both depend upon. Also, they'll want to move where
+	 * the readlock happens - it currently lives in kernel/cgroup.c in
+	 * cgroup_{fork,post_fork,fork_failed}().
+	 */
+	struct rw_semaphore threadgroup_fork_lock;
+#endif
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cc2e1f6..99782a0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1623,6 +1623,71 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 }
 
 /**
+ * threadgroup_fork_lock - block all CLONE_THREAD forks in the threadgroup
+ * @tsk: the task whose threadgroup should be locked
+ *
+ * Takes the threadgroup_fork_lock rwsem in the threadgroup's sighand_struct
+ * for writing, searching the threadgroup list for a live thread if tsk's own
+ * sighand is gone. Returns the sighand_struct that should be given to
+ * threadgroup_fork_unlock, or -ESRCH if all threads in the group are exiting
+ * and have cleared their sighand pointers, or -EAGAIN if tsk is not the
+ * threadgroup leader.
+ */
+struct sighand_struct *threadgroup_fork_lock(struct task_struct *tsk)
+{
+	struct sighand_struct *sighand;
+	struct task_struct *p;
+
+	/* tasklist lock protects sighand_struct's disappearance in exit(). */
+	read_lock(&tasklist_lock);
+
+	/* make sure the threadgroup's state is sane before we proceed */
+	if (unlikely(!thread_group_leader(tsk))) {
+		/* a race with de_thread() stripped us of our leadership */
+		read_unlock(&tasklist_lock);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	/* now try to find a sighand */
+	if (likely(tsk->sighand)) {
+		sighand = tsk->sighand;
+	} else {
+		sighand = ERR_PTR(-ESRCH);
+		/*
+		 * tsk is exiting; try to find another thread in the group
+		 * whose sighand pointer is still alive.
+		 */
+		list_for_each_entry_rcu(p, &tsk->thread_group, thread_group) {
+			if (p->sighand) {
+				sighand = p->sighand;
+				break;
+			}
+		}
+	}
+	/* prevent sighand from vanishing before we let go of tasklist_lock */
+	if (likely(!IS_ERR(sighand)))
+		atomic_inc(&sighand->count);
+
+	/* done searching. */
+	read_unlock(&tasklist_lock);
+
+	if (likely(!IS_ERR(sighand)))
+		down_write(&sighand->threadgroup_fork_lock);
+	return sighand;
+}
+
+/**
+ * threadgroup_fork_unlock - let the threadgroup resume CLONE_THREAD forks.
+ * @sighand: the threadgroup's sighand that threadgroup_fork_lock gave back
+ *
+ * Lets go of the threadgroup_fork_lock, and drops the sighand reference.
+ */
+void threadgroup_fork_unlock(struct sighand_struct *sighand)
+{
+	up_write(&sighand->threadgroup_fork_lock);
+	__cleanup_sighand(sighand);
+}
+
+/**
  * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
  * @cgrp: the cgroup the task is attaching to
  * @tsk: the task to be attached
@@ -3713,8 +3778,10 @@ static const struct file_operations proc_cgroupstats_operations = {
  * At the point that cgroup_fork() is called, 'current' is the parent
  * task, and the passed argument 'child' points to the child task.
  */
-void cgroup_fork(struct task_struct *child)
+void cgroup_fork(struct task_struct *child, unsigned long clone_flags)
 {
+	if (clone_flags & CLONE_THREAD)
+		down_read(&current->sighand->threadgroup_fork_lock);
 	task_lock(current);
 	child->cgroups = current->cgroups;
 	get_css_set(child->cgroups);
@@ -3756,7 +3823,7 @@ void cgroup_fork_callbacks(struct task_struct *child)
  * with the first call to cgroup_iter_start() - to guarantee that the
  * new task ends up on its list.
  */
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child, unsigned long clone_flags)
 {
 	if (use_task_css_set_links) {
 		write_lock(&css_set_lock);
@@ -3766,6 +3833,8 @@ void cgroup_post_fork(struct task_struct *child)
 		task_unlock(child);
 		write_unlock(&css_set_lock);
 	}
+	if (clone_flags & CLONE_THREAD)
+		up_read(&current->sighand->threadgroup_fork_lock);
 }
 /**
  * cgroup_exit - detach cgroup from exiting task
@@ -3841,6 +3910,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
 }
 
 /**
+ * cgroup_fork_failed - undo operations for fork failure
+ * @tsk: pointer to the failed child's task_struct
+ * @run_callbacks: run exit callbacks?
+ * @clone_flags: the flags the failed fork was called with
+ *
+ * Description: Undo cgroup operations after cgroup_fork() in fork failure.
+ *
+ * We release the read lock that was taken in cgroup_fork(), since it is
+ * supposed to be dropped in cgroup_post_fork() in the success case. We also
+ * need to detach the failed child from its cgroup, so we wrap cgroup_exit().
+ */
+void cgroup_fork_failed(struct task_struct *tsk, int run_callbacks,
+			unsigned long clone_flags)
+{
+	if (clone_flags & CLONE_THREAD)
+		up_read(&current->sighand->threadgroup_fork_lock);
+	cgroup_exit(tsk, run_callbacks);
+}
+
+/**
  * cgroup_clone - clone the cgroup the given subsystem is attached to
  * @tsk: the task to be moved
  * @subsys: the given subsystem
diff --git a/kernel/fork.c b/kernel/fork.c
index 404e6ca..daf5967 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -809,6 +809,9 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
 		return -ENOMEM;
 	atomic_set(&sig->count, 1);
 	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->threadgroup_fork_lock);
+#endif
 	return 0;
 }
 
@@ -1091,7 +1094,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	monotonic_to_bootbased(&p->real_start_time);
 	p->io_context = NULL;
 	p->audit_context = NULL;
-	cgroup_fork(p);
+	cgroup_fork(p, clone_flags);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
 	if (IS_ERR(p->mempolicy)) {
@@ -1316,7 +1319,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
 	proc_fork_connector(p);
-	cgroup_post_fork(p);
+	cgroup_post_fork(p, clone_flags);
 	perf_event_fork(p);
 	return p;
 
@@ -1350,7 +1353,7 @@ bad_fork_cleanup_policy:
 	mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
 #endif
-	cgroup_exit(p, cgroup_callbacks_done);
+	cgroup_fork_failed(p, cgroup_callbacks_done, clone_flags);
 	delayacct_tsk_free(p);
 	module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count: