>From 7ac17dcf2615f40cef789b203d6f6876038627b6 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Thu, 22 Jul 2021 20:19:47 +0200 Subject: [PATCH 3/3] ipc/shm: locking / questions for discussion, not for merging: - make locking a bit more explicit - there is no advantage of a temporary list in exit_shm(), thus delete segments one by one. - lots of debug pr_info. Signed-off-by: Manfred Spraul --- include/linux/ipc_namespace.h | 16 ++++ ipc/namespace.c | 4 + ipc/shm.c | 160 +++++++++++++++++++++------------- 3 files changed, 121 insertions(+), 59 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 05e22770af51..f3a09604d0d5 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -126,11 +126,22 @@ extern struct ipc_namespace *copy_ipcs(unsigned long flags, static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) { +pr_info("get_ipc_ns: inc for ns %px.\n", ns); if (ns) refcount_inc(&ns->ns.count); return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ +pr_info("get_ipc_ns_not_zero: inc for ns %px.\n", ns); + if (ns) { + if (refcount_inc_not_zero(&ns->ns.count)) + return ns; + } + return NULL; +} + extern void put_ipc_ns(struct ipc_namespace *ns); #else static inline struct ipc_namespace *copy_ipcs(unsigned long flags, @@ -147,6 +158,11 @@ static inline struct ipc_namespace *get_ipc_ns(struct ipc_namespace *ns) return ns; } +static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) +{ + return ns; +} + static inline void put_ipc_ns(struct ipc_namespace *ns) { } diff --git a/ipc/namespace.c b/ipc/namespace.c index 7bd0766ddc3b..4160ea18dcd2 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -117,6 +117,7 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { +pr_info("free_ipc_ns: worker task for namespace %px.\n", ns); /* mq_put_mnt() waits for a grace period as kern_unmount() * uses synchronize_rcu(). */ @@ -129,6 +130,7 @@ static void free_ipc_ns(struct ipc_namespace *ns) put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); kfree(ns); +pr_info("free_ipc_ns: worker task for namespace %px done.\n", ns); } static LLIST_HEAD(free_ipc_list); @@ -164,9 +166,11 @@ static DECLARE_WORK(free_ipc_work, free_ipc); */ void put_ipc_ns(struct ipc_namespace *ns) { +pr_info("put_ipc_ns: got called for %px.\n", ns); if (refcount_dec_and_lock(&ns->ns.count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); +pr_info("put_ipc_ns: destroying namespace %px.\n", ns); if (llist_add(&ns->mnt_llist, &free_ipc_list)) schedule_work(&free_ipc_work); diff --git a/ipc/shm.c b/ipc/shm.c index a746886a0e00..f55118d0a425 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -62,9 +62,14 @@ struct shmid_kernel /* private to the kernel */ struct pid *shm_lprid; struct ucounts *mlock_ucounts; - /* The task created the shm object. NULL if the task is dead. */ + /* The task created the shm object, for looking up + * task->sysvshm.shm_clist_lock */ struct task_struct *shm_creator; - struct list_head shm_clist; /* list by creator */ + + /* list by creator. shm_clist_lock required for read/write + * if list_empty(), then the creator is dead already + */ + struct list_head shm_clist; struct ipc_namespace *ns; } __randomize_layout; @@ -130,6 +135,7 @@ static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) #ifdef CONFIG_IPC_NS void shm_exit_ns(struct ipc_namespace *ns) { +pr_info("shm_exit_ns(), %px.\n", ns); free_ipcs(ns, &shm_ids(ns), do_shm_rmid); idr_destroy(&ns->ids[IPC_SHM_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_SHM_IDS].key_ht); @@ -237,15 +243,30 @@ static inline void task_shm_clist_unlock(struct task_struct *task) spin_unlock(&task->sysvshm.shm_clist_lock); } -void shm_clist_rm(struct shmid_kernel *shp) +/* It has to be called with shp locked. + * It must be called before ipc_rmid() */ +static inline void shm_clist_rm(struct shmid_kernel *shp) { - if (!shp->shm_creator) + struct task_struct *creator; + + creator = READ_ONCE(shp->shm_creator); + if (!creator) { +pr_info("shm_clist_rm: creator already NULL for %px.\n", shp); return; + } + + task_shm_clist_lock(creator); + +pr_info("shm_clist_rm: creator %px locked for shp %px.\n", creator, shp); - task_shm_clist_lock(shp->shm_creator); - list_del_init(&shp->shm_clist); - task_shm_clist_unlock(shp->shm_creator); - shp->shm_creator = NULL; + /* A concurrent exit_shm may do a list_del_init() as well. + * Just do nothing if exit_shm already did the work + */ + if (!list_empty(&shp->shm_clist)) { + list_del_init(&shp->shm_clist); + WRITE_ONCE(shp->shm_creator, NULL); + } + task_shm_clist_unlock(creator); } static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) @@ -305,6 +326,8 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) { struct file *shm_file; +pr_info("shm_destroy: current %px, namespace %px, shp %px.\n", current, ns, shp); + shm_file = shp->shm_file; shp->shm_file = NULL; ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -333,6 +356,11 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) */ static bool shm_may_destroy(struct shmid_kernel *shp) { +pr_info("shm_may_destroy for current %px, ns %px, shp %px.\n", current, shp->ns, shp); + +/* TODO: + * understand refcounting for shp->ns: What guarantees that ns cannot go out of scope? + */ return (shp->shm_nattch == 0) && (shp->ns->shm_rmid_forced || (shp->shm_perm.mode & SHM_DEST)); @@ -365,6 +393,7 @@ static void shm_close(struct vm_area_struct *vma) ipc_update_pid(&shp->shm_lprid, task_tgid(current)); shp->shm_dtim = ktime_get_real_seconds(); shp->shm_nattch--; +pr_info("shm_close for current %px, ns %px, shp %px.\n", current, shp->ns, shp); if (shm_may_destroy(shp)) shm_destroy(ns, shp); else @@ -413,67 +442,66 @@ void shm_init_task(struct task_struct *task) /* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { - LIST_HEAD(tmp); - struct shmid_kernel *shp, *n; - if (list_empty(&task->sysvshm.shm_clist)) - return; +pr_info("exit_shm: called for task %px, current %px\n", task, current); - rcu_read_lock(); /* for refcount_inc_not_zero */ - task_shm_clist_lock(task); + for (;;) { + struct shmid_kernel *shp; + struct ipc_namespace *ns; - list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { - struct ipc_namespace *ns = shp->ns; - /* - * Remove shm from task list and nullify shm_creator which - * marks object as orphaned. - * - * If kernel.shm_rmid_forced is not set then only keep track of - * which shmids are orphaned, so that a later set of the sysctl - * can clean them up. - */ - list_del_init(&shp->shm_clist); - shp->shm_creator = NULL; + task_shm_clist_lock(task); + if (list_empty(&task->sysvshm.shm_clist)) { + task_shm_clist_unlock(task); + break; + } - printk("exit_shm() %px refcnt=%u, id=%d,key=%x\n", shp, - refcount_read(&shp->shm_perm.refcount), - shp->shm_perm.id, shp->shm_perm.key - ); + shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, + shm_clist); + + /* 1) unlink */ + list_del_init(&shp->shm_clist); + WRITE_ONCE(shp->shm_creator, NULL); /* - * Will destroy all already created segments, that were not yet mapped, - * and mark any mapped as orphan to cover the sysctl toggling. - * Destroy is skipped if shm_may_destroy() returns false. + * 2) get a reference to the namespace. + * The refcount could be already 0. If it is 0, then + * the shm objects will be free by free_ipc_work(). */ - if (shp->ns->shm_rmid_forced && shm_may_destroy(shp)) { + ns = shp->ns; +pr_info("exit_shm: called for task %px, current %px, processing shp %px, ns %px\n", task, current, shp, ns); + + ns = get_ipc_ns_not_zero(ns); + if (ns) { /* - * We may race with shm_exit_ns() if refcounter - * already zero. Let's skip shm_destroy() of such - * shm object as it will be destroyed during shm_exit_ns() + * 3) get a reference to the shp itself. + * This cannot fail: shm_clist_rm() is called before + * ipc_rmid(), thus the refcount cannot be 0. */ - if (!refcount_inc_not_zero(&ns->ns.count)) /* get_ipc_ns */ - continue; - - list_add(&shp->shm_clist, &tmp); + ipc_rcu_getref(&shp->shm_perm); + } + task_shm_clist_unlock(task); + + if (ns) { + down_write(&shm_ids(ns).rwsem); + shm_lock_by_ptr(shp); + ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); + + if (ipc_valid_object(&shp->shm_perm)) { + if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); + } else { + /* + * Someone else deleted the shp while we have waited. + * Just unlock and continue. + */ + shm_unlock(shp); + } + up_write(&shm_ids(ns).rwsem); + put_ipc_ns(ns); } - } - - list_del(&task->sysvshm.shm_clist); - task_shm_clist_unlock(task); - rcu_read_unlock(); - - list_for_each_entry_safe(shp, n, &tmp, shm_clist) { - struct ipc_namespace *ns = shp->ns; - - list_del_init(&shp->shm_clist); - - down_write(&shm_ids(ns).rwsem); - shm_lock_by_ptr(shp); - /* will do put_ipc_ns(shp->ns) */ - shm_destroy(ns, shp); - up_write(&shm_ids(ns).rwsem); - put_ipc_ns(ns); /* see refcount_inc_not_zero */ } } @@ -566,6 +594,8 @@ static int shm_release(struct inode *ino, struct file *file) { struct shm_file_data *sfd = shm_file_data(file); +pr_info("shm_release for current %px.\n", current); + put_ipc_ns(sfd->ns); fput(sfd->file); shm_file_data(file) = NULL; @@ -731,9 +761,21 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) if (error < 0) goto no_id; - list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); shp->ns = ns; + task_shm_clist_lock(current); + list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); + +pr_info("newseg: current %px, namespace %px, shp %px.\n", current, ns, shp); + { + struct shmid_kernel *shp; + + list_for_each_entry(shp, ¤t->sysvshm.shm_clist, shm_clist) { +pr_info("newseg: current %px, list entry %px.\n", current, shp); + } + } + task_shm_clist_unlock(current); + /* * shmid gets reported as "inode#" in /proc/pid/maps. * proc-ps tools use this. Changing this will break them. -- 2.31.1